class RPolicy:

    def __init__(self, nact, rnn_units=256):
        cells = [GRUCell(rnn_units, kernel_initializer=orthogonal()) for _ in range(2)]
        self.gru = MultiRNNCell(cells)
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)

        self.obs_ph = tf.placeholder(dtype=tf.float32)
        fc1 = dense(self.obs_ph, rnn_units, activation=elu, kernel_initializer=orthogonal(), name='fc1')
        expand = tf.expand_dims(fc1, axis=0, name='expand')
        rnn_out, self.state = dynamic_rnn(self.gru, expand, initial_state=self.state)
        reshape = tf.reshape(rnn_out, shape=[-1, rnn_units], name='reshape')

        self.logits = dense(reshape, nact, kernel_initializer=orthogonal(0.01), name='logits')
        self.pi = tf.nn.softmax(self.logits, name='pi')
        self.action = boltzmann(self.pi)
        self.value = dense(self.logits, 1, kernel_initializer=orthogonal(), name='value')

    def forward(self, obs, sess):
        feed_dict = {self.obs_ph: obs}
        action, value = sess.run([self.action, self.value], feed_dict=feed_dict)
        return action, value

    def reset(self):
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)
class CRPolicy:

    def __init__(self, h, w, c, nact, rnn_units=256, cnn_units=32):
        cells = [GRUCell(rnn_units, kernel_initializer=orthogonal()) for _ in range(2)]
        self.gru = MultiRNNCell(cells)
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)

        self.obs_ph = tf.placeholder(dtype=tf.float32, shape=[None, h, w, c])
        cv1 = conv2d(self.obs_ph, cnn_units, 3, strides=2, activation=elu, kernel_initializer=orthogonal(), name='cv1')
        cv2 = conv2d(cv1, cnn_units, 3, strides=2, activation=elu, kernel_initializer=orthogonal(), name='cv2')
        cv3 = conv2d(cv2, cnn_units, 3, strides=2, activation=elu, kernel_initializer=orthogonal(), name='cv3')
        cv4 = conv2d(cv3, cnn_units, 3, strides=2, activation=elu, kernel_initializer=orthogonal(), name='cv4')
        flat = flatten(cv4, name='flatten')
        fc1 = dense(flat, rnn_units, activation=elu, kernel_initializer=orthogonal(), name='fc1')
        expand = tf.expand_dims(fc1, axis=0, name='expand')
        rnn_out, self.state = dynamic_rnn(self.gru, expand, initial_state=self.state)
        reshape = tf.reshape(rnn_out, shape=[-1, rnn_units], name='reshape')

        self.logits = dense(reshape, nact, kernel_initializer=orthogonal(0.01), name='logits')
        self.pi = tf.nn.softmax(self.logits, name='pi')
        self.action = boltzmann(self.pi)
        value = dense(reshape, 1, kernel_initializer=orthogonal(), name='value')
        self.value = tf.squeeze(value)

    def forward(self, obs, sess):
        feed_dict = {self.obs_ph: obs}
        action, value = sess.run([self.action, self.value], feed_dict=feed_dict)
        return action, value

    def reset(self):
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)
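A minimal sketch of driving these TF1-style policies (my assumption: TensorFlow 1.x is in use and the project's helpers such as boltzmann, dense and conv2d are importable; the observation shape and action count below are purely hypothetical):

import numpy as np
import tensorflow as tf

policy = CRPolicy(h=84, w=84, c=4, nact=6)                # hypothetical dimensions
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    obs = np.zeros((1, 84, 84, 4), dtype=np.float32)      # one stacked observation
    action, value = policy.forward(obs, sess)
    policy.reset()    # re-creates the zero state, exactly as in __init__
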
Example #3
    def __init__(self, batch_size, seq_length, n_layers, rnn_size, vocab_size,
                 scope, **ignored_args):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.rnn_size = rnn_size
        self.vocab_size = vocab_size
        self.n_layers = n_layers
        self.scope = scope
        self.grad_clip = 5.0

        self.input_data = tf.placeholder(tf.int32,
                                         (self.seq_length, self.batch_size))
        self.target_data = tf.placeholder(tf.int32,
                                          (self.seq_length, self.batch_size))
        self.embedding = tf.get_variable("embedding",
                                         (self.vocab_size, self.rnn_size))
        embedded = tf.nn.embedding_lookup(self.embedding, self.input_data)
        # embedded.shape = (self.seq_length, self.batch_size, self.rnn_size)
        self.softmax_w = tf.get_variable("softmax_w",
                                         (self.rnn_size, self.vocab_size))
        self.softmax_b = tf.get_variable("softmax_b", (self.vocab_size, ))
        self.learning_rate = tf.placeholder(tf.float32, ())

        cell = MultiRNNCell(
            [BasicLSTMCell(self.rnn_size) for _ in range(self.n_layers)])
        state = self.init_state = cell.zero_state(batch_size=self.batch_size,
                                                  dtype=tf.float32)
        logits = []  # .shape = (seq_length, batch_size, vocab_size)

        with tf.variable_scope(self.scope):
            for i in range(self.seq_length):
                output, state = cell(
                    embedded[i],
                    state)  # output.shape = (batch_size, rnn_size)
                logits.append(output @ self.softmax_w + self.softmax_b)
                tf.get_variable_scope().reuse_variables()

        self.final_state = state
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.target_data, logits=logits)
        self.cost = tf.reduce_mean(loss)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          self.grad_clip)
        self.train_op = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(zip(grads, tvars))

        # sample model

        self.sample_input_char = tf.placeholder(tf.int32)
        embedded = tf.nn.embedding_lookup(
            self.embedding, tf.reshape(self.sample_input_char, (1, )))
        self.sample_init_state = cell.zero_state(batch_size=1,
                                                 dtype=tf.float32)
        with tf.variable_scope(self.scope, reuse=True):
            output, self.sample_final_state = cell(embedded,
                                                   self.sample_init_state)
            logits = output @ self.softmax_w + self.softmax_b
            self.sample_output_probs = tf.nn.softmax(logits[0])
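A hedged sketch of how the sampling graph above might be used (assuming TensorFlow 1.x, a live sess, a model built with the __init__ above, and a hypothetical idx_to_char vocabulary mapping; seed_idx and n_chars are illustrative parameters, not part of the original code):

import numpy as np

def sample(sess, model, seed_idx, n_chars, idx_to_char):
    state = sess.run(model.sample_init_state)        # start from the zero state
    char_idx, out = seed_idx, [idx_to_char[seed_idx]]
    for _ in range(n_chars):
        probs, state = sess.run(
            [model.sample_output_probs, model.sample_final_state],
            feed_dict={model.sample_input_char: char_idx,
                       model.sample_init_state: state})
        char_idx = np.random.choice(len(probs), p=probs)   # sample the next character
        out.append(idx_to_char[char_idx])
    return ''.join(out)
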
Example #4
def self_rnn(input, units=128, layer_num=2, parallel_iterations=64, name='gru', reuse=False):
    with tf.variable_scope(name_or_scope=name):
        with tf.variable_scope('enc'):
            encoder_rnn = MultiRNNCell([GRUCell(units) for _ in range(layer_num)])
        with tf.variable_scope('dec'):
            decoder_rnn = MultiRNNCell([ResidualWrapper(GRUCell(units)) for _ in range(layer_num)])

        rnn_tot = input.shape[1]
        batch = input.shape[0]

        cond = lambda x, *_: tf.less(x, rnn_tot)

        with tf.variable_scope('pre'):
            cnt = tf.zeros((), dtype=tf.int32)
            encoder_init_state = encoder_rnn.zero_state(batch, dtype=tf.float32)
            decoder_init_state = decoder_rnn.zero_state(batch, dtype=tf.float32)
            res_ta = tf.TensorArray(dtype=tf.float32, size=rnn_tot)
            input_time_major = tf.transpose(input, (1, 0, 2))

        def body(cnt, encoder_pre, decoder_pre, res_ta):
            input = input_time_major[cnt]
            with tf.variable_scope('enc'):
                output_enc, new_enc_state = encoder_rnn(input, encoder_pre)
            with tf.variable_scope('dec'):
                output_dec, new_dec_state = decoder_rnn(output_enc, decoder_pre)
            res_ta = res_ta.write(cnt, output_dec)
            cnt = tf.add(cnt, 1)
            return cnt, new_enc_state, new_dec_state, res_ta


        res_cnt, encoder_res, decoder_res, final_res_ta = tf.while_loop(cond, body, loop_vars=[cnt, encoder_init_state, decoder_init_state, res_ta], parallel_iterations=parallel_iterations)
        # final_res_ta = tf.stack(final_res_ta)
        final_res = final_res_ta.stack()

        return final_res
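A brief usage sketch (an assumption on my part: TensorFlow 1.x; the input needs static batch and time dimensions, because the function reads input.shape[0] and input.shape[1]; the shapes below are illustrative):

x = tf.placeholder(tf.float32, shape=[8, 20, 64])   # hypothetical [batch, time, features]
y = self_rnn(x, units=128)                          # time-major result from the TensorArray stack
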
Example #5
    def _dynamic_birnn(self, x, seq_len, batch_size, max_seq_len):

        cell_fw = MultiRNNCell([DropoutWrapper(GRUCell(cell_hidden))
                                for cell_hidden in self.cell_hidden])
        cell_bw = MultiRNNCell([DropoutWrapper(GRUCell(cell_hidden))
                                for cell_hidden in self.cell_hidden])

        init_state_fw = cell_fw.zero_state(batch_size, dtype=tf.float32)
        init_state_bw = cell_bw.zero_state(batch_size, dtype=tf.float32)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=x,
            initial_state_fw=init_state_fw,
            initial_state_bw=init_state_bw,
            sequence_length=seq_len
        )

        outputs = (outputs[0] + outputs[1]) / 2

        outputs = tf.reduce_sum(outputs, axis=1)
        outputs = tf.divide(outputs, tf.cast(seq_len[:, None], tf.float32))

        fc = tf.layers.dense(outputs, 1000)
        fc = tf.nn.leaky_relu(fc, 0.2)
        fc = tf.layers.dense(fc, self.n_class)

        return fc
Example #6
    def build_decoder_cell(self, encoder_outputs, encoder_final_state,
                           hidden_size, cell_type, layer_size):
        """
        构建解码器所有层
        :param encoder_outputs:
        :param encoder_state:
        :param hidden_size:
        :param cell_type:
        :param layer_size:
        :return:
        """
        sequence_length = self.encoder_inputs_length
        if self.mode == 'decode':
            encoder_outputs = tf.contrib.seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width)
            encoder_final_state = tf.contrib.seq2seq.tile_batch(
                encoder_final_state, multiplier=self.beam_width)
            sequence_length = tf.contrib.seq2seq.tile_batch(
                sequence_length, multiplier=self.beam_width)

        if self.bidirection:
            cell = MultiRNNCell([
                self.one_cell(hidden_size * 2, cell_type)
                for _ in range(layer_size)
            ])
        else:
            cell = MultiRNNCell([
                self.one_cell(hidden_size, cell_type)
                for _ in range(layer_size)
            ])
        # Apply the attention mechanism
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_size,
            memory=encoder_outputs,
            memory_sequence_length=sequence_length)

        def cell_input_fn(inputs, attention):
            mul = 2 if self.bidirection else 1
            attn_projection = layers.Dense(self.hidden_size * mul,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(cell=cell,
                                attention_mechanism=self.attention_mechanism,
                                attention_layer_size=self.hidden_size,
                                cell_input_fn=cell_input_fn,
                                name='Attention_Wrapper')
        if self.mode == 'decode':
            decoder_initial_state = cell.zero_state(
                batch_size=self.batch_size * self.beam_width,
                dtype=tf.float32).clone(cell_state=encoder_final_state)
        else:
            decoder_initial_state = cell.zero_state(
                batch_size=self.batch_size,
                dtype=tf.float32).clone(cell_state=encoder_final_state)
        return cell, decoder_initial_state
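A rough sketch of how the returned cell and decoder_initial_state are typically consumed for training (assuming TensorFlow 1.x tf.contrib.seq2seq; decoder_embedded, decoder_lengths and output_layer are hypothetical tensors/layers, not part of the original class):

helper = tf.contrib.seq2seq.TrainingHelper(decoder_embedded, decoder_lengths)
decoder = tf.contrib.seq2seq.BasicDecoder(cell, helper, decoder_initial_state,
                                          output_layer=output_layer)
outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
logits = outputs.rnn_output    # fed to a sequence loss during training
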
Example #7
class CRPolicy(tf.keras.Model):
    def __init__(self, n_actions, rnn_units=256, conv_units=32):
        super(CRPolicy, self).__init__()
        cells = [
            GRUCell(rnn_units, kernel_initializer=orthogonal())
            for _ in range(2)
        ]
        self.gru = MultiRNNCell(cells)
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)
        self.cv1 = Conv2D(conv_units,
                          3,
                          strides=2,
                          activation='elu',
                          kernel_initializer=orthogonal())
        self.cv2 = Conv2D(conv_units,
                          3,
                          strides=2,
                          activation='elu',
                          kernel_initializer=orthogonal())
        self.cv3 = Conv2D(conv_units,
                          3,
                          strides=2,
                          activation='elu',
                          kernel_initializer=orthogonal())
        self.cv4 = Conv2D(conv_units,
                          3,
                          strides=2,
                          activation='elu',
                          kernel_initializer=orthogonal())
        self.flatten = Flatten()
        self.fc1 = Dense(rnn_units, kernel_initializer=orthogonal())
        self.pol = Dense(n_actions, kernel_initializer=orthogonal(0.01))
        self.val = Dense(1, kernel_initializer=orthogonal())

    def call(self, obs):
        x = tf.constant(obs, dtype=tf.float32)
        x = self.cv1(x)
        x = self.cv2(x)
        x = self.cv3(x)
        x = self.cv4(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = tf.expand_dims(x, axis=0)
        x, self.state = dynamic_rnn(self.gru, x, initial_state=self.state)
        x = tf.reshape(x, shape=[-1, 256])
        return self.pol(x), self.val(x)

    def reset(self):
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)
Example #8
    def _simple_lstm(self, sequences):
        with tf.variable_scope("sequence_encoder"):
            bn = batch_norm(self.batch_size)

            in_cell = MultiRNNCell(
                [
                    # tf.contrib.rnn.DropoutWrapper(
                    LSTMCell(self.input_size, state_is_tuple=True)
                    #     output_keep_prob=self.keep_prob,
                    #     state_keep_prob=self.keep_prob
                    # ) for _ in range(self.num_layers)
                ],
                state_is_tuple=True)

            # state = tf.random_normal((self.batch_size, self.input_size))
            # initial_state = (LSTMStateTuple(state, state),) * self.num_layers
            initial_state = in_cell.zero_state(self.batch_size, tf.float32)

            # using length we select the last output per sequence which
            # represents the sequence encoding
            self.length = tf.placeholder(tf.int32,
                                         shape=(self.batch_size, ),
                                         name="lengths")
            self.enc_outs, self.enc_state = tf.nn.dynamic_rnn(
                in_cell,
                inputs=bn(sequences),
                initial_state=initial_state,
                sequence_length=self.length,
                dtype=tf.float32)
            return tf.gather_nd(
                self.enc_outs,
                tf.stack([tf.range(self.batch_size), self.length - 1], axis=1))
Example #9
def simple_embed_rnn(inputs,
                     batch_size,
                     num_units,
                     num_layers,
                     num_residual_layers,
                     num_classes,
                     sequence_length=None,
                     initializer=None,
                     dropout=0,
                     unit_type='lstm'):
    if initializer is None:
        initializer = get_initializer('uniform', init_weight=math.sqrt(3))
    embeddings = tf.get_variable('embedding_simple_rnn',
                                 [num_classes, num_units],
                                 initializer=initializer,
                                 dtype=tf.float32)
    encoder_inputs = tf.nn.embedding_lookup(embeddings, inputs)
    if num_layers > 1:
        cells = []
        for i in range(num_layers):
            cells.append(single_rnn_cell(unit_type,
                                         num_units,
                                         dropout,
                                         residual_connection=(i >= num_layers - num_residual_layers)))
        cell = MultiRNNCell(cells)
    else:
        cell = single_rnn_cell(unit_type, num_units, dropout)
    state = cell.zero_state(batch_size, tf.float32)
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(cell,
                                                       encoder_inputs,
                                                       sequence_length=sequence_length,
                                                       initial_state=state)

    return encoder_outputs, encoder_state
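A hedged usage sketch (assumes TensorFlow 1.x and that the project's single_rnn_cell / get_initializer helpers are importable alongside this function; the shapes and vocabulary size are illustrative):

tokens = tf.placeholder(tf.int32, shape=[32, None])     # [batch, time]
lengths = tf.placeholder(tf.int32, shape=[32])
outputs, state = simple_embed_rnn(tokens, batch_size=32, num_units=256,
                                  num_layers=2, num_residual_layers=1,
                                  num_classes=10000, sequence_length=lengths)
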
Example #10
    def _dynamic_rnn(self, x, seq_len, batch_size, max_seq_len):

        cell = MultiRNNCell([GRUCell(cell_hidden) for cell_hidden in self.cell_hidden])
        init_state = cell.zero_state(batch_size, dtype=tf.float32)
        outputs, state = tf.nn.dynamic_rnn(
            cell,
            inputs=x,
            initial_state=init_state,
            sequence_length=seq_len
        )

        if not self.avg_output:
            index = tf.range(0, batch_size) * max_seq_len + (seq_len - 1)
            outputs = tf.reshape(outputs, [-1, self.cell_hidden[-1]])
            outputs = tf.gather(outputs, index)
        else:
            # outputs = tf.Print(outputs, [tf.shape(outputs)], message="output shape")
            outputs = tf.reduce_sum(outputs, axis=1)
            outputs = tf.divide(outputs, tf.cast(seq_len[:, None], tf.float32))
            # outputs = tf.Print(outputs, [tf.shape(outputs)], message="output shape")

        fc = tf.layers.dense(outputs, 1000)
        fc = tf.nn.leaky_relu(fc, 0.2)
        fc = tf.layers.dense(fc, self.n_class)

        return fc
Example #11
class DecoderRNNV1(RNNCell):

    def __init__(self, out_units, attention_cell: AttentionRNN,
                 trainable=True, name=None, **kwargs):
        super(DecoderRNNV1, self).__init__(name=name, trainable=trainable, **kwargs)

        self._cell = MultiRNNCell([
            OutputProjectionWrapper(attention_cell, out_units),
            ResidualWrapper(GRUCell(out_units)),
            ResidualWrapper(GRUCell(out_units)),
        ], state_is_tuple=True)

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def zero_state(self, batch_size, dtype):
        return self._cell.zero_state(batch_size, dtype)

    def compute_output_shape(self, input_shape):
        return tf.TensorShape([input_shape[0], input_shape[1], self.output_size])

    def call(self, inputs, state):
        return self._cell(inputs, state)
Example #12
class Policy:
    """
    Our policy. Takes as input the current state as a list of
    input/feedback indices and outputs the action distribution.
    """

    def __init__(self):
        self.guess_embedding = Embedding(config.max_guesses + 1, 
                                         config.guess_embedding_size)
        self.feedback_embedding = Embedding(config.max_feedback + 1,
                                            config.feedback_embedding_size)
        self.lstm = MultiRNNCell([
            LSTMCell(config.lstm_hidden_size), 
            LSTMCell(config.lstm_hidden_size)
        ])

        self.dense = tf.layers.Dense(config.max_guesses)

    @property
    def variables(self):
        """Return all the trainable parameters"""
        return [
            *self.guess_embedding.variables,
            *self.feedback_embedding.variables,
            *self.lstm.variables,
            *self.dense.variables
        ]

    @property
    def named_variables(self):
        """Method to ensure variables across different 'Policy'
        instances are named consistently"""
        return dict(zip(map(str, itertools.count()), self.variables))


    def __call__(self, game_state, with_softmax=True):
        """
        Do a forward pass to get the action distribution
        """

        state = self.lstm.zero_state(1, tf.float32)

        for guess, feedback in game_state:
            guess_tensor = tf.reshape(tf.convert_to_tensor(guess), (1,))
            feedback_tensor = tf.reshape(tf.convert_to_tensor(feedback), (1,))
            guess_embedded = self.guess_embedding(guess_tensor)
            feedback_embedded = self.feedback_embedding(feedback_tensor)

            combined_embedded = tf.concat([guess_embedded,
                                            feedback_embedded],
                                            axis=-1)
            # can I do multiple inputs to the LSTM instead of concatenating?

            output, state = self.lstm(combined_embedded, state)

        logits = self.dense(output)
        if with_softmax:
            return tf.nn.softmax(logits)
        return logits
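A minimal usage sketch (my assumption: eager execution is enabled, since the class calls its cells and layers directly; the game state and the action sampling below are illustrative, not part of the original code):

import numpy as np

policy = Policy()
game_state = [(3, 1), (7, 2)]               # hypothetical (guess, feedback) history
probs = policy(game_state).numpy()[0]       # distribution over config.max_guesses actions
action = int(np.random.choice(len(probs), p=probs))
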
Example #13
	def build_rnn(self, in_layer, nodes, batch_size, num_layers=2, mode='RNN'):
	    if mode.upper()=='RNN':
	        cell = MultiRNNCell([BasicRNNCell(nodes) for _ in range(num_layers)])
	    elif mode.upper()=='LSTM':
	        cell = MultiRNNCell([BasicLSTMCell(nodes) for _ in range(num_layers)])
	    initial_state = cell.zero_state(batch_size, tf.float32)
	    outputs, state = tf.nn.dynamic_rnn(cell, in_layer, initial_state=initial_state)
	    return initial_state, outputs, state
Example #14
    def setup_decoder_cell(self, config, keep_prob, use_beam_search,
                           init_state, attention_states, attention_lengths):
        batch_size = get_state_shape(init_state)[0]
        if use_beam_search:
            attention_states = tile_batch(attention_states,
                                          multiplier=self.beam_width)
            init_state = nest.map_structure(
                lambda s: tile_batch(s, self.beam_width), init_state)
            attention_lengths = tile_batch(attention_lengths,
                                           multiplier=self.beam_width)
            batch_size = batch_size * self.beam_width

        attention_size = shape(attention_states, -1)
        attention = getattr(tf.contrib.seq2seq, config.attention_type)(
            attention_size,
            attention_states,
            memory_sequence_length=attention_lengths)

        def cell_input_fn(inputs, attention):
            # define cell input function to keep input/output dimension same
            if not config.use_attention_input_feeding:
                return inputs
            attn_project = tf.layers.Dense(config.hidden_size,
                                           dtype=tf.float32,
                                           name='attn_input_feeding',
                                           activation=self.activation)
            return attn_project(tf.concat([inputs, attention], axis=-1))

        cells = _setup_decoder_cell(config, keep_prob)
        if config.top_attention:  # apply attention mechanism only on the top decoder layer
            cells[-1] = AttentionWrapper(
                cells[-1],
                attention_mechanism=attention,
                name="AttentionWrapper",
                attention_layer_size=config.hidden_size,
                alignment_history=use_beam_search,
                initial_cell_state=init_state[-1],
                cell_input_fn=cell_input_fn)
            init_state = [state for state in init_state]
            init_state[-1] = cells[-1].zero_state(batch_size=batch_size,
                                                  dtype=tf.float32)
            init_state = tuple(init_state)
            cells = MultiRNNCell(cells)
        else:
            cells = MultiRNNCell(cells)
            cells = AttentionWrapper(cells,
                                     attention_mechanism=attention,
                                     name="AttentionWrapper",
                                     attention_layer_size=config.hidden_size,
                                     alignment_history=use_beam_search,
                                     initial_cell_state=init_state,
                                     cell_input_fn=cell_input_fn)
            init_state = cells.zero_state(batch_size=batch_size, dtype=tf.float32) \
                              .clone(cell_state=init_state)
        return cells, init_state
Example #15
    def _dynamic_birnn(self, x, seq_len, batch_size, max_seq_len):

        cell_fw = MultiRNNCell([GRUCell(cell_hidden) for cell_hidden in self.cell_hidden])
        cell_bw = MultiRNNCell([GRUCell(cell_hidden) for cell_hidden in self.cell_hidden])
        init_state_fw = cell_fw.zero_state(batch_size, dtype=tf.float32)
        init_state_bw = cell_bw.zero_state(batch_size, dtype=tf.float32)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=x,
            initial_state_fw=init_state_fw,
            initial_state_bw=init_state_bw,
            sequence_length=seq_len
        )

        # outputs = tf.concat(outputs, 2)
        #
        # if not self.avg_output:
        #     index = tf.range(0, batch_size) * max_seq_len + (seq_len - 1)
        #     outputs = tf.reshape(outputs, [-1, self.cell_hidden[-1] * 2])
        #     outputs = tf.gather(outputs, index)
        # else:
        #     outputs = tf.reduce_sum(outputs, axis=1)
        #     outputs = tf.divide(outputs, tf.cast(seq_len[:, None], tf.float32))

        outputs = (outputs[0] + outputs[1]) / 2

        if not self.avg_output:
            index = tf.range(0, batch_size) * max_seq_len + (seq_len - 1)
            outputs = tf.reshape(outputs, [-1, self.cell_hidden[-1]])
            outputs = tf.gather(outputs, index)
        else:
            outputs = tf.reduce_sum(outputs, axis=1)
            outputs = tf.divide(outputs, tf.cast(seq_len[:, None], tf.float32))

        fc = tf.layers.dense(outputs, 1000)
        fc = tf.nn.leaky_relu(fc, 0.2)
        fc = tf.layers.dense(fc, self.n_class)

        return fc
Example #16
    def unidirectional(self, inputs, controller_config, memory_unit_config, analyse=False, reuse=False):
        """
        Connects unidirectional controller and memory unit and performs scan over sequence
        Args:
            inputs:                 TF tensor, input sequence
            controller_config:      dict, configuration of the controller
            memory_unit_config:     dict, configuration of the memory unit
            analyse:                bool, do analysis
            reuse:                  bool, reuse

        Returns:        TF tensor, output sequence; TF tensor, hidden states

        """

        with tf.variable_scope("controller"):
            controller_list = get_rnn_cell_list(controller_config, name='controller', reuse=reuse, seed=self.seed,
                                                dtype=self.dtype)

        if controller_config['connect'] == 'sparse':
            memory_input_size = controller_list[-1].output_size
            mu_cell = get_memory_unit(memory_input_size, memory_unit_config, 'memory_unit', analyse=analyse,
                                      reuse=reuse)
            cell = MultiRNNCell(controller_list + [mu_cell])
        else:
            controller_cell = HolisticMultiRNNCell(controller_list)
            memory_input_size = controller_cell.output_size
            mu_cell = get_memory_unit(memory_input_size, memory_unit_config, 'memory_unit', analyse=analyse,
                                      reuse=reuse)
            cell = MultiRNNCell([controller_cell, mu_cell])

        batch_size = inputs.get_shape()[1].value
        cell_init_states = cell.zero_state(batch_size, dtype=self.dtype)
        output_init = tf.zeros([batch_size, cell.output_size], dtype=self.dtype)

        if analyse:
            output_init = (output_init, mu_cell.analyse_state(batch_size, dtype=self.dtype))

        init_states = (output_init, cell_init_states)

        def step(pre_states, inputs):
            pre_rnn_output, pre_rnn_states = pre_states

            if analyse:
                pre_rnn_output = pre_rnn_output[0]

            controller_inputs = tf.concat([inputs, pre_rnn_output], axis=-1)
            rnn_output, rnn_states = cell(controller_inputs, pre_rnn_states)
            return (rnn_output, rnn_states)

        outputs, states = tf.scan(step, inputs, initializer=init_states, parallel_iterations=32)

        return outputs, states
Example #17
def sequence2class_lstm_inference_graph(embedding_matrix,
                                        batch_size=25,
                                        num_layers=2,
                                        num_classes=2,
                                        state_size=100):

    input_sequence = tf.placeholder(tf.int32, [batch_size, None],
                                    name='input_sequence')

    labels = tf.placeholder(tf.int32, [batch_size, 2], name='labels')

    embeddings = tf.constant(embedding_matrix)

    embeddings = tf.stop_gradient(embeddings)

    rnn_input = tf.nn.embedding_lookup(embeddings, input_sequence)

    lstm = BasicLSTMCell
    lstms_list = [
        lstm(state_size, state_is_tuple=True)
        for _ in range(0, num_layers - 1)
    ]

    cell = MultiRNNCell(lstms_list, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell,
                                                 rnn_input,
                                                 initial_state=initial_state)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes], dtype=tf.float32)
        b = tf.get_variable('b', [num_classes],
                            initializer=tf.constant_initializer(0.0),
                            dtype=tf.float32)

    logits = tf.matmul(rnn_outputs[:, -1, :], W) + b

    predictions = tf.nn.softmax(logits)
    compare_score_label = tf.equal(tf.argmax(predictions, axis=1),
                                   tf.argmax(labels, axis=1))

    accuracy = tf.reduce_mean(tf.cast(compare_score_label, tf.float32))

    return dict(input_sequence=input_sequence,
                labels=labels,
                accuracy=accuracy,
                initial_state=initial_state,
                final_state=final_state,
                predictions=predictions)
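A short usage sketch for the inference-graph builder above (assuming TensorFlow 1.x; embedding_matrix, batch_x and batch_y are hypothetical NumPy arrays shaped to match the placeholders):

graph = sequence2class_lstm_inference_graph(embedding_matrix, batch_size=25)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    acc = sess.run(graph['accuracy'],
                   feed_dict={graph['input_sequence']: batch_x,
                              graph['labels']: batch_y})
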
Example #18
class DecoderRNNV2(RNNCell):
    def __init__(self,
                 out_units,
                 attention_cell: AttentionRNN,
                 is_training,
                 zoneout_factor_cell=0.0,
                 zoneout_factor_output=0.0,
                 lstm_impl=LSTMImpl.LSTMCell,
                 trainable=True,
                 name=None,
                 dtype=None,
                 **kwargs):
        super(DecoderRNNV2, self).__init__(name=name,
                                           trainable=trainable,
                                           **kwargs)

        self._cell = MultiRNNCell([
            attention_cell,
            ZoneoutLSTMCell(out_units,
                            is_training,
                            zoneout_factor_cell,
                            zoneout_factor_output,
                            lstm_impl=lstm_impl,
                            dtype=dtype),
            ZoneoutLSTMCell(out_units,
                            is_training,
                            zoneout_factor_cell,
                            zoneout_factor_output,
                            lstm_impl=lstm_impl,
                            dtype=dtype),
        ],
                                  state_is_tuple=True)

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def zero_state(self, batch_size, dtype):
        return self._cell.zero_state(batch_size, dtype)

    def compute_output_shape(self, input_shape):
        return tf.TensorShape(
            [input_shape[0], input_shape[1], self.output_size])

    def call(self, inputs, state):
        return self._cell(inputs, state)
Example #19
class RPolicy(tf.keras.Model):
    def __init__(self, n_actions, rnn_units=256):
        super(RPolicy, self).__init__()
        cells = [
            GRUCell(rnn_units, kernel_initializer=orthogonal())
            for _ in range(2)
        ]
        self.gru = MultiRNNCell(cells)
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)
        self.fc1 = Dense(rnn_units, activation='relu')
        self.pol = Dense(n_actions)
        self.val = Dense(1)

    def call(self, obs):
        x = tf.constant(obs, dtype=tf.float32)
        x = self.fc1(x)
        x = tf.expand_dims(x, axis=0)
        x, self.state = dynamic_rnn(self.gru, x, initial_state=self.state)
        x = tf.reshape(x, shape=[-1, 256])
        return self.pol(x), self.val(x)

    def reset(self):
        self.state = self.gru.zero_state(batch_size=1, dtype=tf.float32)
Example #20
    def _init(self, sequence, targets, authors):
        batch_size = tf.shape(sequence)[0]

        sequence_lengths = tf.cast(tf.count_nonzero(sequence, axis=1), tf.int32)
        embedding = tf.Variable(
            tf.random_normal((self._vocab_size, self._embed_size)),
            name='char_embedding'
        )
        context = tf.Variable(
            tf.random_normal((self._author_size, self._ctx_size)),
            name='ctx_embedding'
        )

        embedded_sequence = tf.nn.embedding_lookup(embedding, sequence)
        embedded_authors = tf.nn.embedding_lookup(context, authors)

        gpu = lambda x: '/gpu:{}'.format(x % self._num_gpu)

        if self._training:
            dropout = lambda x: DropoutWrapper(
                x, 1.0-self._input_dropout, 1.0-self._output_dropout)
            helper = TrainingHelper(embedded_sequence, sequence_lengths)
        else:
            dropout = lambda x: x
            helper = SampleEmbeddingHelper(embedding, sequence[:,0], 2)

        base = lambda x: ContextWrapper(self._cell(x), embedded_authors)
        wrap = lambda i, cell: DeviceWrapper(dropout(cell), gpu(i))
        cells = [wrap(i, base(self._cell_size)) for i in range(self._cell_num)]
        cell = MultiRNNCell(cells)

        init_state = cell.zero_state(batch_size, tf.float32)
        dense = tf.layers.Dense(
            self._vocab_size, self._activation, name='fully_connected'
        )
        decoder = BasicDecoder(cell, helper, init_state, dense)
        output, _, _ = dynamic_decode(decoder, swap_memory=True)
        logits = output.rnn_output

        weights = tf.sequence_mask(sequence_lengths, dtype=tf.float32)
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            targets,
            weights
        )

        out = output.sample_id

        return targets, loss, out
Example #21
    def _build_rnn_graph(self):
        input_ = tf.placeholder(tf.float32,
                                [None, self.n_steps, self.n_inputs],
                                name='input')
        labels = tf.placeholder(tf.float32, [None, self.n_classes],
                                name='labels')

        with tf.name_scope("LSTM") as scope:

            def single_cell():
                return DropoutWrapper(GRUCell(self.n_hidden),
                                      output_keep_prob=self._config.keep_prob)

            multi_layer_cell = MultiRNNCell(
                [single_cell() for _ in range(self._config.num_layers)])
            initial_state = multi_layer_cell.zero_state(
                self._config.batch_size, tf.float32)

        output, state = tf.nn.dynamic_rnn(multi_layer_cell,
                                          input_,
                                          dtype=tf.float32)
        output_flattened = tf.reshape(output, [-1, self.n_hidden])

        with tf.name_scope("softmax") as scope:
            with tf.variable_scope("softmax_params"):
                softmax_w = tf.get_variable("softmax_w",
                                            [self.n_hidden, self.n_classes])
                softmax_b = tf.get_variable("softmax_b", [self.n_classes])
            logits = tf.nn.xw_plus_b(output_flattened, softmax_w, softmax_b)
            loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                           logits=logits,
                                                           name="sigmoid")

        # Minimize loss using Adam Optimizer
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(loss)

        # Make predictions
        correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(logits)), labels)

        # Accuracy of individual classes being correct
        self.acc_of_class = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32))
        # Accuracy of entire label being correct
        all_labels_true = tf.reduce_min(
            tf.cast(correct_prediction, tf.float32), 1)
        self.acc_of_label = tf.reduce_mean(all_labels_true)
Example #22
    def _build_rnn_graph_lstm(self, inputs, weights, config, is_training):
        """Build the inference graph using canonical LSTM cells."""
        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to be
        # different than reported in the paper.

        self.size = config.hidden_size

        cell0_w, cell0_b, cell1_w, cell1_b = weights

        cell0 = BayesianLSTMCell(self.size,
                                 cell0_w,
                                 cell0_b,
                                 reuse=not is_training)
        cell1 = BayesianLSTMCell(self.size,
                                 cell1_w,
                                 cell1_b,
                                 reuse=not is_training)

        cell = MultiRNNCell([cell0, cell1], state_is_tuple=True)

        #    cell = tf.contrib.rnn.BasicLSTMCell(
        #          config.hidden_size, forget_bias=0.0, state_is_tuple=True,
        #          reuse=not is_training)
        #
        #    cell = tf.contrib.rnn.MultiRNNCell(
        #        [cell for _ in range(config.num_layers)], state_is_tuple=True)

        self._initial_state = cell.zero_state(config.batch_size, data_type())
        state = self._initial_state

        outputs = []
        with tf.variable_scope("RNN"):
            for time_step in range(self.num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        output = tf.concat(outputs, 1)
        output = tf.reshape(output, [-1, config.hidden_size])

        return output, state
Example #23
class RPolicy(tf.keras.Model):

    def __init__(self, n_actions):
        super(RPolicy, self).__init__()
        cells = [GRUCell(128, kernel_initializer=orthogonal(np.sqrt(2))) for _ in range(2)]
        self.gru = MultiRNNCell(cells)
        self.s0 = self.gru.zero_state(batch_size=1, dtype=tf.float32)
        self.fc1 = Dense(128, activation='relu', kernel_initializer=orthogonal(np.sqrt(2)))
        self.fc2 = Dense(100, activation='relu', kernel_initializer=orthogonal(np.sqrt(2)))
        self.fc3 = Dense(100, activation='relu', kernel_initializer=orthogonal(np.sqrt(2)))
        self.pol = Dense(n_actions, kernel_initializer=orthogonal(0.01))
        self.val = Dense(1, kernel_initializer=orthogonal(np.sqrt(2)))

    def call(self, obs, state):
        x = tf.constant(obs, dtype=tf.float32)
        x = self.fc1(x)
        x = tf.expand_dims(x, axis=0)
        x, state = dynamic_rnn(self.gru, x, initial_state=state)
        x = tf.reshape(x, shape=[-1, 128])
        pi = self.fc2(x)
        v = self.fc3(x)
        return self.pol(pi), self.val(v), state
Example #24
def RNN(X, weights, biases):
    # hidden layer for input to cell
    ########################################
    X = tf.reshape(X, [-1, n_inputs])
    X_in = tf.matmul(X, weights['in']) + biases['in']
    X_in = tf.reshape(X_in, [-1, n_steps, n_hidden_units])

    # multi rnn cell
    ##########################################
    cell = MultiRNNCell(
        [BasicLSTMCell(n_hidden_units) for _ in range(layer_num)])
    init_state = cell.zero_state(batch_size, dtype=tf.float32)
    outputs, final_state = tf.nn.dynamic_rnn(cell,
                                             X_in,
                                             initial_state=init_state,
                                             time_major=False)

    # hidden layer for output as the final results
    #############################################
    outputs = tf.unstack(tf.transpose(outputs, [1, 0, 2]))
    results = tf.matmul(outputs[-1],
                        weights['out']) + biases['out']  # shape = (128, 10)

    return results
Example #25
        def decode(helper, scope, reuse=None):
            with tf.variable_scope(scope, reuse=reuse):
                rnn_layers = []
                for i in range(n_decoder_layers):
                    # Create GRUCell with dropout. Do not forget to set the reuse flag properly.
                    cell = tf.nn.rnn_cell.GRUCell(hidden_size, reuse=reuse)
                    cell = tf.nn.rnn_cell.DropoutWrapper(
                        cell, input_keep_prob=self.dropout_ph)
                    rnn_layers.append(cell)

                decoder_cell = MultiRNNCell(rnn_layers)

                # Create a projection wrapper
                decoder_cell = OutputProjectionWrapper(decoder_cell,
                                                       vocab_size,
                                                       reuse=reuse)

                # Create BasicDecoder, pass the defined cell, a helper, and initial state
                # The initial state should be equal to the final state of the encoder!
                initial_state = decoder_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)
                decoder = BasicDecoder(decoder_cell,
                                       helper,
                                       initial_state=initial_state)

                # The first returning argument of dynamic_decode contains two fields:
                #   * rnn_output (predicted logits)
                #   * sample_id (predictions)
                max_iters = tf.reduce_max(self.ground_truth_lengths)
                # max_iters = max_iter
                outputs, _, _ = dynamic_decode(decoder=decoder,
                                               maximum_iterations=max_iters,
                                               output_time_major=False,
                                               impute_finished=True)

                return outputs
Example #26
    def __init__(self, is_training, config, input_):
        self._is_training = is_training
        self._input = input_
        self.batch_size = input_.batch_size
        self.num_steps = input_.num_steps
        
        self._input_data = input_.input_data
        size = config.X_dim
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size
        
        self._targets = input_.targets
        # Construct prior
        prior = Prior(config.prior_pi, config.log_sigma1, config.log_sigma2)
        
        # Fetch embeddings
        inputs = input_.input_data
        # Build the BBB LSTM cells
        cells = []
        for i in range(config.num_layers):
            if (i == 0):
                LSTM_input_size = config.X_dim
            else:
                LSTM_input_size = config.hidden_size
                
            cells.append(BayesianLSTMCell(LSTM_input_size, config.hidden_size, prior, is_training,
                                      forget_bias=0.0,
                                      name="bbb_lstm_{}".format(i)))

        cell = MultiRNNCell(cells, state_is_tuple=True)
        self._initial_state = cell.zero_state(config.batch_size, data_type())
        state = self._initial_state
        
        # Forward pass for the truncated mini-batch
        outputs = []
        with tf.variable_scope("RNN"):
            for time_step in range(self.num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
        output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

        # Softmax weights
        softmax_w = sample_posterior((hidden_size, vocab_size), "softmax_w", prior, is_training)
        softmax_b = sample_posterior((vocab_size, 1), "softmax_b", prior, is_training)
        
        logits = tf.nn.xw_plus_b(output, softmax_w, tf.squeeze(softmax_b))
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])
        
        self._output =  tf.nn.softmax(logits)
        
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            input_.targets,
            tf.ones([self.batch_size, self.num_steps], dtype=data_type()),
            average_across_timesteps=False,
            average_across_batch=False)

        # Update the cost
        # Remember to divide by batch size
        self._cost = tf.reduce_sum(loss) / self.batch_size
        self._kl_loss = 0.
        self._final_state = state
        
        if not is_training:
            return

        #Compute KL divergence
        #B = number of batches aka the epoch size
        #C = number of truncated sequences in a batch aka batch_size variable
        B = self._input.epoch_size
        C = self.batch_size
        
        kl_loss = tf.add_n(tf.get_collection("KL_layers"), "kl_divergence")
        
        kl_factor = 1.0/(B*C)
        self._kl_loss = kl_factor * kl_loss
        
        self._total_loss = self._cost + self._kl_loss

        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self._total_loss, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(data_type(), shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
Example #27
    def __init__(self, data_size, time_len, unit_size, num_layers, batch_size,
                 learning_rate, feed_previous):
        '''
        Create the basic encoder-decoder seq2seq model
        :param unit_size: number of units in each LSTM layer of the model
        :param num_layers: number of LSTM layers in the model
        :param batch_size: the size of batches used during training
        :param learning_rate: 
        '''

        self.input_size = data_size
        self.time_len = time_len
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         name='lr')
        self.global_step = tf.Variable(0, trainable=False, name='global_step')

        def single_cell():
            return BasicLSTMCell(unit_size)

        cell = single_cell()
        if num_layers > 1:
            cell = MultiRNNCell([single_cell() for _ in range(num_layers)])

        print('state size', cell.state_size)
        print('zero state size',
              cell.zero_state(self.batch_size, dtype=tf.float32))

        # Set placeholder for encoder's inputs
        self.encoder_inputs = []
        self.decoder_inputs = []

        for i in range(self.time_len):
            self.encoder_inputs.append(
                tf.placeholder(shape=[self.batch_size, self.input_size],
                               name='encoder{}'.format(i),
                               dtype=tf.float32))

            self.decoder_inputs.append(
                tf.placeholder(shape=[self.batch_size, self.input_size],
                               name='decoder{}'.format(i),
                               dtype=tf.float32))

        # The purpose is reconstruction, thus the targets should be the reverse of the input
        targets = self.encoder_inputs[::-1]
        outputs, _ = advanced_rnn_seq2seq(
            encoder_inputs=self.encoder_inputs,
            decoder_inputs=self.decoder_inputs,
            cell=cell,
            num_decoder_symbols=self.input_size,
            output_projection=None,
            feed_previous=feed_previous
        )  # the outputs have been projected based on the original lstm outputs

        targets = tf.stack(targets, axis=1)
        self.outputs = tf.stack(outputs, axis=1)
        self.loss = tf.losses.mean_squared_error(targets, self.outputs)
        self.error_vector = tf.abs(self.outputs - targets)

        # set up the train operation
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(self.loss,
                                           global_step=self.global_step)

        # the saver for handling all parameters for the model
        self.saver = tf.train.Saver(tf.global_variables())
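A hedged training-step sketch for the reconstruction seq2seq model above (TensorFlow 1.x; sess is a live session, model an instance of this class, and batch a hypothetical array of shape [time_len, batch_size, data_size] supplying both encoder and decoder inputs):

feed = {}
for i in range(model.time_len):
    feed[model.encoder_inputs[i]] = batch[i]
    feed[model.decoder_inputs[i]] = batch[i]
loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed)
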
Example #28
    def __init__(self, is_training, config, input_):
        """
        This initializer function will read the hyperparameters, from that it will 
        set the atchitecture of the network.
        
        The is_training flag is nice to build the network. If it is not for training then we do not
        need to builf to the graph the loss function and optimizer.
        
        
        """
        # Variable to know if the model is being used for training 
        self._is_training = is_training
        # TODO: This is the structure we just saw...
        self._input = input_
        
        # Setting the chains properties
        self.batch_size = config.batch_size
        self.num_steps = input_.num_steps
        self._input_data = input_.input_data
        
        input_data_ids = input_.input_data
        
        self._targets = input_.targets
        # Setting the architecture properties
        # Dimensionality of the input !! 
        # TODO: For now we set it the same as the hidden_size. Probably for matrix concatenation purposes ?
       
        # Dimensionality of the output ! In the case of classification, the cardinality of the output
        Y_cardinality = config.Y_cardinality # Size of the output
        
        # Construct prior
        prior = VI.Prior(config.prior_pi, config.log_sigma1, config.log_sigma2)
        
        ########################################################################
        #############  Transform categorical values (words) into real-valued vectors ############
        ########################################################################
        # Fetch embeddings
#        with tf.device("/cpu:0"):
#            embedding = VI.sample_posterior([vocab_size, size], "embedding", prior, is_training)
#            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        # If we have discrete input X and we want to embed them in random vectors of size "size"
        # We also need to include the cardinality of the output Y.
#        if (type(config.X_dim) != type(None)):
        if (config.embedding == True):
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                        "embedding", [Y_cardinality, config.X_dim], dtype=VI.data_type())
                inputs = tf.nn.embedding_lookup(embedding, input_data_ids)
            
            X_dim = config.X_dim
        else:
            X_dim = config.X_dim# inputs.get_shape()[-1].value
            
#            inputs = tf.get_variable("Continous_data_input", [self.batch_size,self.num_steps, X_dim], dtype=VI.data_type(), trainable = False) 
#            inputs.assign(input_data_ids)
#            
#            caca = tf.zeros_initializer(tf.int32)((self.batch_size,Y_cardinality, tf.int32))
#            targets = tf.get_variable("Discrete_Target", [self.batch_size,Y_cardinality], dtype=tf.int32, trainable = False, 
#                                      initializer = caca) 
#            targets.assign(input_.targets)
            
        
#            inputs = tf.Variable(input_data_ids, trainable = False)
#            targets = tf.Variable(input_.targets, trainable = False)
            
            inputs = input_data_ids
            targets = input_.targets
            
        # These are the chains in the Batch. They are represented by a 3D tensor with dimensions
        #     - size_epoch: Number of chains in the batch
        #     - num_steps: Number of elements of the chain
        #     - D:   Dimensionality of the elements of the chain. 
        # TODO: maybe due to the initial embedding that has to be done, all inputs are given when defining the model,
        #       we do not want that, we want them to be in a way where do the preprocessing before and we have chains as placeholder.

        input_chains = inputs[:, :, :]
        
        print ("-----------------------------")
        print ("Input Batch X shape", inputs.shape)
        print ("Input Batch Y shape", targets.shape)
        print ("Input_size: %i"%X_dim)
        print ("Output_size: %i"%Y_cardinality)
        print ("Number of chains in a batch: %i"%self.batch_size)
        print ("Number of elements in a chain: %i"%self.num_steps)
        print ("Number of hidden state neurons LTSM: %i"%config.hidden_size)
        
        ########################################################################
        ############# Start Building the Architecture of the Network ############
        ########################################################################
        
        ######################################################################
        ################  Build and Stack BBB LSTM cells ################
        cells = []
        for i in range(config.num_layers):
            if (i == 0):
                LSTM_input_size = X_dim
            else:
                LSTM_input_size = config.hidden_size
                
            cells.append(BLC.BayesianLSTMCell(LSTM_input_size, config.hidden_size, prior, is_training,
                                      forget_bias=0.0,
                                      name="bbb_lstm_{}".format(i)))
        # The following line will stack the LSTM cells together
        # They just need to follow the interface that we already wrote
        # Notice we use  state_is_tuple=True since the LSTM cells have 2 states C_t and h_t
        DeepLSTMRNN = MultiRNNCell(cells, state_is_tuple=True)
        
        # Initialize the state values to 0 ? 
        # TODO: We need to provide info about the Batch size ? That is the number of chains
        # we want to compute the output at once. 
            
        #####################################################################################
        ################  Propagate the chains in the batch from input to output ################
        
        # Initialization.
        # This is the initial state for the LSTM when we feed it a new chain (probably just zeros). Then it should output the conditionally most likely word.
        # We need to give it the batch_size because we are going to propagate the chains in parallel.
        # The initial state will have dimensions [batch_size, (LSTM_hidden_size, LSTM_hidden_size)], since each LSTM state is made of the two vectors C_t and h_t.
        self._initial_state = DeepLSTMRNN.zero_state(config.batch_size, VI.data_type())
        state = self._initial_state


        # Forward pass for the truncated mini-batch
        # hs_o: This list will contain in each of its elements, 
        #         the hidden state of the last LSTM of the network
        #         for each of the time steps (the chain length, which has to be the same for every chain).
        # Each of this hidden states has dimensions [LSTM_hidden_size, num_batch] since we are computing in parallel for all chains in the batch.

        # Now we propagate the chains in parallel and the initial state through the Deep Bayesian LSTM.
        # At each time step we will save the hidden state of the last LSTM to convert it later to the real output and being able
        # to compute the cost function and the output !

        # TODO: This is probably why we want the chains to have the same length. Also maybe so that we do not have to worry later about weighting the
        # cost functions by the length of the chains. Anyway... for now we will just accept it.

        hs_o = []                       
        with tf.variable_scope("RNN"):        # We put all the LSTMs under the name RNN.
            for time_step in range(self.num_steps):  # For each element in the chain
                if (time_step > 0):   # Maybe this is so that we do not create the LSTMS a lot of times in the TensorBoard ?
                    tf.get_variable_scope().reuse_variables()
                
                # Now we feed the time_step-th element of all the chains to the network at once, obtaining the cell output and the new state.

                (cell_output, state) = DeepLSTMRNN(input_chains[:,time_step,:], state)
                hs_o.append(cell_output)
        print (["size output state LSTM", cell_output.shape])
        
#        print ("Num steps: %i"%self.num_steps)
        
        # Now we concatenate all the hidden states of dimension [num_batch, LSTM_hidden_size]
        # into a single tensor of dimension [num_batch x step_size, LSTM_hidden_size]. At the end of the day
        # all of the hidden states are multiplied by the same weights of the dense softmax layer, so we concatenate
        # them all for a single multiplication later.
        hs_o = tf.reshape(tf.concat(hs_o, 1), [-1, config.hidden_size])
        
        print (["Size of the Concatenated output state of the last LSTM for all chains in batch and time-steps in a batch", hs_o.shape])
        ######################################################################
        ################  Build the output layer ############################

        # In our case the output layer is just a dense layer that transforms the hidden state
        # of the last LSTM into the prediction of each discrete output (word), applying a softmax
        # function to the output of the neurons.
        # The parameters of this layer are just its weights and biases.

        # The next calls will create the weights if they have not been created before,
        # identified by the names "softmax_w" and "softmax_b".
        # TODO: Not really a TODO, but the important part here is that we replaced size by config.hidden_size.
        softmax_w = VI.sample_posterior((config.hidden_size, Y_cardinality), "softmax_w", prior, is_training)
        softmax_b = VI.sample_posterior((Y_cardinality, 1), "softmax_b", prior, is_training)
        
        print ("Shape of the weights of the output Dense layer",softmax_w.shape)
        print ("Shape of the weights of the output Dense layer",softmax_b.shape)
        ## We propagate the hidden spaces through the network in order to obtain the outout of the network before
        ## the softmax function, which is called the logits. This logits will have dimensions 
        ## [num_batch x step_size, LSTM_hidden_size] that we need to break down further.

        # Logits are the input to the softmax layer !
        logits = tf.nn.xw_plus_b(hs_o, softmax_w, tf.squeeze(softmax_b))
        # We reshape it back to the proper form [chain, sample, output]
        
        print ("Shape of logits after multiplication of ohs", logits.shape)
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, Y_cardinality])
        print ("Shape of logits after reshpaing", logits.shape)
        
        # We can compute the output of the chains!
        # TODO: maybe do not execute this line in the training model to save computation? Maybe it wouldn't be executed anyway?
        self._output = tf.nn.softmax(logits)

        """ This is finally the output of the batch, our prediction of the word,
            for each of the words in the batch. Since we have:
                - self.batch_size number of chains in the batch
                - Each chain has the same number of words: self.num_steps
                - The prediction of each word is the probability of each of the vocab_size variables
        """
        
        #####################################################################################
        ################  Setting the Loss function  ################
        #####################################################################################

        #B = number of batches aka the epoch size
        #C = number of truncated sequences in a batch aka batch_size variable
        B = self._input.epoch_size
        C = self.batch_size
        
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            targets,
            tf.ones([self.batch_size, self.num_steps], dtype=VI.data_type()),
            average_across_timesteps=False,
            average_across_batch=False)

        # Update the cost
        # Remember to divide by batch size
        self._cost = tf.reduce_sum(loss) / self.batch_size
        self._kl_loss = 0.
        self._final_state = state
        
        if not is_training:
            return

        #Compute KL divergence

        ## We get the KL loss that was computed during the sampling of the variational posterior !!
        
        kl_loss = tf.add_n(tf.get_collection("KL_layers"), "kl_divergence")
        
        self._kl_loss = kl_loss / (B * C)
        
        # Compute the final loss; it is a trade-off between the likelihood of the data (_cost)
        # and the KL divergence of the posterior.

        # TODO (removed): we previously increased the cost by a factor of 2 so that the total loss
        # would be more influenced by the data.
        self._total_loss = self._cost + self._kl_loss
        
        #####################################################################################
        ################  Setting the training algorithm  ################
        #####################################################################################
        
        ## Get the trainable variables, i.e. the variables for which the gradient of the loss function
        # will be computed and which will be modified by the optimizer when the session is run :)
            
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self._total_loss, tvars),
                                          config.max_grad_norm)
        
        
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(VI.data_type(), shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
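
To make the loss construction above concrete, here is a minimal, self-contained sketch with toy shapes; a constant stands in for the KL term that the real model pulls from the "KL_layers" collection. It shows how the data cost from sequence_loss and the scaled KL divergence add up into the total loss.

import tensorflow as tf

batch_size, num_steps, n_outputs = 4, 5, 10   # toy sizes (C = batch_size in the comments above)
epoch_size = 100                              # B in the comments above

logits = tf.zeros([batch_size, num_steps, n_outputs])
targets = tf.zeros([batch_size, num_steps], dtype=tf.int32)

# Per-element negative log-likelihood; nothing is averaged, so the result is [batch_size, num_steps].
nll = tf.contrib.seq2seq.sequence_loss(
    logits, targets,
    tf.ones([batch_size, num_steps]),
    average_across_timesteps=False,
    average_across_batch=False)
data_cost = tf.reduce_sum(nll) / batch_size

kl = tf.constant(1234.5)                      # placeholder for tf.add_n(tf.get_collection("KL_layers"))
total_loss = data_cost + kl / (epoch_size * batch_size)   # the KL term is spread over all B * C sequences
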
Example #29
    def __init__(self,
                 inp,
                 inp_mask,
                 decode_time_steps,
                 hyper_params=None,
                 name='Tacotron'):
        """
        Build the computational graph.
        :param inp:
        :param inp_mask:
        :param decode_time_steps:
        :param hyper_params:
        :param name:
        """
        super(Tacotron, self).__init__(name)
        self.hyper_params = HyperParams(
        ) if hyper_params is None else hyper_params

        with tf.variable_scope(name):
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

            batch_size = tf.shape(inp)[0]
            input_time_steps = tf.shape(inp)[1]
            reduc = self.hyper_params.reduction_rate
            output_time_steps = decode_time_steps * reduc

            ### Encoder [begin]
            with tf.variable_scope('character_embedding'):
                embed_inp = EmbeddingLayer(self.hyper_params.embed_class,
                                           self.hyper_params.embed_dim)(inp)
            with tf.variable_scope('encoder_pre_net'):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    embed_inp, 256, tf.nn.relu),
                                               training=False)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    pre_ed_inp, 128, tf.nn.relu),
                                               training=False)
            encoder_output = modules.cbhg(pre_ed_inp,
                                          training=False,
                                          k=16,
                                          bank_filters=128,
                                          projection_filters=(128, 128),
                                          highway_layers=4,
                                          highway_units=128,
                                          bi_gru_units=128,
                                          sequence_length=inp_mask,
                                          name='encoder_cbhg',
                                          reuse=False)
            ### Encoder [end]

            ### Attention Module
            with tf.variable_scope('attention'):
                att_module = AttentionModule(256,
                                             encoder_output,
                                             sequence_length=inp_mask,
                                             time_major=False)

            ### Decoder [begin]
            att_cell = ZoneoutWrapper(sGRUCell(256), 0.1, False)
            dec_cell = MultiRNNCell(
                [ResidualWrapper(GRUCell(256)) for _ in range(2)])
            # prepare output alpha TensorArray
            with tf.variable_scope('prepare_decode'):
                # prepare output alpha TensorArray
                reduced_time_steps = tf.div(output_time_steps, reduc)
                init_att_cell_state = att_cell.zero_state(
                    batch_size, tf.float32)
                init_dec_cell_state = dec_cell.zero_state(
                    batch_size, tf.float32)
                init_state_tup = tuple(
                    [init_att_cell_state, init_dec_cell_state])
                init_output_ta = tf.TensorArray(size=reduced_time_steps,
                                                dtype=tf.float32)
                init_alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                               dtype=tf.float32)
                go_array = tf.zeros(
                    [batch_size, self.hyper_params.seq2seq_dim],
                    dtype=tf.float32)
                init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
                init_time = tf.constant(0, dtype=tf.int32)
            cond = lambda x, *_: tf.less(x, reduced_time_steps)

            def body(this_time, old_output_ta, old_alpha_ta, old_state_tup,
                     last_context, last_output):
                with tf.variable_scope('decoder_pre_net'):
                    dec_pre_ed_inp = last_output
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                        dec_pre_ed_inp, 256, tf.nn.relu),
                                                       training=True)
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                        dec_pre_ed_inp, 128, tf.nn.relu),
                                                       training=True)
                with tf.variable_scope('attention_rnn'):
                    att_cell_inp = tf.concat([last_context, dec_pre_ed_inp],
                                             axis=-1)
                    att_cell_out, att_cell_state = att_cell(
                        att_cell_inp, old_state_tup[0])
                with tf.variable_scope('attention'):
                    query = att_cell_state
                    context, alpha = att_module(query)
                    new_alpha_ta = old_alpha_ta.write(this_time, alpha)
                with tf.variable_scope('decoder_rnn'):
                    dec_input = tf.layers.dense(
                        tf.concat([att_cell_out, context], axis=-1), 256)
                    dec_cell_out, dec_cell_state = dec_cell(
                        dec_input, old_state_tup[1])
                    dense_out = tf.layers.dense(
                        dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                    new_output_ta = old_output_ta.write(this_time, dense_out)
                    new_output = dense_out[:, -self.hyper_params.seq2seq_dim:]
                new_state_tup = tuple([att_cell_state, dec_cell_state])
                return tf.add(
                    this_time, 1
                ), new_output_ta, new_alpha_ta, new_state_tup, context, new_output

            # run loop
            _, seq2seq_output_ta, alpha_ta, *_ = tf.while_loop(
                cond, body, [
                    init_time, init_output_ta, init_alpha_ta, init_state_tup,
                    init_context, go_array
                ])
            with tf.variable_scope('reshape_decode'):
                seq2seq_output = tf.reshape(
                    seq2seq_output_ta.stack(),
                    shape=(reduced_time_steps, batch_size,
                           self.hyper_params.seq2seq_dim * reduc))
                seq2seq_output = tf.reshape(
                    tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                    shape=(batch_size, output_time_steps,
                           self.hyper_params.seq2seq_dim))
                self.seq2seq_output = seq2seq_output

                alpha_output = tf.reshape(alpha_ta.stack(),
                                          shape=(reduced_time_steps,
                                                 batch_size, input_time_steps))
                alpha_output = tf.expand_dims(
                    tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
                self.alpha_output = alpha_output
            ### Decoder [end]

            ### PostNet [begin]
            post_output = modules.cbhg(
                seq2seq_output,
                training=False,
                k=8,
                bank_filters=128,
                projection_filters=(256, self.hyper_params.seq2seq_dim),
                highway_layers=4,
                highway_units=128,
                bi_gru_units=128,
                sequence_length=None,
                name='decoder_cbhg',
                reuse=False)
            post_output = tf.layers.dense(post_output,
                                          self.hyper_params.post_dim,
                                          name='post_linear_transform')
            self.post_output = post_output
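
The decoder above unrolls its own loop with tf.while_loop and TensorArray instead of using dynamic_rnn, feeding each step's output back in as the next input. Below is a stripped-down sketch of just that pattern, assuming toy dimensions, a single GRU cell and no attention; all names here are illustrative.

import tensorflow as tf

batch_size, units, steps = 2, 8, 5
cell = tf.nn.rnn_cell.GRUCell(units)

init_time = tf.constant(0, dtype=tf.int32)
init_ta = tf.TensorArray(size=steps, dtype=tf.float32)    # one slot per decode step
init_state = cell.zero_state(batch_size, tf.float32)
go_frame = tf.zeros([batch_size, units])                  # <GO> input for the first step

cond = lambda t, *_: tf.less(t, steps)

def body(t, ta, state, last_output):
    cell_out, new_state = cell(last_output, state)        # feed the previous output back in
    return t + 1, ta.write(t, cell_out), new_state, cell_out

_, out_ta, *_ = tf.while_loop(cond, body, [init_time, init_ta, init_state, go_frame])
outputs = tf.transpose(out_ta.stack(), (1, 0, 2))         # [steps, batch, units] -> [batch, steps, units]
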
    def __init__(self, inp, inp_mask, decode_time_steps, ctr_flag, ctr_attention, hyper_params=None, name='Tacotron'):
        """
        Build the computational graph.
        :param inp:
        :param inp_mask:
        :param decode_time_steps:
        :param hyper_params:
        :param name:
        """
        super(Tacotron, self).__init__(name)
        self.hyper_params = HyperParams() if hyper_params is None else hyper_params

        with tf.variable_scope(name):
            self.global_step = tf.Variable(0, name='global_step', trainable=False)

            batch_size = tf.shape(inp)[0]
            input_time_steps = tf.shape(inp)[1]
            reduc = self.hyper_params.reduction_rate
            output_time_steps = decode_time_steps * reduc

            ### Encoder [begin]
            with tf.variable_scope('character_embedding'):
                embed_inp = EmbeddingLayer(self.hyper_params.embed_class, self.hyper_params.embed_dim)(inp)
            with tf.variable_scope("changeToVarible"):
                self.single_style_token = tf.get_variable('style_token', (1, self.hyper_params.styles_kind, self.hyper_params.style_dim), dtype=tf.float32)
                self.style_token = tf.tile(self.single_style_token, (batch_size, 1, 1))
            with tf.variable_scope('encoder_pre_net'):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(embed_inp, 256, tf.nn.relu), training=False)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(pre_ed_inp, 128, tf.nn.relu), training=False)
            encoder_output = modules.cbhg(pre_ed_inp, training=False, k=16, bank_filters=128,
                                          projection_filters=(128, 128), highway_layers=4, highway_units=128,
                                          bi_gru_units=128, sequence_length=inp_mask,
                                          name='encoder_cbhg', reuse=False)

            with tf.variable_scope('post_text'):
                all_outputs, _ = tf.nn.dynamic_rnn(cell=GRUCell(256), inputs=encoder_output, sequence_length=inp_mask,
                                               dtype=encoder_output.dtype, parallel_iterations=unkonwn_parallel_iterations)
                all_outputs = tf.transpose(all_outputs, [1, 0, 2])
                static_encoder_output = all_outputs[-1]
            ### Encoder [end]

            ### Attention Module
            with tf.variable_scope('attention'):
                att_module = AttentionModule(256, encoder_output, sequence_length=inp_mask, time_major=False)
            with tf.variable_scope("attention_style"):
                att_module_style = AttentionModule(256, self.style_token, time_major=False)

            ### Decoder [begin]
            att_cell = GRUCell(256)
            dec_cell = MultiRNNCell([ResidualWrapper(GRUCell(256)) for _ in range(2)])
            # prepare output alpha TensorArray
            with tf.variable_scope('prepare_decode'):
                # prepare output alpha TensorArray
                reduced_time_steps = tf.div(output_time_steps, reduc)
                init_att_cell_state = att_cell.zero_state(batch_size, tf.float32)
                init_dec_cell_state = dec_cell.zero_state(batch_size, tf.float32)
                init_state_tup = tuple([init_att_cell_state, init_dec_cell_state])
                init_output_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_alpha_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_weight_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_weight_per_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_alpha_style_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                go_array = tf.zeros([batch_size, self.hyper_params.seq2seq_dim], dtype=tf.float32)
                init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
                init_context_style = tf.zeros([batch_size, 256], dtype=tf.float32)
                init_time = tf.constant(0, dtype=tf.int32)
            cond = lambda x, *_: tf.less(x, reduced_time_steps)
            def body(this_time, old_output_ta, old_alpha_ta, old_alpha_style_ta, old_weight_ta, old_weight_per_ta,
                     old_state_tup, last_context, last_context_style, last_output):
                with tf.variable_scope('decoder_pre_net'):
                    dec_pre_ed_inp = last_output
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(dec_pre_ed_inp, 256, tf.nn.relu), training=False)
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(dec_pre_ed_inp, 128, tf.nn.relu), training=False)
                with tf.variable_scope('attention_rnn'):
                    # dec_pre_ed_inp = tf.Print(dec_pre_ed_inp, [dec_pre_ed_inp[0]], message='dec', summarize=10)
                    att_cell_inp = tf.concat([last_context, dec_pre_ed_inp], axis=-1)
                    att_cell_out, att_cell_state = att_cell(att_cell_inp, old_state_tup[0])
                with tf.variable_scope('attention'):
                    query = att_cell_state[0]
                    context, alpha = att_module(query)
                    new_alpha_ta = old_alpha_ta.write(this_time, alpha)
                with tf.variable_scope("attention_style"):
                    query_style = att_cell_state[0]
                    context_style, alpha_style = att_module_style(query_style)
                    alpha_style = tf.cond(tf.equal(ctr_flag, 1), lambda: ctr_attention, lambda: alpha_style)
                    alpha_style = tf.Print(alpha_style, [alpha_style], message='alpha:', summarize=10)
                    context_style = tf.cond(tf.equal(ctr_flag, 1),
                                            lambda: tf.reduce_sum(tf.expand_dims(alpha_style, axis=-1) * self.style_token, axis=1),
                                            lambda: context_style)
                    context_style = tf.Print(context_style, [context_style], message='style:', summarize=10)
                    # alpha_style = ctr_attention
                    # alpha_style = tf.Print(alpha_style, [alpha_style], message='alpha', summarize=20)
                    # context_style = tf.reduce_sum(tf.expand_dims(alpha_style, axis=-1) * self.style_token, axis=1)
                    # context_style = tf.Print(context_style, [context_style], message='ctxt_style', summarize=20)
                    new_alpha_style_ta = old_alpha_style_ta.write(this_time, alpha_style)
                with tf.variable_scope("weighting"):
                    weight_input = tf.concat([static_encoder_output, dec_pre_ed_inp], axis=-1)
                    weighting = tf.layers.dense(weight_input, 2, tf.nn.sigmoid)
                    # weighting = tf.Print(weighting, [weighting[1]], message='weighting')
                    weighting = tf.nn.softmax(weighting)
                    weight_text, weight_style = tf.split(weighting, [1, 1], -1)
                    # weight_text = tf.Print(weight_text, [weight_text], message='weight_text:', summarize=20)
                    weight_style = tf.Print(weight_style, [weight_style], message='weight_style:')
                    new_weight_ta = old_weight_ta.write(this_time, weight_text)
                with tf.variable_scope('decoder_rnn'):
                    weighting_context = weight_text * context + weight_style * context_style
                    weight_per = tf.reduce_mean(tf.abs(weight_style * context_style) / (
                            tf.abs(weight_text * context) + tf.abs(weight_style * context_style)))
                    new_weight_per_ta = old_weight_per_ta.write(this_time, weight_per)
                    dec_input = tf.layers.dense(tf.concat([att_cell_out, weighting_context], axis=-1), 256)
                    # dec_input = tf.layers.dense(tf.concat([att_cell_out, context], axis=-1), 256)
                    dec_cell_out, dec_cell_state = dec_cell(dec_input, old_state_tup[1])
                    dense_out = tf.layers.dense(dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                    new_output_ta = old_output_ta.write(this_time, dense_out)
                    new_output = dense_out[:, -self.hyper_params.seq2seq_dim:]
                new_state_tup = tuple([att_cell_state, dec_cell_state])
                return tf.add(this_time, 1), new_output_ta, new_alpha_ta, new_alpha_style_ta, new_weight_ta,\
                       new_weight_per_ta, new_state_tup, context, context_style, new_output


            # run loop
            _, seq2seq_output_ta, alpha_ta, alpha_style_ta, weight_ta, weight_per_ta, *_ = tf.while_loop(cond, body, [init_time,
                                                                                                                      init_output_ta,
                                                                                                                      init_alpha_ta,
                                                                                                                      init_alpha_style_ta,
                                                                                                                      init_weight_ta,
                                                                                                                      init_weight_per_ta,
                                                                                                                      init_state_tup,
                                                                                                                      init_context,
                                                                                                                      init_context_style,
                                                                                                                      go_array
                                                                                                                      ])
            with tf.variable_scope('reshape_decode'):
                seq2seq_output = tf.reshape(seq2seq_output_ta.stack(),
                                            shape=(reduced_time_steps, batch_size, self.hyper_params.seq2seq_dim * reduc))
                seq2seq_output = tf.reshape(tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                                            shape=(batch_size, output_time_steps, self.hyper_params.seq2seq_dim))
                self.seq2seq_output = seq2seq_output

                alpha_output = tf.reshape(alpha_ta.stack(),
                                          shape=(reduced_time_steps, batch_size, input_time_steps))
                alpha_output = tf.expand_dims(tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
                self.alpha_output = alpha_output

                alpha_output_style = tf.reshape(alpha_style_ta.stack(),
                                                shape=(reduced_time_steps, batch_size, self.hyper_params.styles_kind))
                alpha_output_style = tf.expand_dims(tf.transpose(alpha_output_style, perm=(1, 0, 2)), -1)  # batch major
                self.alpha_output_style = alpha_output_style

                weight_ta = tf.reshape(weight_ta.stack(), shape=(reduced_time_steps, batch_size, 1))
                weight_ta = tf.transpose(weight_ta, perm=(1, 0, 2))
                self.weight_ta = weight_ta

                weight_per_ta = tf.reshape(weight_per_ta.stack(), shape=(reduced_time_steps, 1))
                self.weight_per_ta = weight_per_ta
            ### Decoder [end]

            ### PostNet [begin]
            post_output = modules.cbhg(seq2seq_output, training=False, k=8, bank_filters=128,
                                       projection_filters=(256, self.hyper_params.seq2seq_dim),
                                       highway_layers=4, highway_units=128,
                                       bi_gru_units=128, sequence_length=None,
                                       name='decoder_cbhg', reuse=False)
            post_output = tf.layers.dense(post_output, self.hyper_params.post_dim, name='post_linear_transform')
            self.post_output = post_output
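
The main addition in this second variant is that the decoder blends the text context and the style-token context with a learned two-way gate (the "weighting" scope above). A minimal sketch of just that gate, with made-up tensors standing in for the real intermediate values:

import tensorflow as tf

batch_size, ctx_dim, prenet_dim = 2, 256, 128               # toy sizes
static_encoder_output = tf.zeros([batch_size, ctx_dim])     # summary vector of the text encoder
dec_pre_ed_inp = tf.zeros([batch_size, prenet_dim])         # decoder pre-net output for this step
context = tf.zeros([batch_size, ctx_dim])                   # text attention context
context_style = tf.zeros([batch_size, ctx_dim])             # style-token attention context

weight_input = tf.concat([static_encoder_output, dec_pre_ed_inp], axis=-1)
weighting = tf.nn.softmax(tf.layers.dense(weight_input, 2, tf.nn.sigmoid))   # two weights that sum to 1
weight_text, weight_style = tf.split(weighting, [1, 1], -1)                  # each [batch_size, 1]
weighting_context = weight_text * context + weight_style * context_style     # blended decoder context
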
Example #31
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """
        构建解码器cell
        :param encoder_outputs:
        :param encoder_state:
        :return:
        """
        encoder_input_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirection:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        if self.use_beamsearch_decode:
            # Make multiple copies (tile each tensor beam_width times for beam search)
            encoder_outputs = seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width
            )
            encoder_state = seq2seq.tile_batch(
                encoder_state, multiplier=self.beam_width
            )
            encoder_input_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width
            )
            batch_size *= self.beam_width

        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_size,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )
        else:
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_size,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )

        cell = MultiRNNCell([
            self.build_single_cell(
                self.hidden_size,
                use_residual=self.use_residual)
            for _ in range(self.depth)
        ])

        alignment_history = (
            self.mode != 'train' and not self.use_beamsearch_decode
        )

        def cell_input_fn(inputs, attention):
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_size,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(
                                cell=cell,
                                attention_mechanism=self.attention_mechanism,
                                attention_layer_size=self.hidden_size,
                                alignment_history=alignment_history,
                                cell_input_fn=cell_input_fn,
                                name='Attention_Wrapper'
        )

        decoder_initial_state = cell.zero_state(
            batch_size, tf.float32)

        # Pass the encoder state through to the decoder's initial state
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state
        )

        return cell, decoder_initial_state
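
For reference, a hedged sketch of how a cell built this way is typically prepared for beam search: every memory tensor is tiled beam_width times and the wrapper's zero state is cloned with the encoder state, mirroring the method above. The shapes, the GRU cell and the zero encoder state here are illustrative, not taken from the original code.

import tensorflow as tf
from tensorflow.contrib import seq2seq

batch_size, time_steps, hidden_size, beam_width = 2, 7, 64, 3
encoder_outputs = tf.zeros([batch_size, time_steps, hidden_size])
encoder_lengths = tf.fill([batch_size], time_steps)

# Tile the memory beam_width times, as build_decoder_cell does when use_beamsearch_decode is set.
tiled_outputs = seq2seq.tile_batch(encoder_outputs, multiplier=beam_width)      # [batch*beam, T, H]
tiled_lengths = seq2seq.tile_batch(encoder_lengths, multiplier=beam_width)

attention = seq2seq.BahdanauAttention(num_units=hidden_size,
                                      memory=tiled_outputs,
                                      memory_sequence_length=tiled_lengths)
cell = seq2seq.AttentionWrapper(tf.nn.rnn_cell.GRUCell(hidden_size),
                                attention,
                                attention_layer_size=hidden_size)

# Initialise from zeros, then clone in the (tiled) encoder state, as in the method above.
encoder_state = tf.zeros([batch_size * beam_width, hidden_size])                # stand-in encoder state
decoder_initial_state = cell.zero_state(batch_size * beam_width,
                                        tf.float32).clone(cell_state=encoder_state)
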
    def __init__(self, is_training, config, input_):
        self._is_training = is_training
        self._input = input_
        self.batch_size = input_.batch_size
        self.num_steps = input_.num_steps

        self._input_data = input_.input_data
        size = config.X_dim
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size

        self._targets = input_.targets
        # Construct prior
        prior = Prior(config.prior_pi, config.log_sigma1, config.log_sigma2)

        # Fetch embeddings
        inputs = input_.input_data
        # Build the BBB LSTM cells
        cells = []
        for i in range(config.num_layers):
            if (i == 0):
                LSTM_input_size = config.X_dim
            else:
                LSTM_input_size = config.hidden_size

            cells.append(
                BayesianLSTMCell(LSTM_input_size,
                                 config.hidden_size,
                                 prior,
                                 is_training,
                                 forget_bias=0.0,
                                 name="bbb_lstm_{}".format(i)))

        cell = MultiRNNCell(cells, state_is_tuple=True)
        self._initial_state = cell.zero_state(config.batch_size, data_type())
        state = self._initial_state

        # Forward pass for the truncated mini-batch
        outputs = []
        with tf.variable_scope("RNN"):
            for time_step in range(self.num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
        output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

        # Softmax weights
        softmax_w = sample_posterior((hidden_size, vocab_size), "softmax_w",
                                     prior, is_training)
        softmax_b = sample_posterior((vocab_size, 1), "softmax_b", prior,
                                     is_training)

        logits = tf.nn.xw_plus_b(output, softmax_w, tf.squeeze(softmax_b))
        logits = tf.reshape(logits,
                            [self.batch_size, self.num_steps, vocab_size])

        self._output = tf.nn.softmax(logits)

        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            input_.targets,
            tf.ones([self.batch_size, self.num_steps], dtype=data_type()),
            average_across_timesteps=False,
            average_across_batch=False)

        # Update the cost
        # Remember to divide by batch size
        self._cost = tf.reduce_sum(loss) / self.batch_size
        self._kl_loss = 0.
        self._final_state = state

        if not is_training:
            return

        #Compute KL divergence
        #B = number of batches aka the epoch size
        #C = number of truncated sequences in a batch aka batch_size variable
        B = self._input.epoch_size
        C = self.batch_size

        kl_loss = tf.add_n(tf.get_collection("KL_layers"), "kl_divergence")

        kl_factor = 1.0 / (B * C)
        self._kl_loss = kl_factor * kl_loss

        self._total_loss = self._cost + self._kl_loss

        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self._total_loss, tvars), config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(data_type(),
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
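
Both Bayesian-LSTM examples expose the same learning-rate machinery: a non-trainable variable plus a placeholder and an assign op. A small usage sketch follows; the assign_lr helper, the session loop and the halving schedule are illustrative, not part of the original code.

import tensorflow as tf

lr = tf.Variable(0.0, trainable=False)
new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
lr_update = tf.assign(lr, new_lr)

def assign_lr(session, lr_value):
    # Push a Python float into the graph variable before running the next epoch.
    session.run(lr_update, feed_dict={new_lr: lr_value})

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(3):
        assign_lr(sess, 1.0 * 0.5 ** epoch)    # e.g. halve the learning rate every epoch
        print(sess.run(lr))                    # 1.0, 0.5, 0.25
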
Example #33
class RNN(object):
	def __init__(self, config, batch_size, seq_len, **kwargs):
		assert config.name is not None
		assert config.num_hidden > 0
		assert config.num_layers > 0
		assert config.num_classes > 0
		self._config = config
		self._batch_size = batch_size
		self._seq_len = seq_len
		self._input_ids = None
		self._target_ids = None
		self._stack = None
		self._state = None
		self._probs = None
		self._loss = None		
		dropout = kwargs.get('dropout', 0)
		assert 0 <= dropout < 1
		self._build_network(dropout)

	def _build_network(self, dropout):
		# Legend for tensor shapes below:
		# 	B := batch size
		# 	C := number of classes
		# 	H := number of hidden units (aka layer size)
		# 	S := sequence length

		# keep a reference to _config to make code below simpler
		config = self._config

		# Create size BxS input and target placeholder tensors
		# These will be filled in with actual values at session runtime
		data_dims = [self._batch_size, self._seq_len]
		self._input_ids = tf.placeholder(tf.int32, data_dims)
		self._target_ids = tf.placeholder(tf.int64, data_dims)

		# Create an embedding tensor to represent integer inputs into H dimensions
		# This must be done on the CPU, according to:
		# https://github.com/tensorflow/tensorflow/blob/r0.7/tensorflow/examples/tutorials/word2vec/word2vec_basic.py#L143
		# (Ops and variables pinned to the CPU because of missing GPU implementation)
		with tf.device("/cpu:0"):
			# embeddings is a CxH tensor
			embeddings = tf.get_variable('embeddings', [config.num_classes, config.num_hidden])
			# embedded is a BxSxH tensor
			embedded = tf.nn.embedding_lookup(embeddings, self._input_ids)
			# sequences is a list of length S containing Bx1xH tensors
			sequences = tf.split(embedded, self._seq_len, 1)
			# perform a "squeeze" on each item in the sequence list 
			# inputs is a list of length S containing BxH tensors
			inputs = [tf.squeeze(seq, [1]) for seq in sequences]
		
		# create LSTM cell and stack
		cell = BasicLSTMCell(config.num_hidden)
		if dropout > 0:
			keep_prob = 1 - dropout
			cell = DropoutWrapper(cell, output_keep_prob=keep_prob)
		self._stack = MultiRNNCell([cell]*config.num_layers)
		self._state = self._stack.zero_state(self._batch_size, tf.float32)

		# Pump the inputs through the RNN layers
		# outputs is a list of length S containing BxH tensors
		outputs, self._state = static_rnn(self._stack, inputs, initial_state=self._state)
		# assert len(outputs) == self._seq_len
		#assert outputs[0].get_shape() == (self._batch_size, config.num_hidden), outputs[0].get_shape()

		# Softmax weight tensor is HxC
		W_soft = tf.get_variable('W_soft', [config.num_hidden, config.num_classes])
		# Softmax bias tensor is Cx1
		b_soft = tf.get_variable('b_soft', [config.num_classes])

		# Reshape the output so that we can use it with the softmax weights and bias:
		# 	- concat makes list into a BxSH tensor,
		# 	- reshape converts the BxSH tensor into a BSxH tensor
		output = tf.reshape(tf.concat(outputs, 1), [-1, config.num_hidden])
		#assert output.get_shape() == (self._batch_size*self._seq_len, config.num_hidden), output.get_shape()

		# logits is a (BSxH).(HxC) + 1xC = BSxC + 1xC = BSxC tensor
		logits = tf.nn.xw_plus_b(output, W_soft, b_soft)
		#assert logits.get_shape() == (self._batch_size*self._seq_len, config.num_classes), logits.get_shape()

		# probs is a BSxC tensor, with entry (i,j) containing the probability that batch i is class j
		self._probs = tf.nn.softmax(logits)
		#assert self._probs.get_shape() == (self._batch_size*self._seq_len, config.num_classes), self._probs.get_shape()

		# targets is a flattened tensor of length BS
		targets = tf.reshape(self._target_ids, [self._batch_size*self._seq_len])
		# cross_entropy is a tensor of length BS
		cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
		#assert cross_entropy.get_shape() == (self._batch_size*self._seq_len)
		
		# loss is a scalar containing the mean of cross_entropy losses
		self._loss = tf.reduce_mean(cross_entropy)

	def reset_initial_state(self):
		self._state = self._stack.zero_state(self._batch_size, tf.float32)

	@property
	def config(self):
		return self._config

	@property
	def inputs(self):
		return self._input_ids

	@inputs.setter
	def inputs(self, input_ids):
		self._input_ids = input_ids

	@property
	def targets(self):
		return self._target_ids

	@targets.setter
	def targets(self, target_ids):
		self._target_ids = target_ids

	@property
	def initial_state(self):
		return self._state

	@property
	def probs(self):
		return self._probs

	@property
	def loss(self):
		return self._loss
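
As a sanity check on the shape legend in _build_network, here is a toy sketch with made-up sizes of the embedding_lookup / split / squeeze bookkeeping: a BxS id tensor becomes a BxSxH embedding, then a list of S tensors of shape BxH.

import tensorflow as tf

B, S, H, C = 4, 6, 16, 32                      # batch size, sequence length, hidden units, classes
input_ids = tf.zeros([B, S], dtype=tf.int32)
embeddings = tf.get_variable('toy_embeddings', [C, H])

embedded = tf.nn.embedding_lookup(embeddings, input_ids)    # BxSxH
sequences = tf.split(embedded, S, 1)                        # list of S tensors, each Bx1xH
inputs = [tf.squeeze(seq, [1]) for seq in sequences]        # list of S tensors, each BxH

print(embedded.shape, sequences[0].shape, inputs[0].shape)  # (4, 6, 16) (4, 1, 16) (4, 16)
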