Example #1
0
def Decoder_LSTM(inputs, sequence_length, attention_mechanism, is_training= False):
    '''
    Build the attention-wrapped zoneout-LSTM decoder and run dynamic decoding.

    hp.Decoder.LSTM.Nums ZoneoutLSTMCell layers are stacked in a MultiRNNCell,
    wrapped in an AttentionWrapper (alignment history kept, attention not
    emitted as the cell output) and decoded with Decoder_Helper,
    Decoder_Decoder and Decoder_Dynamic_Decode.

    Original author's note: in inference, inputs and sequence_length will be
    ignored (presumably by Decoder_Helper -- confirm). `inputs` is still read
    here to obtain the batch size of the initial state.

    Returns the (final_outputs, final_state) pair produced by
    Decoder_Dynamic_Decode; its third result is discarded.
    '''
    zoneout_Cells = [
        ZoneoutLSTMCell(
            num_units= hp.Decoder.LSTM.Cell_Size,
            is_training= is_training,
            cell_zoneout_rate= hp.Decoder.LSTM.Zoneout_Rate,
            output_zoneout_rate= hp.Decoder.LSTM.Zoneout_Rate
            )
        for _ in range(hp.Decoder.LSTM.Nums)
        ]
    stacked_Cell = tf.nn.rnn_cell.MultiRNNCell(zoneout_Cells)

    attention_Cell = AttentionWrapper(
        cell= stacked_Cell,
        attention_mechanism= attention_mechanism,
        attention_layer_size= None,
        alignment_history= True,
        cell_input_fn= None,
        output_attention= False,
        initial_cell_state= None,
        name= None,
        attention_layer= None
        )

    decode_Helper = Decoder_Helper(
        inputs= inputs,  # Mel
        sequence_length= sequence_length,  # Mel_length
        time_major= False,
        is_training= is_training,
        name= None
        )

    # The batch size is taken from the leading dimension of `inputs`.
    start_State = attention_Cell.zero_state(tf.shape(inputs)[0], tf.float32)
    decoder = Decoder_Decoder(
        cell= attention_Cell,
        helper= decode_Helper,
        initial_state= start_State
        )

    decoded_Outputs, decoded_State, _ = Decoder_Dynamic_Decode(
        decoder= decoder,
        impute_finished= False  # True
        )
    return decoded_Outputs, decoded_State
Example #2
0
 def _build_train(self, config):
   """Create self.logits for the model selected by config.model_name.

   "fasttext_flat" and "RCNN_flat" project a single feature tensor
   (self.first_attention / self.xx_final) to config.n_classes with a linear
   fully-connected layer. Any other model name runs an attention decoder
   with a TrainingHelper over self.yy and uses its rnn_output as logits.
   """
   # decode
   # Name of the attribute feeding the linear classifier, per flat model.
   # Looked up lazily so only the attribute that is actually needed is read.
   flat_feature_attr = {
       "fasttext_flat": "first_attention",
       "RCNN_flat": "xx_final",
   }
   model_name = config.model_name

   if model_name in flat_feature_attr:
     features = getattr(self, flat_feature_attr[model_name])
     self.logits = tf.contrib.layers.fully_connected(features, config.n_classes, activation_fn=None)
     print("logits:", self.logits.get_shape())
     self.logits = tf.reshape(self.logits, [-1, config.n_classes])
     return

   # The same tensor is used for both fields of the initial LSTM state.
   encoder_state = rnn.LSTMStateTuple(self.xx_final, self.xx_final)
   attention_mechanism = BahdanauAttention(
       config.decode_size,
       memory=self.xx_context,
       memory_sequence_length=self.x_seq_length)
   cell = AttentionWrapper(self.lstm, attention_mechanism, output_attention=False)

   zero_state = cell.zero_state(dtype=tf.float32, batch_size=config.batch_size)
   initial_state = zero_state.clone(cell_state=encoder_state, attention=self.first_attention)

   train_helper = TrainingHelper(self.yy, self.y_seq_length)
   train_decoder = BasicDecoder(cell, train_helper, initial_state, output_layer=self.output_l)
   decoded, _, _ = dynamic_decode(train_decoder, impute_finished=True)

   self.decoder_outputs_train = decoded
   self.logits = self.decoder_outputs_train.rnn_output
   print("logits:", self.logits.get_shape())
Example #3
0
def pointer_net(inputs, input_lengths, n_pointers, word_matrix, cell_type, n_layers, n_units,
                dropout_prob, is_training=True):
    """Pointer network.

    Encodes `inputs` with an RNN, then greedily decodes with an attention
    decoder for at most `n_pointers` steps and returns the recorded attention
    alignments as the pointer distributions.

    Args:
        inputs (tensor):        Inputs to pointer network (typically output of previous RNN)
        input_lengths (tensor): Actual non-padded lengths of each input sequence
        n_pointers (int):       Number of pointers to generate
        word_matrix (tensor):   Embedding matrix of word vectors
        cell_type (method):     Cell type to use
        n_layers (int):         Number of layers in RNN (same for encoder & decoder)
        n_units (int):          Number of units in RNN cell (same for encoder & decoder)
        dropout_prob (float):   Dropout probability
        is_training (bool):     Whether the model is training or testing

    Returns:
        The stacked alignment history reshaped to
        [n_pointers, batch_size, seq_length].
    """
    # Static shapes are required: batch_size and seq_length feed tf.constant
    # and tf.reshape below as Python values.
    batch_size, seq_length, _ = inputs.get_shape().as_list()
    vocab_size = word_matrix.get_shape().as_list()[0]

    def _dropout_cell():
        # Output dropout is only active during training.
        if is_training:
            keep = 1 - dropout_prob
        else:
            keep = 1
        return DropoutWrapper(cell_type(n_units), output_keep_prob=keep)

    def _cell_stack():
        # A single layer is returned bare rather than wrapped in MultiRNNCell.
        if n_layers > 1:
            return MultiRNNCell([_dropout_cell() for _ in range(n_layers)])
        return _dropout_cell()

    encoder_cell = _cell_stack()
    memory, _ = tf.nn.dynamic_rnn(encoder_cell, inputs, input_lengths, dtype=tf.float32)

    attention = BahdanauAttention(n_units, memory, memory_sequence_length=input_lengths)
    # TODO: find permanent solution (InferenceHelper?)
    start_tokens = tf.constant(START_TOKEN, shape=[batch_size], dtype=tf.int32)
    helper = GreedyEmbeddingHelper(word_matrix, start_tokens, END_TOKEN)

    decoder_cell = _cell_stack()
    attn_cell = AttentionWrapper(decoder_cell, attention, alignment_history=True)
    projected_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, vocab_size)
    initial_state = attn_cell.zero_state(batch_size, tf.float32)
    decoder = BasicDecoder(projected_cell, helper, initial_state)

    _, final_state, _ = dynamic_decode(decoder, maximum_iterations=n_pointers, impute_finished=True)
    alignments = final_state.alignment_history.stack()
    return tf.reshape(alignments, [n_pointers, batch_size, seq_length])
Example #4
0
    def add_decoder_cell(self, encoder_outputs, encoder_states, hidden_size,
                         cell_type, num_layers):
        """Build the attention decoder cell and its initial state.

        In 'decode' mode the encoder outputs, states and source lengths are
        tiled self.beam_size times and the initial state is sized for
        beam_size * batch_size rows. The Bahdanau attention mechanism is also
        stored on self.attention.

        Returns:
            (decoder_cell, decoder_initial_state), where the initial state is
            the wrapper's zero state with cell_state replaced by
            encoder_states.
        """
        memory_lengths = self.source_len
        if self.mode == 'decode':
            tile = tf.contrib.seq2seq.tile_batch
            encoder_outputs = tile(encoder_outputs, multiplier=self.beam_size)
            encoder_states = tile(encoder_states, multiplier=self.beam_size)
            memory_lengths = tile(memory_lengths, multiplier=self.beam_size)

        # The decoder width doubles when the encoder is bidirectional.
        if self.bidirection:
            hidden_size_ = hidden_size * 2
        else:
            hidden_size_ = hidden_size

        layers = []
        for _ in range(num_layers):
            layers.append(self.one_cell(hidden_size_, cell_type))
        stacked_cell = MultiRNNCell(layers)

        # NOTE(review): the attention depth comes from self.hidden_size, not
        # from the hidden_size argument -- confirm this is intended.
        self.attention = BahdanauAttention(self.hidden_size, encoder_outputs,
                                           memory_lengths)

        def cell_input_fn(inputs, attention):
            # Project [inputs; attention] back to the cell width, no bias.
            merged = tf.concat([inputs, attention], axis=-1)
            projection = tf.layers.Dense(hidden_size_,
                                         dtype=tf.float32,
                                         use_bias=False,
                                         name='att_proj')
            return projection(merged)

        decoder_cell = AttentionWrapper(cell=stacked_cell,
                                        attention_mechanism=self.attention,
                                        attention_layer_size=hidden_size,
                                        cell_input_fn=cell_input_fn,
                                        name='attentionwrapper')

        if self.mode == 'decode':
            d_size = self.beam_size * self.batch_size
        else:
            d_size = self.batch_size

        zero_state = decoder_cell.zero_state(batch_size=d_size,
                                             dtype=tf.float32)
        decoder_initial_state = zero_state.clone(cell_state=encoder_states)

        return decoder_cell, decoder_initial_state
Example #5
0
    def __graph__(self):

        # encoder
        encoder_outputs, encoder_state = self.encoder()

        # decoder
        with tf.variable_scope('decoder'): ##作用域,'/'
            encoder_inputs_length = self.encoder_inputs_length
            if self.beam_search:
                # 如果使用beam_search,则需要将encoder的输出进行tile_batch,其实就是复制beam_size份。
                print("use beamsearch decoding..")
                encoder_outputs = tile_batch(encoder_outputs, multiplier=self.beam_size)
                encoder_state = nest.map_structure(lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size), encoder_state)
                encoder_inputs_length = tile_batch(encoder_inputs_length, multiplier=self.beam_size)

            # 定义要使用的attention机制。
            attention_mechanism = BahdanauAttention(num_units=self.rnn_size,
                                                    memory=encoder_outputs,
                                                    memory_sequence_length=encoder_inputs_length)
            # 定义decoder阶段要是用的RNNCell,然后为其封装attention wrapper
            decoder_cell = self.create_rnn_cell()
            decoder_cell = AttentionWrapper(cell=decoder_cell,
                                            attention_mechanism=attention_mechanism,
                                            attention_layer_size=self.rnn_size,
                                            name='Attention_Wrapper')
            # 如果使用beam_seach则batch_size = self.batch_size * self.beam_size
            batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size

            # 定义decoder阶段的初始化状态,直接使用encoder阶段的最后一个隐层状态进行赋值
            decoder_initial_state = decoder_cell.zero_state(batch_size=batch_size,
                                                            dtype=tf.float32).clone(cell_state=encoder_state)

            output_layer = tf.layers.Dense(self.vocab_size, kernel_initializer=tf.truncated_normal_initializer(
                                                            mean=0.0,9
                                                            stddev=0.1))

            if self.mode == 'train':
Example #6
0
def decoding_layer(decoding_embed_inp, embeddings, encoding_op, encoding_st,
                   v_size, fr_len, en_len, max_en_len, rnn_cell_size, word2int,
                   dropout_prob, batch_size, n_layers):
    """Build the attention decoder and return training and inference logits.

    A dropout-wrapped cell from get_rnn_cell is wrapped with Bahdanau
    attention over `encoding_op` (memory lengths `fr_len`). Its initial state
    is the wrapper's zero state with cell_state replaced by `encoding_st[0]`.
    The same cell, initial state and output Dense layer (`v_size` units) are
    passed to training_decoding_layer and, under variable reuse, to
    inference_decoding_layer.

    Returns:
        (logits_tr, logits_inf) as produced by those two helpers.

    Raises:
        ValueError: if n_layers is less than 1, since no cell would be built.
    """
    if n_layers < 1:
        raise ValueError(
            'n_layers must be >= 1, got {}'.format(n_layers))

    # NOTE(review): every iteration rebinds decoding_cell, so only the cell
    # built in the last scope is used below; the layers are not stacked.
    for layer_idx in range(n_layers):
        with tf.variable_scope('decs_rnn_layer_{}'.format(layer_idx)):
            gru = get_rnn_cell(rnn_cell_size, dropout_prob)
            # NOTE(review): dropout_prob is passed as a *keep* probability
            # here -- confirm the intended meaning with the caller.
            decoding_cell = tf.contrib.rnn.DropoutWrapper(
                gru, input_keep_prob=dropout_prob)
    out_l = Dense(v_size,
                  kernel_initializer=tf.truncated_normal_initializer(
                      mean=0.0, stddev=0.1))

    attention = BahdanauAttention(rnn_cell_size,
                                  encoding_op,
                                  fr_len,
                                  normalize=False,
                                  name='BahdanauAttention')
    # Fixed: the attention layer size previously referenced the undefined
    # name `rnn_len`; the cell size parameter is `rnn_cell_size`.
    decoding_cell = AttentionWrapper(decoding_cell, attention, rnn_cell_size)
    attention_zero_state = decoding_cell.zero_state(batch_size, tf.float32)
    attention_zero_state = attention_zero_state.clone(
        cell_state=encoding_st[0])
    with tf.variable_scope("decoding_layer"):
        logits_tr = training_decoding_layer(decoding_embed_inp, en_len,
                                            decoding_cell,
                                            attention_zero_state, out_l,
                                            v_size, max_en_len)
    # Second pass reuses the variables created by the training decoder.
    with tf.variable_scope("decoding_layer", reuse=True):
        logits_inf = inference_decoding_layer(embeddings, word2int["TOKEN_GO"],
                                              word2int["TOKEN_EOS"],
                                              decoding_cell,
                                              attention_zero_state, out_l,
                                              max_en_len, batch_size)

    return logits_tr, logits_inf
Example #7
0
def model_fn(features, labels, mode, params):
    """Estimator model function: encode passage and question, decode an answer.

    Args:
        features: mapping read with the keys 'question_words',
            'passage_words', 'question_length', 'passage_length',
            'answer_start', 'answer_end', plus 'answer_words' (TRAIN) and
            'answer_length' (TRAIN and EVAL).
        labels: target ids; sliced on axis 1 to the longest decoded length
            before the loss is computed (not used in PREDICT).
        mode: a tf.estimator.ModeKeys value.
        params: object whose attributes vocab_size, emb_size, word_vocab_file,
            layers, units, batch_size, tgt_sos_id, tgt_eos_id,
            answer_max_words and grad_clip are read here; it is also passed
            on to biGRU, SNetDecoder and rouge_l.

    Returns:
        An EstimatorSpec for PREDICT, TRAIN or EVAL. For any other mode the
        function falls off the end and returns None.
    """
    # One embedding matrix is shared by the question, the passage, the
    # training answer inputs and the greedy decoding helper.
    embedding_encoder = tf.get_variable('embedding_encoder',
                                        shape=(params.vocab_size,
                                               params.emb_size))
    # Maps predicted ids back to strings (used in PREDICT and by rouge_l).
    table = lookup_ops.index_to_string_table_from_file(params.word_vocab_file)

    question_emb = tf.nn.embedding_lookup(embedding_encoder,
                                          features['question_words'])
    passage_emb = tf.nn.embedding_lookup(embedding_encoder,
                                         features['passage_words'])

    question_words_length = features['question_length']
    passage_words_length = features['passage_length']

    # Each answer indicator gets a trailing axis and is repeated 50 times
    # along it, so it can be concatenated onto the passage embeddings below.
    # NOTE(review): presumably per-token float indicators -- confirm dtype.
    answer_start, answer_end = features['answer_start'], features['answer_end']
    answer_start = tf.concat([tf.expand_dims(answer_start, -1)] * 50, -1)
    answer_end = tf.concat([tf.expand_dims(answer_end, -1)] * 50, -1)

    with tf.variable_scope('passage_encoding'):
        passage_enc, (_, passage_bw_state) = biGRU(tf.concat(
            [passage_emb, answer_start, answer_end], -1),
                                                   passage_words_length,
                                                   params,
                                                   layers=params.layers)

    with tf.variable_scope('question_encoding'):
        question_enc, (_, question_bw_state) = biGRU(question_emb,
                                                     question_words_length,
                                                     params,
                                                     layers=params.layers)

    # output_enc = masked_concat(question_enc, passage_enc, question_words_length, passage_words_length)

    # A single tanh Dense layer (weights shared across layers) maps the
    # concatenated passage/question states to each decoder layer's initial
    # state.
    decoder_state_layer = Dense(params.units,
                                activation=tf.tanh,
                                use_bias=True,
                                name='decoder_state_init')
    decoder_init_state = tuple(
        decoder_state_layer(
            tf.concat([passage_bw_state[i], question_bw_state[i]], -1))
        for i in range(params.layers))

    # Two attention mechanisms: one over the question, one over the passage.
    question_att = BahdanauAttention(
        params.units,
        question_enc,
        memory_sequence_length=question_words_length)
    passage_att = BahdanauAttention(
        params.units, passage_enc, memory_sequence_length=passage_words_length)

    decoder_cell = AttentionWrapper(MultiRNNCell(
        [GRUCell(params.units) for _ in range(params.layers)]),
                                    [question_att, passage_att],
                                    initial_cell_state=decoder_init_state)

    batch_size = params.batch_size  # if mode != tf.estimator.ModeKeys.PREDICT else 1

    # Teacher forcing in TRAIN; greedy decoding in EVAL and PREDICT.
    if mode == tf.estimator.ModeKeys.TRAIN:
        answer_emb = tf.nn.embedding_lookup(embedding_encoder,
                                            features['answer_words'])
        helper = TrainingHelper(answer_emb, features['answer_length'])
    else:
        helper = GreedyEmbeddingHelper(
            embedding_encoder, tf.fill([batch_size], params.tgt_sos_id),
            params.tgt_eos_id)

    projection_layer = Dense(params.vocab_size, use_bias=False)

    decoder = SNetDecoder(decoder_cell,
                          helper,
                          decoder_cell.zero_state(batch_size, tf.float32),
                          output_layer=projection_layer,
                          params=params)

    outputs, _, outputs_length = dynamic_decode(
        decoder, maximum_iterations=params.answer_max_words)
    logits = outputs.rnn_output

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'answer': table.lookup(tf.cast(outputs.sample_id, tf.int64))
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }

        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    # logits = tf.Print(logits, [outputs.sample_id, labels], summarize=1000)

    # Trim the labels to the longest decoded sequence so they line up with
    # the logits' time axis.
    labels = tf.stop_gradient(labels[:, :tf.reduce_max(outputs_length)])

    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                              logits=logits)
    # Mask out steps past each decoded length; the summed loss is divided by
    # the configured batch size.
    target_weights = tf.sequence_mask(outputs_length, dtype=logits.dtype)
    loss = tf.reduce_sum(crossent * target_weights) / params.batch_size

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1)
        global_step = tf.train.get_or_create_global_step()

        # Clip gradients by global norm before applying them.
        grads = optimizer.compute_gradients(loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip)
        train_op = optimizer.apply_gradients(zip(capped_grads, variables),
                                             global_step=global_step)

        # NOTE(review): EstimatorSpec is used unqualified here and in EVAL,
        # but as tf.estimator.EstimatorSpec in PREDICT -- assumes it is also
        # imported directly; confirm.
        return EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op,
        )

    if mode == tf.estimator.ModeKeys.EVAL:
        return EstimatorSpec(mode,
                             loss=loss,
                             eval_metric_ops={
                                 'rouge-l':
                                 rouge_l(outputs.sample_id, labels,
                                         outputs_length,
                                         features['answer_length'], params,
                                         table),
                             })
Example #8
0
    def __init__(self,
                 inputs,
                 targets,
                 src_vocab_size,
                 src_max_length,
                 tgt_vocab_size,
                 tgt_max_length,
                 emb_dim,
                 num_units,
                 batch_size,
                 eos_token,
                 is_train,
                 share_embeddings=False,
                 teacher_forcing=False):
        """Build a bidirectional-LSTM encoder / attention-LSTM decoder graph.

        Sets self.preds (decoder sample ids), self.logits (logits padded on
        the time axis up to tgt_max_length) and self.mle_loss (masked
        cross-entropy summed over all positions, divided by batch_size).

        Args:
            inputs: source token ids; lengths are derived from the position
                of eos_token along the last axis (assumes [batch, time] and
                an EOS in every row -- TODO confirm).
            targets: target token ids, handled the same way as `inputs`.
            src_vocab_size: number of rows of the encoder embedding table.
            src_max_length: not used in this method.
            tgt_vocab_size: number of rows of the decoder embedding table and
                width of the output projection.
            tgt_max_length: maximum number of decoding steps; logits are
                padded to this length.
            emb_dim: embedding width.
            num_units: encoder LSTM size per direction; the decoder LSTM uses
                num_units * 2.
            batch_size: static batch size used for start tokens, the initial
                decoder state and loss normalisation.
            eos_token: id of the end-of-sequence token.
            is_train: not used in this method.
            share_embeddings: reuse the encoder table for the decoder;
                requires src_vocab_size == tgt_vocab_size.
            teacher_forcing: feed the ground-truth targets (TrainingHelper)
                instead of greedy decoding.
        """

        xavier = tf.contrib.layers.xavier_initializer
        # Token id 0 is used as the start-of-sequence token.
        start_tokens = tf.zeros([batch_size], dtype=tf.int32)
        # |x - eos| is 0 exactly at EOS positions, so argmin yields the index
        # of an EOS token, i.e. the length of the sequence without EOS.
        input_lengths = tf.argmin(tf.abs(inputs - eos_token), axis=-1, output_type=tf.int32)

        target_lengths = tf.argmin(tf.abs(targets - eos_token), axis=-1, output_type=tf.int32)

        input_embedding_table = tf.get_variable("encoder_embedding", [src_vocab_size, emb_dim], initializer=xavier(), dtype=tf.float32)
        input_embedding = tf.nn.embedding_lookup(input_embedding_table, inputs)
        # NOTE(review): input dropout (keep 0.8) is applied regardless of
        # is_train, and one cell object serves both directions -- confirm
        # both are intended.
        encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units, state_is_tuple=False)
        encoder_cell = tf.nn.rnn_cell.DropoutWrapper(cell=encoder_cell,
                                                     input_keep_prob=0.8,
                                                     output_keep_prob=1.0)

        #   encoder_output: (fw, bw) pair, each batch-major because
        #                   time_major=False
        #   encoder_state:  (fw, bw) pair of final states
        (encoder_output,
         encoder_state) = tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                                          cell_bw=encoder_cell,
                                                          inputs=input_embedding,
                                                          sequence_length=input_lengths,
                                                          dtype=tf.float32,
                                                          time_major=False)

        # Merge both directions: outputs on the feature axis, states on axis 1.
        encoder_output = tf.concat(encoder_output, axis=2)
        encoder_state = tf.concat([encoder_state[0], encoder_state[1]], axis=1)

        if share_embeddings:
            assert src_vocab_size == tgt_vocab_size
            target_embedding_table = input_embedding_table
        else:
            # Fixed: the decoder table must cover the *target* vocabulary; it
            # was sized with src_vocab_size, which breaks lookups of target
            # ids whenever the two vocabularies differ in size.
            target_embedding_table = tf.get_variable("decoder_embedding", [tgt_vocab_size, emb_dim], initializer=xavier(), dtype=tf.float32)

        # Decoder inputs are the targets shifted right by one start token.
        prefixed_targets = tf.concat([tf.expand_dims(start_tokens, 1), targets], axis=1)
        target_embedding = tf.nn.embedding_lookup(target_embedding_table, prefixed_targets)

        if teacher_forcing:
            helper = TrainingHelper(target_embedding,
                                    target_lengths + 1,
                                    time_major=False)
        else:
            helper = GreedyEmbeddingHelper(target_embedding_table, start_tokens, eos_token)

        # Twice the encoder width so the concatenated fw/bw encoder state can
        # be used as the decoder's initial cell state.
        decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units * 2, state_is_tuple=False)
        projection_layer = tf.layers.Dense(tgt_vocab_size, use_bias=False)

        attention_mechanism = BahdanauAttention(num_units,
                                                encoder_output,
                                                memory_sequence_length=input_lengths)

        decoder_cell = AttentionWrapper(decoder_cell,
                                        attention_mechanism,
                                        attention_layer_size=num_units)
        #decoder_cell = tf.nn.rnn_cell.DropoutWrapper(cell=decoder_cell,
        #                                             input_keep_prob=0.8,
        #                                             output_keep_prob=1.0)

        # Wrapper zero state with the cell state replaced by the encoder state.
        encoder_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)
        decoder = BasicDecoder(cell=decoder_cell,
                               helper=helper,
                               initial_state=encoder_state,
                               output_layer=projection_layer)

        decoder_outputs, states, lengths = dynamic_decode(decoder,
                                                          output_time_major=False,
                                                          impute_finished=True,
                                                          maximum_iterations=tgt_max_length)
        # Decoding may stop early; pad the time axis up to tgt_max_length so
        # the logits line up with the targets and the weight mask.
        unpadded_logits = decoder_outputs.rnn_output
        missing_elems = tgt_max_length - tf.shape(unpadded_logits)[1]
        padding = [[0, 0], [0, missing_elems], [0, 0]]
        logits = tf.pad(unpadded_logits, padding, 'CONSTANT', constant_values=0.)

        weights = tf.sequence_mask(target_lengths + 1, # the "+1" is to include EOS
                                   maxlen=tgt_max_length,
                                   dtype=tf.float32)
        #self.mle_loss = sequence_loss(targets=targets,
        #                              logits=logits,
        #                              weights=weights,
        #                              average_across_batch=True)

        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)
        mle_loss = (tf.reduce_sum(crossent * weights) / batch_size)
        preds = decoder_outputs.sample_id

        self.preds = preds
        self.logits = logits
        self.mle_loss = mle_loss
Example #9
0
            def __init__(self,
                         name,
                         input_reprs,
                         roll_direction=0,
                         activate=True,
                         is_translate=False,
                         word_in=None,
                         encoder_reprs=encoder.bi_reprs):
                """Build the decoder for one prediction module.

                With is_translate False, an attention decoder is run with a
                TrainingHelper and self.logits, self.state and self.loss are
                set. With is_translate True, greedy or beam-search decoding
                (per config.decode_mode) sets self.sample_ids and self.state,
                and no loss is built.

                Several names come from the enclosing scope rather than from
                the arguments: config, inputs, encoder, embeddings, n_classes,
                pretrained_embeddings_vi, words_tgt_in, words_tgt_out,
                size_sr and size_tgt.

                Args:
                    name: stored on self.name and used as the scope prefix.
                    input_reprs: used as the initial decoder cell state.
                    roll_direction: not used in this method.
                    activate: not used in this method.
                    is_translate: selects inference instead of training.
                    word_in: token ids embedded when is_translate is True.
                    encoder_reprs: attention memory; the default is evaluated
                        once, when this function is defined.
                """
                self.name = name
                with tf.variable_scope(name + '/predictions'):
                    #decoder_state = tf.layers.dense(input_reprs, config.projection_size, name='encoder_to_decoder')
                    decoder_state = input_reprs

                    with tf.variable_scope('word_embeddings_vi'):
                        word_embedding_matrix = tf.get_variable(
                            'word_embedding_matrix_vi',
                            initializer=pretrained_embeddings_vi)
                        if is_translate:
                            word_embeddings = tf.nn.embedding_lookup(
                                word_embedding_matrix, word_in)
                        else:
                            word_embeddings = tf.nn.embedding_lookup(
                                word_embedding_matrix, words_tgt_in)
                        word_embeddings = tf.nn.dropout(
                            word_embeddings, inputs.keep_prob)
                        # Scalar variable (initialised to 1.0) that scales
                        # the embeddings.
                        word_embeddings *= tf.get_variable('emb_scale',
                                                           initializer=1.0)

                    decoder_lstm = model_helpers.lstm_cell(
                        config.bidirectional_sizes[0], inputs.keep_prob,
                        config.projection_size)

                    decoder_output_layer = tf.layers.Dense(n_classes,
                                                           name='predict')

                    if not is_translate:
                        # Training: attention decoder fed with the target
                        # embeddings.
                        attention_mechanism = LuongAttention(
                            num_units=config.attention_units,
                            memory=encoder_reprs,
                            memory_sequence_length=size_sr,
                            scale=True)
                        attention_cell = AttentionWrapper(
                            decoder_lstm,
                            attention_mechanism,
                            attention_layer_size=config.attention_units)

                        # NOTE(review): the zero state is sized
                        # batch_size * config.beam_width although nothing is
                        # tiled in this branch -- confirm beam_width is 1
                        # when training.
                        batch_size = tf.shape(words_tgt_in)[0]
                        decoder_initial_state = attention_cell.zero_state(
                            dtype=tf.float32,
                            batch_size=batch_size * config.beam_width)
                        decoder_state = decoder_initial_state.clone(
                            cell_state=decoder_state)

                        helper = tf.contrib.seq2seq.TrainingHelper(
                            word_embeddings, size_tgt)

                        decoder = tf.contrib.seq2seq.BasicDecoder(
                            attention_cell, helper, decoder_state,
                            decoder_output_layer)

                        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                            decoder)
                        # swap_memory=True)

                        self.logits = outputs.rnn_output
                    else:
                        # Inference. NOTE(review): if config.decode_mode is
                        # neither 'greedy' nor 'beam', `decoder` is never
                        # bound and dynamic_decode below fails.
                        if config.decode_mode == 'greedy':
                            # Two start tokens: the batch size is hard-coded
                            # to 2. This branch decodes with the bare LSTM
                            # cell, without the attention wrapper.
                            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                                word_embedding_matrix,
                                [embeddings.START, embeddings.START],
                                embeddings.END)

                            decoder = tf.contrib.seq2seq.BasicDecoder(
                                decoder_lstm, helper, decoder_state,
                                decoder_output_layer)
                        elif config.decode_mode == 'beam':
                            # Replicate the memory, the initial state and the
                            # source lengths beam_width times.
                            encoder_reprs = tf.contrib.seq2seq.tile_batch(
                                encoder_reprs, multiplier=config.beam_width)
                            decoder_state = tf.contrib.seq2seq.tile_batch(
                                decoder_state, multiplier=config.beam_width)
                            size_src = tf.contrib.seq2seq.tile_batch(
                                size_sr, multiplier=config.beam_width)

                            attention_mechanism = LuongAttention(
                                num_units=config.attention_units,
                                memory=encoder_reprs,
                                memory_sequence_length=size_src,
                                scale=True)
                            attention_cell = AttentionWrapper(
                                decoder_lstm,
                                attention_mechanism,
                                attention_layer_size=config.attention_units)

                            # Batch size is hard-coded to 2, matching the two
                            # start tokens passed below.
                            batch_size = 2
                            decoder_initial_state = attention_cell.zero_state(
                                dtype=tf.float32,
                                batch_size=batch_size * config.beam_width)
                            decoder_state = decoder_initial_state.clone(
                                cell_state=decoder_state)

                            #decoder_state = tf.contrib.seq2seq.tile_batch(
                            #  decoder_state, multiplier=config.beam_width)

                            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                                cell=attention_cell,
                                embedding=word_embedding_matrix,
                                start_tokens=[
                                    embeddings.START, embeddings.START
                                ],
                                end_token=embeddings.END,
                                initial_state=decoder_state,
                                beam_width=config.beam_width,
                                output_layer=decoder_output_layer)

                        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                            decoder,
                            maximum_iterations=config.max_translate_length)
                        #swap_memory=True)

                        if config.decode_mode == 'greedy':
                            self.sample_ids = outputs.sample_id
                        elif config.decode_mode == 'beam':
                            self.sample_ids = outputs.predicted_ids
                    # The string literal below is an unused expression
                    # statement holding an older dynamic_rnn implementation.
                    '''
          outputs, state = tf.nn.dynamic_rnn(
            model_helpers.lstm_cell(config.bidirectional_sizes[0], inputs.keep_prob,
                                    config.projection_size),
            word_embeddings,
            initial_state=decoder_state,
            dtype=tf.float32,
            sequence_length=size_tgt,
            scope='predictlstm'
          )
          '''

                    self.state = state

                    #self.logits = tf.layers.dense(outputs, n_classes, name='predict')
                    #self.logits = tf.layers.dense(outputs.rnn_output, n_classes, name='predict')

                # No loss is built for translation (inference).
                if is_translate:
                    return

                # Label smoothing: scale the targets by (1 - smoothing) and
                # add smoothing / n_classes to every entry.
                targets = words_tgt_out
                targets *= (1 - inputs.label_smoothing)
                targets += inputs.label_smoothing / n_classes
                self.loss = model_helpers.masked_ce_loss(
                    self.logits, targets, inputs.mask)
    def add_multilayer_rnn_op(self):
        """
        Adds logits to self

        Stacks self.num_layers bidirectional RNN layers (LSTM, GRU or basic
        RNN, chosen by self.rnn_unit) over self.input_feature_embeddings,
        optionally runs an attention decoder over the result when
        self.is_attention is set, and projects the dropout-regularised
        output to self.num_class logits per time step.

        Raises:
            ValueError: if self.rnn_unit is not one of 'lstm', 'gru', 'rnn'
                (raised while building the first layer).
        """

        def new_cell():
            # One fresh cell of the configured type.
            unit = self.rnn_unit
            if unit == 'lstm':
                return rnn.LSTMCell(self.hidden_dim,
                                    forget_bias=1.,
                                    state_is_tuple=True)
            if unit == 'gru':
                return rnn.GRUCell(self.hidden_dim)
            if unit == 'rnn':
                return rnn.BasicRNNCell(self.hidden_dim)
            raise ValueError('rnn_unit must in (lstm, gru, rnn)!')

        def batch_dim():
            # Dynamic batch size, read from the embedded inputs.
            return tf.shape(self.input_feature_embeddings)[0]

        with tf.variable_scope("bi-lstm"):
            layer_input = self.input_feature_embeddings
            for _ in range(self.num_layers):
                with tf.variable_scope(None, default_name="bidirectional-rnn"):
                    forward_cell = new_cell()
                    backward_cell = new_cell()

                    forward_init = forward_cell.zero_state(batch_dim(),
                                                           dtype=tf.float32)
                    backward_init = backward_cell.zero_state(batch_dim(),
                                                             dtype=tf.float32)
                    both_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                        forward_cell,
                        backward_cell,
                        layer_input,
                        sequence_length=self.sequence_lengths,
                        initial_state_fw=forward_init,
                        initial_state_bw=backward_init,
                        dtype=tf.float32)
                    # Forward and backward outputs are joined on the feature
                    # axis and feed the next layer.
                    layer_input = tf.concat(both_outputs, 2)
            self.output = tf.nn.dropout(layer_input, self.dropout_pl)

        if self.is_attention:
            with tf.variable_scope('attention'):
                attn_width = self.hidden_dim * 2
                mechanism = BahdanauAttention(attn_width, layer_input,
                                              self.sequence_lengths)
                wrapped_cell = AttentionWrapper(
                    rnn.LSTMCell(self.hidden_dim, state_is_tuple=True),
                    mechanism,
                    attn_width)
                start_state = wrapped_cell.zero_state(batch_dim(),
                                                      dtype=tf.float32)
                helper = TrainingHelper(inputs=layer_input,
                                        sequence_length=self.sequence_lengths)
                decoder = BasicDecoder(cell=wrapped_cell,
                                       helper=helper,
                                       initial_state=start_state)
                decoded, _, _ = dynamic_decode(decoder)

            # The attention decoder's output replaces the bi-RNN output.
            self.output = tf.nn.dropout(decoded.rnn_output,
                                        self.dropout_pl)

        with tf.variable_scope("proj"):
            feature_dim = 2 * self.hidden_dim
            W = tf.get_variable("W",
                                shape=[feature_dim, self.num_class],
                                dtype=tf.float32)

            b = tf.get_variable("b",
                                shape=[self.num_class],
                                dtype=tf.float32,
                                initializer=tf.zeros_initializer())

            # Flatten to 2-D for the matmul, then restore the time axis.
            out_shape = tf.shape(self.output)
            flat = tf.reshape(self.output, [-1, feature_dim])
            scores = tf.matmul(flat, W) + b
            self.logits = tf.reshape(scores, [-1, out_shape[1], self.num_class])
Example #11
0
                                           dtype='float',
                                           sequence_length=x_seq_length)
# Encoder results: `outputs` is used as-is as the attention memory and the
# first entry of `output_states` as the final encoder vector.
xx_context = outputs  # tf.concat(outputs, 2)   # [None, DL, 2*hd]
xx_final = output_states[0]  # tf.concat(output_states, 1)  # [None, 2*hd]
x_mask = tf.cast(x_mask, "float")
# Mean of the encoder outputs over axis 1; used below as the initial
# attention vector of the decoder.
first_attention = tf.reduce_mean(xx_context, 1)  # [None, 2*hd]
# decode
# Projection applied to every decoder output (n_classes units, with bias).
output_l = layers_core.Dense(n_classes, use_bias=True)
# The same tensor fills both slots of the LSTM state tuple.
encoder_state = rnn.LSTMStateTuple(xx_final, xx_final)
attention_mechanism = BahdanauAttention(hidden_size,
                                        memory=xx_context,
                                        memory_sequence_length=x_seq_length)

lstm = rnn.LayerNormBasicLSTMCell(hidden_size, dropout_keep_prob=keep_prob)
# output_attention=False: the wrapper emits the cell output rather than the
# attention vector.
cell = AttentionWrapper(lstm, attention_mechanism, output_attention=False)
# Start from a zero state, then overwrite the cell state and the attention
# with the values derived from the encoder.
cell_state = cell.zero_state(dtype=tf.float32, batch_size=train_batch_size)
cell_state = cell_state.clone(cell_state=encoder_state,
                              attention=first_attention)
# Teacher forcing: the helper reads the decoder inputs `yy` with lengths
# `y_seq_length`.
train_helper = TrainingHelper(yy, y_seq_length)
train_decoder = BasicDecoder(cell,
                             train_helper,
                             cell_state,
                             output_layer=output_l)
decoder_outputs_train, decoder_state_train, decoder_seq_train = dynamic_decode(
    train_decoder, impute_finished=True)

# infer_decoder/beam_search
# Repeat the encoder-side tensors `beam_width` times for beam search.
tiled_inputs = tile_batch(xx_context, multiplier=beam_width)
tiled_sequence_length = tile_batch(x_seq_length, multiplier=beam_width)
tiled_first_attention = tile_batch(first_attention, multiplier=beam_width)
attention_mechanism = BahdanauAttention(
Example #12
0
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """Build the attention-wrapped decoder cell and its initial state.

        Args:
            encoder_outputs: encoder outputs used as the attention memory;
                transposed with (1, 0, 2) first when ``self.time_major`` is
                set.
            encoder_state: encoder state that is cloned into the decoder's
                initial state as ``cell_state``.

        Returns:
            A ``(attention_cell, decoder_initial_state)`` tuple.
        """
        memory = encoder_outputs
        initial_cell_state = encoder_state
        memory_lengths = self.encoder_inputs_length
        decoder_batch_size = self.batch_size

        # For a bidirectional encoder keep only the last ``depth`` entries.
        if self.bidirectional:
            initial_cell_state = initial_cell_state[-self.depth:]

        if self.time_major:
            memory = tf.transpose(memory, (1, 0, 2))

        if self.use_beamsearch_decode:
            # Beam search repeats every encoder-side tensor beam_width times,
            # so the decoder batch grows by the same factor.
            memory = seq2seq.tile_batch(memory, multiplier=self.beam_width)
            initial_cell_state = seq2seq.tile_batch(
                initial_cell_state, multiplier=self.beam_width)
            memory_lengths = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)
            decoder_batch_size = decoder_batch_size * self.beam_width

        # Both mechanisms take num_units plus the encoder outputs at
        # construction time; anything other than 'luong' selects Bahdanau.
        if self.attention_type.lower() == 'luong':
            mechanism_cls = LuongAttention
        else:
            mechanism_cls = BahdanauAttention
        self.attention_mechanism = mechanism_cls(
            num_units=self.hidden_units,
            memory=memory,
            memory_sequence_length=memory_lengths)

        stacked_cells = []
        for _ in range(self.depth):
            stacked_cells.append(
                self.build_signle_cell(self.hidden_units,
                                       use_residual=self.use_residual))
        cell = MultiRNNCell(stacked_cells)

        # Record the alignment history only outside training and only when
        # beam search is off.
        alignment_history = not (self.mode == 'train'
                                 or self.use_beamsearch_decode)

        def cell_input_fn(inputs, attention):
            """Join inputs and attention; project them when residuals are on."""
            if self.use_residual:
                projection = layers.Dense(self.hidden_units,
                                          dtype=tf.float32,
                                          use_bias=False,
                                          name='attention_cell_input_fn')
                return projection(array_ops.concat([inputs, attention], -1))
            return array_ops.concat([inputs, attention], -1)

        attention_cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,
            attention_layer_size=self.hidden_units,
            alignment_history=alignment_history,
            cell_input_fn=cell_input_fn,
            name='AttentionWrapper')

        # Start from an all-zero wrapper state, then seed its cell state with
        # the encoder state.
        zero_state = attention_cell.zero_state(decoder_batch_size, tf.float32)
        decoder_initial_state = zero_state.clone(
            cell_state=initial_cell_state)
        return attention_cell, decoder_initial_state
Example #13
0
                                     h=encoder_final_state_h)

# Concatenate the forward and backward encoder outputs on the last axis, then
# swap the first two axes with the [1, 0, 2] transpose.
# Original note - Shape: (batch_size, time_step, hidden_units)
encoder_outputs = tf.transpose(
    tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2), [1, 0, 2])

# The decoder cell is twice as wide as `hidden_units`.
decoder_cell = LSTMCell(hidden_units * 2)

attention_mechanism = BahdanauAttention(attention_units, encoder_outputs)
attention_cell = AttentionWrapper(decoder_cell, attention_mechanism)

# CopyNetWrapper is defined outside this snippet; it receives the encoder
# outputs, the input ids and both vocabulary sizes.
copynet_cell = CopyNetWrapper(attention_cell, encoder_outputs, input_ids,
                              vocab_size, gen_vocab_size)

# Zero state of the CopyNet cell whose inner cell state is replaced by the
# zero state of the attention cell.
decoder_initial_state = copynet_cell.zero_state(
    batch_size, tf.float32).clone(cell_state=attention_cell.zero_state(
        batch_size=batch_size, dtype=tf.float32))

# Teacher forcing over time-major target embeddings.
helper = tf.contrib.seq2seq.TrainingHelper(targets_embedded,
                                           targets_lengths,
                                           time_major=True)
#helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, tf.ones([batch_size], dtype=tf.int32), 0)

# No output layer: the cell outputs are used directly.
decoder = tf.contrib.seq2seq.BasicDecoder(copynet_cell,
                                          helper,
                                          decoder_initial_state,
                                          output_layer=None)
decoder_outputs, final_state, coder_seq_length = tf.contrib.seq2seq.dynamic_decode(
    decoder=decoder)
# The decoder output unpacks into two parts, named logits and ids here.
decoder_logits, decoder_ids = decoder_outputs

#labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)
Example #14
0
    def __graph__(self):
        """Assemble the seq2seq graph: encoder, attention decoder, then a head.

        In 'train' mode this sets ``self.decoder_outputs``, ``self.loss``,
        ``self.summary_op`` and ``self.train_op``; in 'decode' mode it sets
        ``self.decoder_predict_decode``.
        """
        # Encoder.
        encoder_outputs, encoder_state = self.encoder()

        # Decoder.
        with tf.variable_scope('decoder'):
            memory_lengths = self.encoder_inputs_length
            if self.beam_search:
                # Beam search needs every encoder-side tensor repeated
                # beam_size times.
                print("use beamsearch decoding..")

                def _tile_state(state_part):
                    return tf.contrib.seq2seq.tile_batch(
                        state_part, self.beam_size)

                encoder_outputs = tile_batch(encoder_outputs,
                                             multiplier=self.beam_size)
                # The state is a nested structure, so each leaf is tiled
                # separately.
                encoder_state = nest.map_structure(_tile_state,
                                                   encoder_state)
                memory_lengths = tile_batch(memory_lengths,
                                            multiplier=self.beam_size)

            # Bahdanau attention over the encoder outputs; the sequence
            # lengths are passed as memory_sequence_length.
            attention = BahdanauAttention(
                num_units=self.rnn_size,
                memory=encoder_outputs,
                memory_sequence_length=memory_lengths)

            # Plain decoder cell, then the same cell wrapped with attention.
            base_cell = self.create_rnn_cell()
            attn_cell = AttentionWrapper(
                cell=base_cell,
                attention_mechanism=attention,
                attention_layer_size=self.rnn_size,  # TODO: which dimension?
                name='Attention_Wrapper')

            # With beam search the effective batch is batch_size * beam_size.
            if self.beam_search:
                decoder_batch = self.batch_size * self.beam_size
            else:
                decoder_batch = self.batch_size

            # All-zero wrapper state whose cell state is then replaced by the
            # encoder state.
            zero_state = attn_cell.zero_state(batch_size=decoder_batch,
                                              dtype=tf.float32)
            initial_state = zero_state.clone(cell_state=encoder_state)

            # Dense projection to vocab_size logits, initialised from a
            # truncated normal (mean 0.0, standard deviation 0.1).
            kernel_init = tf.truncated_normal_initializer(mean=0.0,
                                                          stddev=0.1)
            projection = tf.layers.Dense(self.vocab_size,
                                         kernel_initializer=kernel_init)

            if self.mode == 'train':
                # Training decoder and its sequence loss, weighted by
                # self.mask.
                self.decoder_outputs = self.decoder_train(
                    attn_cell, initial_state, projection)
                self.loss = sequence_loss(logits=self.decoder_outputs,
                                          targets=self.decoder_targets,
                                          weights=self.mask)

                # Export the loss as a scalar summary and merge all summaries
                # into a single op.
                tf.summary.scalar('loss', self.loss)
                self.summary_op = tf.summary.merge_all()

                # Adam over all trainable variables, with the gradients
                # clipped by global norm before they are applied.
                adam = tf.train.AdamOptimizer(self.learing_rate)
                params = tf.trainable_variables()
                grads = tf.gradients(self.loss, params)
                clipped, _ = tf.clip_by_global_norm(grads,
                                                    self.max_gradient_norm)
                self.train_op = adam.apply_gradients(zip(clipped, params))
            elif self.mode == 'decode':
                # Inference decoder.
                self.decoder_predict_decode = self.decoder_decode(
                    attn_cell, initial_state, projection)
Example #15
0
    def __init__(self,
                 vocab_size,
                 learning_rate,
                 encoder_size,
                 max_length,
                 embedding_size,
                 sos_token,
                 eos_token,
                 unk_token,
                 beam_size=5):
        """Build the graph: placeholders, two encoders and three decoders.

        The training, beam-search and greedy decoders share one attention
        cell whose memory and initial state are tiled ``beam_size`` times, so
        every decoder input has to carry ``batch_size * beam_size`` rows.

        Args:
            vocab_size: number of rows of the embedding matrix and number of
                units of the output projection.
            learning_rate: learning rate handed to the Adam optimizer.
            encoder_size: size passed to both ``gru_encoder`` components; the
                decoder GRU uses twice this size.
            max_length: maximum number of decoding steps at inference.
            embedding_size: number of columns of the embedding matrix.
            sos_token: token id used as start token at inference.
            eos_token: token id used as end token at inference.
            unk_token: stored as ``self.UNK_token``; not used while the graph
                is built.
            beam_size: beam width, also the tiling multiplier.
        """
        self.vocab_size = vocab_size
        self.lr = learning_rate
        self.encoder_size = encoder_size
        self.max_length = max_length
        self.embedding_size = embedding_size
        self.SOS_token = sos_token
        self.EOS_token = eos_token
        self.UNK_token = unk_token
        self.beam_search_size = beam_size
        with tf.variable_scope('placeholder_and_embedding'):
            self.query = tf.placeholder(shape=(None, None), dtype=tf.int32)
            self.query_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
            self.reply = tf.placeholder(shape=(None, None), dtype=tf.int32)
            self.reply_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
            self.decoder_inputs = tf.placeholder(shape=(None, None),
                                                 dtype=tf.int32)
            self.decoder_target = tf.placeholder(shape=(None, None),
                                                 dtype=tf.int32)
            self.decoder_length = tf.placeholder(shape=(None, ),
                                                 dtype=tf.int32)
            # The batch size is fed explicitly as a scalar.
            self.batch_size = tf.placeholder(shape=(), dtype=tf.int32)
            # Running self.init_embedding copies the fed matrix into the
            # trainable embedding variable.
            self.embedding_pl = tf.placeholder(dtype=tf.float32,
                                               shape=(self.vocab_size,
                                                      embedding_size),
                                               name='embedding_source_pl')
            word_embedding = tf.get_variable(name='word_embedding',
                                             shape=(self.vocab_size,
                                                    embedding_size),
                                             dtype=tf.float32,
                                             trainable=True)
            self.init_embedding = word_embedding.assign(self.embedding_pl)
            # Loss mask: 1.0 up to each decoder length, padded to the longest
            # length in the batch.
            self.max_target_sequence_length = tf.reduce_max(
                self.decoder_length, name='max_target_len')
            self.mask = tf.sequence_mask(self.decoder_length,
                                         self.max_target_sequence_length,
                                         dtype=tf.float32,
                                         name='masks')

        with tf.variable_scope("query_encoder"):
            self.query_encoder = deep_components.gru_encoder(
                word_embedding, self.encoder_size)
            query_out, query_state = self.query_encoder(
                seq_index=self.query, seq_len=self.query_length)
        with tf.variable_scope("reply_encoder"):
            self.reply_encoder = deep_components.gru_encoder(
                word_embedding, self.encoder_size)
            reply_out, reply_state = self.reply_encoder(
                seq_index=self.reply, seq_len=self.reply_length)
        with tf.variable_scope("decoder"):
            # Both encoder states joined on axis 1 seed the decoder; only the
            # query outputs serve as attention memory.
            combined_encoder_state = tf.concat([query_state, reply_state],
                                               axis=1)
            tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
                combined_encoder_state, multiplier=self.beam_search_size)
            tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
                query_out, multiplier=self.beam_search_size)
            tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
                self.query_length, multiplier=self.beam_search_size)
            decoder_gru = GRUCell(self.encoder_size * 2)
            attention_mechanism = BahdanauAttention(
                num_units=self.encoder_size,
                memory=tiled_encoder_outputs,
                memory_sequence_length=tiled_sequence_length)
            attention_cell = AttentionWrapper(
                decoder_gru,
                attention_mechanism,
                attention_layer_size=self.encoder_size)
            # Zero state for batch_size * beam_size rows, with the cell state
            # replaced by the tiled encoder state.
            decoder_initial_state_beam = attention_cell.zero_state(
                dtype=tf.float32,
                batch_size=tf.cast(self.batch_size * self.beam_search_size,
                                   dtype=tf.int32)).clone(
                                       cell_state=tiled_encoder_final_state)
            #############################
            #attention_cell=decoder_gru
            #decoder_initial_state_beam = tiled_encoder_final_state
            ##############################
            decode_out_layer = tf.layers.Dense(self.vocab_size,
                                               name='output_layer',
                                               _reuse=tf.AUTO_REUSE)
        with tf.variable_scope("seq2seq-train"):
            # train
            # Training inputs are tiled as well so that they match the tiled
            # attention memory and initial state.
            self.tiled_d_in = tile_batch(self.decoder_inputs,
                                         multiplier=self.beam_search_size)
            self.tiled_d_tgt = tile_batch(self.decoder_target,
                                          multiplier=self.beam_search_size)
            train_helper = TrainingHelper(
                tf.contrib.seq2seq.tile_batch(
                    tf.nn.embedding_lookup(word_embedding,
                                           self.decoder_inputs),
                    multiplier=self.beam_search_size),
                sequence_length=tile_batch(self.decoder_length,
                                           multiplier=self.beam_search_size),
                name="train_helper")
            train_decoder = BasicDecoder(
                attention_cell,
                train_helper,
                initial_state=decoder_initial_state_beam,
                output_layer=decode_out_layer)
            self.dec_output, _, self.gen_len = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)
            #self.gen_max_len=tf.reduce_max(self.gen_len)
            #self.padding=tf.zeros(shape=(self.batch_size,self.max_length-self.gen_max_len,self.vocab_size),dtype=tf.float32)
            #self.padding=tile_batch(self.padding,multiplier=self.beam_search_size)
            self.dec_logits = tf.identity(self.dec_output.rnn_output)
            #self.dec_logits = tf.concat((self.dec_logits,self.padding),axis=1)
            self.decoder_target_mask = tile_batch(
                self.mask, multiplier=self.beam_search_size)
            self.cost = sequence_loss(
                self.dec_logits,
                tile_batch(self.decoder_target,
                           multiplier=self.beam_search_size),
                self.decoder_target_mask)
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(self.cost)
        with tf.variable_scope("seq2seq_beam_search_generate"):
            # The beam-search decoder takes one start token per original
            # batch entry together with the tiled initial state.
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * self.SOS_token
            beam_infer_decoder = BeamSearchDecoder(
                attention_cell,
                embedding=word_embedding,
                end_token=self.EOS_token,
                start_tokens=start_tokens,
                initial_state=decoder_initial_state_beam,
                beam_width=self.beam_search_size,
                output_layer=decode_out_layer)
            self.bs_outputs, _, _ = dynamic_decode(
                beam_infer_decoder, maximum_iterations=self.max_length)
        with tf.variable_scope("greedy_generate"):
            # The attention memory and the initial state hold
            # batch_size * beam_size rows, so the start tokens are tiled to
            # the same size; untiled tokens give the helper a different batch
            # size than the cell state whenever beam_size > 1. As a result
            # self.greedy_outputs has batch_size * beam_size rows, and each
            # group of beam_size rows starts from the same inputs.
            greedy_start_tokens = tile_batch(
                start_tokens, multiplier=self.beam_search_size)
            decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=word_embedding,
                start_tokens=greedy_start_tokens,
                end_token=self.EOS_token)
            inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=attention_cell,
                helper=decoding_helper,
                initial_state=decoder_initial_state_beam,
                output_layer=decode_out_layer)
            self.greedy_outputs, _, _ = dynamic_decode(
                inference_decoder, maximum_iterations=self.max_length)
Example #16
0
	def __init__(self,n_session, pretrainedEmbeddings=None):
		"""Build the CopyNet seq2seq training graph and initialise the session.

		Resets the default graph, creates a session, builds the placeholders,
		a bidirectional LSTM encoder, an attention + CopyNet decoder, the loss
		and the training op, makes sure the checkpoint directory exists and
		runs the global variable initializer.

		Args:
			n_session: identifier stored as ``self.n_sess``; also the name of
				the checkpoint sub-directory under ``ckpt/``.
			pretrainedEmbeddings: optional initial value for the embedding
				matrix. ``None`` or an empty list/tuple selects a random
				uniform initialisation in [-0.01, 0.01].
		"""
		tf.reset_default_graph()
		self.n_sess = n_session
		self.sess = tf.Session()#config=CONFIG_TF)
		self.learning_rate = tf.placeholder(tf.float32)

		hidden_units = config['HIDDEN_UNITS']
		attention_units = config['ATTENTION_UNITS']
		vocab_size = config['VOCAB_SIZE']
		gen_vocab_size = config['GEN_VOCAB_SIZE']
		embed_size = config['EMBED_SIZE']

		self.paragraphs = tf.placeholder(shape=(None, None), dtype=tf.float32, name='paragraphs')
		self.ans_locs = tf.placeholder(shape=(None, None), dtype=tf.float32, name='ans_locs')
		self.encoder_inputs_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_lengths')
		self.targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='targets')
		self.targets_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='targets_lengths')

		paragraphs = self.paragraphs
		ans_locs = self.ans_locs
		encoder_inputs_lengths = self.encoder_inputs_lengths
		targets = self.targets
		targets_lengths = self.targets_lengths

		# The paragraphs are fed as floats and cast to token ids here.
		input_ids = tf.cast(paragraphs, tf.int32)

		batch_size, max_time = tf.unstack(tf.shape(paragraphs))

		# Load pretrained embeddings if any. The identity / isinstance test
		# avoids comparing an array with ``[]``, which is element-wise or
		# ambiguous for numpy inputs.
		no_pretrained = pretrainedEmbeddings is None or (
			isinstance(pretrainedEmbeddings, (list, tuple))
			and len(pretrainedEmbeddings) == 0)
		if no_pretrained:
			embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size], -0.01, 0.01), dtype=tf.float32)
		else:
			embeddings = tf.Variable(pretrainedEmbeddings, dtype=tf.float32)

		# The encoder and the training helper run time-major, hence the
		# [1,0] transposes before the embedding lookups.
		paragraphs_embedded = tf.nn.embedding_lookup(embeddings, tf.transpose(tf.cast(paragraphs, tf.int32), [1,0]))

		# Decoder inputs are the targets with a start token (id 1) in front.
		start_tokens = tf.ones([batch_size], dtype=tf.int32)
		decoder_inputs = tf.concat([tf.expand_dims(start_tokens, 1), targets], 1)
		decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, tf.transpose(decoder_inputs, [1,0]))

		# Append the answer-location value to every token embedding as one
		# extra feature.
		encoder_inputs = tf.concat([paragraphs_embedded, tf.expand_dims(tf.cast(tf.transpose(ans_locs, [1,0]), tf.float32), axis=2)],axis=2)

		encoder_cell_fw = LSTMCell(hidden_units)
		encoder_cell_bw = LSTMCell(hidden_units)

		rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
			cell_fw=encoder_cell_fw,
			cell_bw=encoder_cell_bw,
			inputs=encoder_inputs,
			sequence_length=encoder_inputs_lengths,
			dtype=tf.float32, time_major=True)
		encoder_fw_outputs, encoder_bw_outputs = rnn_outputs
		encoder_fw_final_state, encoder_bw_final_state = rnn_states

		encoder_final_state_c = tf.concat((encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

		encoder_final_state_h = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

		# NOTE(review): this state is built but never passed to the decoder,
		# which starts from a zero state below - confirm that is intended.
		encoder_final_state = LSTMStateTuple(
				c=encoder_final_state_c,
				h=encoder_final_state_h
		)

		# Forward and backward outputs joined on the feature axis, then
		# transposed to batch-major.
		# Shape: (batch_size, time_step, 2 * hidden_units)
		encoder_outputs = tf.transpose(tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2), [1,0,2])

		decoder_cell = LSTMCell(hidden_units*2)

		attention_mechanism = BahdanauAttention(attention_units, encoder_outputs)
		attention_cell = AttentionWrapper(decoder_cell, attention_mechanism)

		copynet_cell = CopyNetWrapper(attention_cell, encoder_outputs, input_ids, vocab_size, gen_vocab_size)

		decoder_initial_state = copynet_cell.zero_state(batch_size, tf.float32).clone(cell_state=attention_cell.zero_state(batch_size=batch_size, dtype=tf.float32))

		helper = tf.contrib.seq2seq.TrainingHelper(decoder_inputs_embedded, targets_lengths, time_major=True)
		#helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, tf.ones([batch_size], dtype=tf.int32), 0)

		decoder = tf.contrib.seq2seq.BasicDecoder(copynet_cell, helper, decoder_initial_state, output_layer=None)
		decoder_outputs, final_state, coder_seq_length = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)
		decoder_logits, decoder_ids = decoder_outputs

		#LOSS
		# Labels and logits are both brought to time-major before the
		# per-step cross entropy.
		decoder_targets = tf.transpose(targets, [1,0])
		labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)

		decoder_logits_ = tf.transpose(decoder_logits,[1,0,2])

		stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
			labels=labels,
			logits=decoder_logits_
		)

		# Disabled alternative that derives the token count from the EOS position:
		# eos = tf.constant(config['EOS'], dtype=tf.int32)
		# where_eos_targ = tf.cast(tf.equal(tf.cast(decoder_targets, dtype=tf.int32), eos), tf.float32)
		# n_tokens = tf.cast(tf.argmax(where_eos_targ, axis=0), tf.float32)

		targets_max_len, _ = tf.unstack(tf.shape(decoder_targets))

		# Sum over time divided by the padded target length, then averaged
		# over the batch.
		self.loss = tf.reduce_sum(stepwise_cross_entropy, axis=0) / tf.cast(targets_max_len, tf.float32)
		self.loss = tf.reduce_sum(self.loss) / tf.cast(batch_size, tf.float32)
		#self.loss	= tf.Print(self.loss,[tf.nn.softmax(decoder_logits),labels], summarize=100)

		# Adagrad with the gradients clipped to a global norm of 5.0.
		optimizer = tf.train.AdagradOptimizer(self.learning_rate)#tf.train.GradientDescent
		gradients, variables = zip(*optimizer.compute_gradients(self.loss))
		gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
		self.train_op = optimizer.apply_gradients(zip(gradients, variables))#.minimize(self.loss)

		self.saver = tf.train.Saver(max_to_keep=None)
		# os.makedirs also creates the parent "ckpt" directory and avoids
		# handing a concatenated string to a shell.
		ckpt_dir = "ckpt/"+str(self.n_sess)
		if not os.path.exists(ckpt_dir):
			os.makedirs(ckpt_dir)

		self.sess.run(tf.global_variables_initializer())
Example #17
0
    def build_graph(self):
        """Build the TensorFlow graph for the mode selected in ``self.options``.

        Creates ``self.graph`` and, inside it, the input placeholders, the
        embedding lookups, a GRU encoder and an attention-wrapped GRU decoder.
        With ``opts.mode == 'TRAIN'`` it adds ``self.loss``,
        ``self.loss_batch``, ``self.optimizer`` and ``self.init``; with
        ``opts.mode == 'PREDICT'`` it adds a beam-search decoder and
        ``self.predicted_ids``.
        """
        # build_graph-train vs validate-train
        print('Building the TensorFlow graph...')
        opts = self.options

        self.graph = tf.Graph()
        with self.graph.as_default():
            # Token-id inputs; every shape is fixed by the options.
            self.enc_input = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len_enc])
            self.dec_input = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])
            self.target = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])

            self.enc_input_len = tf.placeholder(tf.int32,
                                                shape=[opts.batch_size])
            self.dec_input_len = tf.placeholder(tf.int32,
                                                shape=[opts.batch_size])

            # Per-word lookup tables with one row per corpus entry: a
            # 3-column VAD table, a term-frequency column and a VAD loss
            # weight column.
            self.VAD = tf.placeholder(tf.float32, shape=[opts.corpus_size, 3])
            self.termfreq = tf.placeholder(tf.float32,
                                           shape=[opts.corpus_size, 1])
            self.VAD_loss = tf.placeholder(tf.float32,
                                           shape=[opts.corpus_size, 1])

            with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
                # how to get input_embed for encoder and decoder
                word_embeddings = tf.Variable(tf.random_uniform(
                    [opts.corpus_size, opts.word_embed_size], -1.0, 1.0),
                                              name='embedding')
                #                 word_embeddings = tf.constant(opts.word_embeddings, name = 'word_embeddings')

                enc_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.enc_input)
                dec_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.dec_input)

                # The same token ids index the VAD, term-frequency and VAD
                # loss tables.
                enc_input_VAD = tf.nn.embedding_lookup(self.VAD,
                                                       self.enc_input)
                target_VAD = tf.nn.embedding_lookup(self.VAD, self.target)

                enc_input_tf = tf.nn.embedding_lookup(self.termfreq,
                                                      self.enc_input)
                target_tf = tf.nn.embedding_lookup(self.termfreq, self.target)

                target_VAD_loss = tf.nn.embedding_lookup(
                    self.VAD_loss, self.target)
                # Drop the size-1 axes left by the single-column lookup.
                target_VAD_loss = tf.squeeze(target_VAD_loss)

            with tf.variable_scope('encoding', reuse=tf.AUTO_REUSE):
                cell_enc = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc)
                # bi-directional?
                # Only the per-step outputs are kept; the final state is
                # discarded.
                enc_outputs, _ = tf.nn.dynamic_rnn(
                    cell_enc,
                    enc_input_embed,
                    sequence_length=self.enc_input_len,
                    dtype=tf.float32)

            # Beam search needs every encoder-side tensor repeated
            # beam_width times.
            if opts.mode == 'PREDICT':
                enc_outputs = tile_batch(enc_outputs,
                                         multiplier=opts.beam_width)
                enc_input_embed = tile_batch(enc_input_embed,
                                             multiplier=opts.beam_width)
                enc_input_VAD = tile_batch(enc_input_VAD,
                                           multiplier=opts.beam_width)
                enc_input_tf = tile_batch(enc_input_tf,
                                          multiplier=opts.beam_width)
                tiled_enc_input_len = tile_batch(self.enc_input_len,
                                                 multiplier=opts.beam_width)
            else:
                tiled_enc_input_len = self.enc_input_len

#             with tf.variable_scope('attention', reuse = tf.AUTO_REUSE) as attention_layer:
#                 attention_Wb = tf.layers.Dense(units=3,
#                                              use_bias=False,
#                                              kernel_initializer = tf.truncated_normal_initializer(stddev = 0.1),
#                                              name='attention_Wb')

            with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs:
                # attn_mechanism: alpha_<t,t'>
                # MyBahdanauAttention is defined outside this method; besides
                # the memory it receives the encoder embeddings, VAD values
                # and term frequencies.
                attn_mechanism = MyBahdanauAttention(
                    num_units=opts.attn_depth,
                    memory=enc_outputs,
                    memory_sequence_length=tiled_enc_input_len,
                    enc_input_embed=enc_input_embed,
                    enc_input_VAD=enc_input_VAD,
                    enc_input_tf=enc_input_tf,
                    VAD_mode=opts.VAD_mode)
                cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec)
                # AttentionWrapper: c?
                cell_dec = AttentionWrapper(cell_dec,
                                            attn_mechanism,
                                            output_attention=False)
                # Projection from decoder outputs to corpus_size logits.
                output_layer = tf.layers.Dense(
                    units=opts.corpus_size,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.1))

                # Train
                if opts.mode == 'TRAIN':
                    # Zero state whose attention field is replaced by the
                    # result of compute_attention (defined outside this
                    # method) on the zero cell state.
                    dec_initial_state = cell_dec.zero_state(
                        opts.batch_size, tf.float32)
                    attention = compute_attention(
                        attn_mechanism, dec_initial_state.cell_state)  #(1,256)
                    dec_initial_state = dec_initial_state.clone(
                        attention=attention)
                    outputs_dec, _ = tf.nn.dynamic_rnn(
                        cell=cell_dec,
                        inputs=dec_input_embed,
                        sequence_length=self.dec_input_len,
                        initial_state=dec_initial_state,
                        dtype=tf.float32,
                        scope=vs)
                    # logits: `[batch_size, sequence_length, num_decoder_symbols]`
                    # The logits correspond to the prediction across all classes at each timestep.
                    logits = output_layer.apply(outputs_dec)
                    # batch size * max sentence length; binary; 0 for non-word in orignal sentence; mask
                    sequence_mask = tf.sequence_mask(
                        self.dec_input_len,
                        maxlen=opts.max_uttr_len_dec,
                        dtype=tf.float32)
                    if opts.VAD_mode:
                        weights = sequence_mask * target_VAD_loss  # affective objective function
                    else:
                        weights = sequence_mask
                    # sequence_mask: [batch_size, max_len]
                    # target: [batch_size, max_len] VAD_loss: [batch_size,max_len]
                    # softmax_loss_function(labels=targets, logits=logits_flat) defaults to sparse_softmax_cross_entropy_with_logits
                    self.loss = sequence_loss(logits, self.target, weights)
                    # Same loss without averaging across the batch.
                    self.loss_batch = sequence_loss(logits,
                                                    self.target,
                                                    weights,
                                                    average_across_batch=False)
                    self.optimizer = tf.train.AdamOptimizer(
                        opts.learning_rate).minimize(self.loss)
                    self.init = tf.global_variables_initializer()

                # Predict
                if opts.mode == 'PREDICT':
                    # The initial state covers batch_size * beam_width rows to
                    # match the tiled encoder tensors.
                    dec_initial_state = cell_dec.zero_state(
                        opts.batch_size * opts.beam_width, tf.float32)
                    attention = compute_attention(attn_mechanism,
                                                  dec_initial_state.cell_state)
                    dec_initial_state = dec_initial_state.clone(
                        attention=attention)
                    # One start token per original (untiled) batch entry.
                    start_tokens = tf.constant(opts.go_index,
                                               dtype=tf.int32,
                                               shape=[opts.batch_size])
                    bs_decoder = BeamSearchDecoder(
                        cell=cell_dec,
                        embedding=word_embeddings,
                        start_tokens=start_tokens,
                        end_token=opts.eos_index,
                        initial_state=dec_initial_state,
                        beam_width=opts.beam_width,
                        output_layer=output_layer)
                    final_outputs, final_state, _ = dynamic_decode(
                        bs_decoder,
                        impute_finished=False,
                        maximum_iterations=opts.max_uttr_len_dec,
                        scope=vs)
                    self.predicted_ids = final_outputs.predicted_ids
                    #                     self.scores = final_outputs.scores # 'FinalBeamSearchDecoderOutput' object has no attribute 'scores'
                    self.prob = final_state.log_probs
                    # log_probs: The log probabilities with shape `[batch_size, beam_width, vocab_size]`.
                    #  logits: Logits at the current time step. A tensor of shape `[batch_size, beam_width, vocab_size]`
                    # step_log_probs = nn_ops.log_softmax(logits) # logsoftmax = logits - log(reduce_sum(exp(logits), axis))
                    # step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
                    # total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
                    #  final_outputs.scores #[batch_size, length, beam_width]

                if opts.mode == 'POST_PREDICT':
                    dec_initial_state = cell_dec.zero_state(
                        opts.batch_size, tf.float32)
                    attention = compute_attention(
                        attn_mechanism, dec_initial_state.cell_state)  #(1,256)
                    dec_initial_state = dec_initial_state.clone(
                        attention=attention)
                    outputs_dec, _ = tf.nn.dynamic_rnn(
                        cell=cell_dec,
                        inputs=dec_input_embed,
                        sequence_length=self.dec_input_len,
                        initial_state=dec_initial_state,
                        dtype=tf.float32,
                        scope=vs)
                    logits = output_layer.apply(outputs_dec)
                    sequence_mask = tf.sequence_mask(
                        self.dec_input_len,
                        maxlen=opts.max_uttr_len_dec,
                        dtype=tf.float32)
                    score = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.target, logits=logits)
                    self.prob = -1 * tf.reduce_sum(score * sequence_mask)

            self.tvars = tf.trainable_variables()
            self.saver = tf.train.Saver(max_to_keep=100)
Example #18
0
    def __graph__(self):
        """Assemble the seq2seq graph: encoder, attention decoder and mode-specific head.

        The encoder is obtained from ``self.encoder()``. Inside the
        ``'decoder'`` variable scope a Bahdanau attention mechanism is wrapped
        around the cell returned by ``self.create_rnn_cell()``.

        Side effects (attributes written on ``self``):
            * ``self.mode == 'train'``: ``decoder_outputs``, ``loss``,
              ``summary_op`` and ``train_op``.
            * ``self.mode == 'decode'``: ``decoder_predict_decode``.
        """
        # Encoder.
        enc_outputs, enc_state = self.encoder()

        # Decoder.
        with tf.variable_scope('decoder'):
            memory_lengths = self.encoder_inputs_length
            if self.beam_search:
                # With beam search every encoder-side tensor has to be
                # tile_batch'ed, i.e. replicated beam_size times.
                print("use beamsearch decoding..")

                def _tile_for_beam(tensor):
                    return tf.contrib.seq2seq.tile_batch(tensor,
                                                         self.beam_size)

                enc_outputs = tile_batch(enc_outputs,
                                         multiplier=self.beam_size)
                enc_state = nest.map_structure(_tile_for_beam, enc_state)
                memory_lengths = tile_batch(memory_lengths,
                                            multiplier=self.beam_size)

            # Attention mechanism over the (possibly tiled) encoder outputs.
            attn_mechanism = BahdanauAttention(
                num_units=self.rnn_size,
                memory=enc_outputs,
                memory_sequence_length=memory_lengths)

            # Decoder RNN cell, wrapped with the attention mechanism.
            attn_cell = AttentionWrapper(
                cell=self.create_rnn_cell(),
                attention_mechanism=attn_mechanism,
                attention_layer_size=self.rnn_size,
                name='Attention_Wrapper')

            # Under beam search the effective batch is batch_size * beam_size.
            if self.beam_search:
                effective_batch = self.batch_size * self.beam_size
            else:
                effective_batch = self.batch_size

            # Initial decoder state: the wrapper's zero state with its cell
            # state replaced by the encoder's final state.
            zero_state = attn_cell.zero_state(batch_size=effective_batch,
                                              dtype=tf.float32)
            initial_state = zero_state.clone(cell_state=enc_state)

            # Projection from decoder outputs to vocabulary-sized logits.
            kernel_init = tf.truncated_normal_initializer(mean=0.0,
                                                          stddev=0.1)
            projection = tf.layers.Dense(self.vocab_size,
                                         kernel_initializer=kernel_init)

            if self.mode == 'train':
                self.decoder_outputs = self.decoder_train(
                    attn_cell, initial_state, projection)

                # Loss.
                self.loss = sequence_loss(logits=self.decoder_outputs,
                                          targets=self.decoder_targets,
                                          weights=self.mask)

                # Summary.
                tf.summary.scalar('loss', self.loss)
                self.summary_op = tf.summary.merge_all()

                # Optimizer: Adam with global-norm gradient clipping.
                adam = tf.train.AdamOptimizer(self.learing_rate)
                params = tf.trainable_variables()
                raw_grads = tf.gradients(self.loss, params)
                clipped_grads, _ = tf.clip_by_global_norm(
                    raw_grads, self.max_gradient_norm)
                self.train_op = adam.apply_gradients(
                    zip(clipped_grads, params))
            elif self.mode == 'decode':
                self.decoder_predict_decode = self.decoder_decode(
                    attn_cell, initial_state, projection)
Example #19
0
    def build_initial_graph(self,
                            encoder_input,
                            len_both,
                            beam_width=1,
                            reuse=False):
        """
        Build the bidirectional LSTM encoder and the attention-wrapped decoder cell.

        The forward and backward encoder states are concatenated per layer and,
        after tiling by ``beam_width``, installed as the cell state of the
        decoder's zero state.

        :param encoder_input: token ids looked up in ``self.embeddings_english``
        :param len_both: sequence lengths passed to the encoder and to the attention mechanism
        :param beam_width: multiplier used to tile encoder outputs, lengths and states
        :param reuse: forwarded to the variable scopes, e.g. for a validation graph sharing weights
        :return: tuple ``(decoder_cell, attn_zero_state)``
        """
        # Embed the input sequence.
        embedded_input = tf.nn.embedding_lookup(self.embeddings_english,
                                                encoder_input)

        with tf.variable_scope("LSTM_Encoder_subject", reuse=reuse):
            # Bidirectional encoder over the class-level forward/backward cells.
            encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(
                self.encoder_cell_forward,
                self.encoder_cell_backward,
                inputs=embedded_input,
                sequence_length=len_both,
                dtype=tf.float32,
                time_major=False)

            # Join forward and backward outputs along the feature axis.
            memory = tf.concat(encoder_outputs, 2)

            fw_states, bw_states = encoder_states

            # Per layer, join the forward/backward cell (c) and hidden (h)
            # states so they can seed the decoder.
            merged_states = []
            for layer, fw_state in enumerate(fw_states):
                bw_state = bw_states[layer]
                merged_c = tf.concat([fw_state[0], bw_state[0]], 1)
                merged_h = tf.concat([fw_state[1], bw_state[1]], 1)
                merged_states.append((merged_c, merged_h))

            state_tuples = tuple(
                LSTMStateTuple(merged_c, merged_h)
                for merged_c, merged_h in merged_states)

        # Replicate everything beam_width times (a no-op multiplier when 1).
        tiled_memory = tile_batch(memory, beam_width)
        tiled_lengths = tile_batch(len_both, beam_width)
        tiled_final_state = tile_batch(state_tuples, beam_width)

        # Pick the attention class; unknown names terminate the process.
        if self.attention == "luong":
            attention_cls = LuongAttention
        elif self.attention == "bahdanau":
            attention_cls = BahdanauAttention
        else:
            print("Attention mechanism not found.")
            sys.exit()

        with tf.variable_scope("Attention", reuse=reuse):
            mechanism = attention_cls(self.attention_size, tiled_memory,
                                      tiled_lengths)

            # Wrap the decoder cell with the attention mechanism.
            decoder_cell = AttentionWrapper(self.decoder_cell, mechanism,
                                            self.attention_size)

        # Zero state sized for batch_size * beam_width ...
        zero_state = decoder_cell.zero_state(
            batch_size=self.batch_size * beam_width, dtype=tf.float32)

        # ... with its cell state replaced by the tiled encoder final state.
        attn_zero_state = zero_state.clone(cell_state=tiled_final_state)

        return decoder_cell, attn_zero_state
Example #20
0
    def add_decoder_op(self, enc_final_state, enc_hidden_states,
                       output_embed_matrix, training):
        """Build the trigger/query/action decoders, the top classifier and the special decoder.

        Three function classifiers (one per part) predict a function token
        from the concatenated encoder final state; three sequence decoders
        then produce the value tokens for each part. A separate "top"
        classifier predicts the first token, and a "special" decoder produces
        an alternative sequence. The final output selects, per example,
        between the three-part program sequence and the special sequence.

        Args:
            enc_final_state: final encoder state (possibly nested). It is
                flattened and concatenated along axis 1 for the classifiers,
                and its structure is the template for decoder initial states.
            enc_hidden_states: memory handed to ``LuongAttention`` when
                ``self.config.apply_attention`` is set.
            output_embed_matrix: output embedding matrix; only the first
                ``num_control_tokens`` rows and the rows from
                ``first_value_token`` onward are kept for the decoders.
            training: Python truth value forwarded to ``decoder.decode``;
                when true, function tokens come from
                ``self.part_function_placeholders`` instead of predictions.

        Returns:
            ``ThreePartAlignerResult(top_logits, part_logit_preds,
            part_logit_sequence_preds, logit_special_sequence,
            full_sequence)``.
        """
        # Keep the original (structured) state; the flat version feeds dense layers.
        original_enc_final_state = enc_final_state
        flat_enc_final_state = nest.flatten(enc_final_state)
        enc_final_state = tf.concat(flat_enc_final_state, axis=1)
        # NOTE(review): enc_final_size is not read again in this method.
        enc_final_size = int(enc_final_state.get_shape()[1])

        # Per-part outputs, keyed by 'trigger' / 'query' / 'action'.
        part_logit_preds = dict()
        part_token_preds = dict()
        part_logit_sequence_preds = dict()
        part_token_sequence_preds = dict()
        part_layers = []
        grammar = self.config.grammar
        # Stage 1: one function classifier per part.
        for i, part in enumerate(('trigger', 'query', 'action')):
            with tf.variable_scope('decode_function_' + part):
                # Resolve the configured nonlinearity by name, preferring
                # tf.nn and falling back to the top-level tf namespace.
                activation = getattr(
                    tf.nn, self.config.function_nonlinearity) if hasattr(
                        tf.nn, self.config.function_nonlinearity) else getattr(
                            tf, self.config.function_nonlinearity)
                layer = tf.contrib.layers.fully_connected(
                    enc_final_state,
                    self.config.function_hidden_size,
                    activation_fn=activation)
                # The pre-dropout hidden layer is reused later (layer_concat).
                part_layers.append(layer)
                # A distinct fixed seed per part (443 * i).
                layer_with_dropout = tf.nn.dropout(
                    layer, keep_prob=self.dropout_placeholder, seed=443 * i)
                # One logit per function available for this part.
                part_logit_preds[part] = tf.layers.dense(
                    layer_with_dropout, len(grammar.functions[part]))
                part_token_preds[part] = tf.cast(tf.argmax(
                    part_logit_preds[part], axis=1),
                                                 dtype=tf.int32)

        # Sequence decoders use a compressed vocabulary made of the control
        # tokens followed by everything from first_value_token onward; the
        # skipped rows are presumably the begin and function tokens
        # (TODO confirm against the grammar definition).
        first_value_token = grammar.num_functions + grammar.num_begin_tokens + grammar.num_control_tokens
        num_value_tokens = grammar.output_size - first_value_token
        output_embed_matrix = tf.concat(
            (output_embed_matrix[0:grammar.num_control_tokens],
             output_embed_matrix[first_value_token:]),
            axis=0)

        # Shift each part-local function index by a cumulative offset:
        # control + begin tokens, then the function counts of earlier parts.
        adjusted_trigger = part_token_preds['trigger'] + (
            grammar.num_control_tokens + grammar.num_begin_tokens)
        adjusted_query = part_token_preds['query'] + (
            grammar.num_control_tokens + grammar.num_begin_tokens +
            len(grammar.functions['trigger']))
        adjusted_action = part_token_preds['action'] + (
            grammar.num_control_tokens + grammar.num_begin_tokens +
            len(grammar.functions['trigger']) +
            len(grammar.functions['query']))

        # Stage 2: one sequence decoder per part, initialised from the
        # concatenation of all three classifier hidden layers.
        layer_concat = tf.concat(part_layers, axis=1)
        for i, part in enumerate(('trigger', 'query', 'action')):
            with tf.variable_scope('decode_sequence_' + part):

                # Project layer_concat to the width of one flat encoder state
                # component, each in its own numbered variable scope.
                def one_decoder_input(i, like):
                    with tf.variable_scope(str(i)):
                        return tf.layers.dense(layer_concat,
                                               like.get_shape()[1])

                flat_decoder_initial_state = [
                    one_decoder_input(i, like)
                    for i, like in enumerate(flat_enc_final_state)
                ]
                # Re-nest the projections into the encoder state's structure.
                decoder_initial_state = nest.pack_sequence_as(
                    original_enc_final_state, flat_decoder_initial_state)
                cell_dec = tf.contrib.rnn.MultiRNNCell([
                    self.make_rnn_cell(i, True)
                    for i in range(self.config.rnn_layers)
                ])

                # Function token in the full (uncompressed) vocabulary, used
                # to look the function up in the grammar: taken from the
                # placeholder when training, otherwise from the prediction.
                if training:
                    adjusted_function_token = self.part_function_placeholders[
                        part]
                else:
                    if part == 'trigger':
                        adjusted_function_token = adjusted_trigger
                    elif part == 'query':
                        adjusted_function_token = adjusted_query
                    elif part == 'action':
                        adjusted_function_token = adjusted_action

                # Map label tokens into the compressed vocabulary: tokens at
                # or above num_control_tokens are shifted down by the width
                # of the skipped range; control tokens stay as they are.
                output_size = grammar.num_control_tokens + num_value_tokens
                output = self.part_sequence_placeholders[part]
                adjusted_output = tf.where(
                    output >= grammar.num_control_tokens,
                    output - (first_value_token - grammar.num_control_tokens),
                    output)

                if self.config.apply_attention:
                    attention = LuongAttention(self.config.decoder_hidden_size,
                                               enc_hidden_states,
                                               self.input_length_placeholder,
                                               probability_fn=tf.nn.softmax)
                    # cell_input_fn ignores the attention argument, so the
                    # wrapped cell receives the raw inputs unchanged.
                    cell_dec = AttentionWrapper(
                        cell_dec,
                        attention,
                        cell_input_fn=lambda inputs, _: inputs,
                        attention_layer_size=self.config.decoder_hidden_size,
                        initial_cell_state=decoder_initial_state)
                    # The projected state was given to the wrapper as
                    # initial_cell_state; the wrapper's zero_state is what
                    # the decoder actually starts from.
                    decoder_initial_state = cell_dec.zero_state(
                        self.batch_size, dtype=tf.float32)
                decoder = Seq2SeqDecoder(
                    self.config,
                    self.input_placeholder,
                    self.input_length_placeholder,
                    adjusted_output,
                    self.part_sequence_length_placeholders[part],
                    self.batch_number_placeholder,
                    max_length=MAX_PRIMITIVE_LENGTH)
                rnn_output, sample_ids = decoder.decode(
                    cell_dec,
                    decoder_initial_state,
                    output_size,
                    output_embed_matrix,
                    training,
                    grammar_helper=PrimitiveSequenceGrammarHelper(
                        grammar, adjusted_function_token))
                part_logit_sequence_preds[part] = rnn_output
                part_token_sequence_preds[part] = tf.cast(sample_ids,
                                                          dtype=tf.int32)

        # Stage 3: classifier for the first ("top") token, over the begin tokens.
        with tf.variable_scope('top_classifier'):
            top_hidden = tf.contrib.layers.fully_connected(
                enc_final_state,
                self.config.first_token_hidden_size,
                activation_fn=tf.tanh)
            top_hidden_with_dropout = tf.nn.dropout(
                top_hidden, keep_prob=self.dropout_placeholder, seed=127)
            top_logits = tf.layers.dense(top_hidden_with_dropout,
                                         grammar.num_begin_tokens)
            top_token = tf.cast(tf.argmax(top_logits, axis=1), dtype=tf.int32)

        # Stage 4: decoder for the "special" sequence. It starts directly from
        # the encoder's final state and uses a fixed length for every example.
        with tf.variable_scope('decode_special'):
            output_size = grammar.num_control_tokens + num_value_tokens
            output = self.special_label_placeholder
            # Same compressed-vocabulary mapping as in the per-part decoders.
            adjusted_output = tf.where(
                output >= grammar.num_control_tokens,
                output - (first_value_token - grammar.num_control_tokens),
                output)
            cell_dec = tf.contrib.rnn.MultiRNNCell([
                self.make_rnn_cell(i, True)
                for i in range(self.config.rnn_layers)
            ])

            # Every example gets sequence length MAX_SPECIAL_LENGTH.
            sequence_length = tf.ones(
                (self.batch_size, ), dtype=tf.int32) * MAX_SPECIAL_LENGTH
            decoder_initial_state = original_enc_final_state
            if self.config.apply_attention:
                attention = LuongAttention(self.config.decoder_hidden_size,
                                           enc_hidden_states,
                                           self.input_length_placeholder,
                                           probability_fn=tf.nn.softmax)
                cell_dec = AttentionWrapper(
                    cell_dec,
                    attention,
                    cell_input_fn=lambda inputs, _: inputs,
                    attention_layer_size=self.config.decoder_hidden_size,
                    initial_cell_state=original_enc_final_state)
                decoder_initial_state = cell_dec.zero_state(self.batch_size,
                                                            dtype=tf.float32)
            decoder = Seq2SeqDecoder(self.config,
                                     self.input_placeholder,
                                     self.input_length_placeholder,
                                     adjusted_output,
                                     sequence_length,
                                     self.batch_number_placeholder,
                                     max_length=MAX_SPECIAL_LENGTH)
            rnn_output, sample_ids = decoder.decode(
                cell_dec,
                decoder_initial_state,
                output_size,
                output_embed_matrix,
                training,
                grammar_helper=SpecialSequenceGrammarHelper(grammar))
            logit_special_sequence = rnn_output
            token_special_sequence = tf.cast(sample_ids, dtype=tf.int32)

        # Stage 5: map predictions from the compressed vocabulary back to the
        # full output codes. The top token index is offset by the control tokens.
        adjusted_top = tf.expand_dims(top_token + grammar.num_control_tokens,
                                      axis=1)

        # Inverse of the earlier shift: non-control tokens move back up.
        adjusted_special_sequence = tf.where(
            token_special_sequence >= grammar.num_control_tokens,
            token_special_sequence +
            (first_value_token - grammar.num_control_tokens),
            token_special_sequence)

        adjusted_token_sequences = dict()
        for part in ('trigger', 'query', 'action'):
            token_sequence = part_token_sequence_preds[part]
            adjusted_token_sequence = tf.where(
                token_sequence >= grammar.num_control_tokens, token_sequence +
                (first_value_token - grammar.num_control_tokens),
                token_sequence)
            adjusted_token_sequences[part] = adjusted_token_sequence
        # Trigger and query are followed by more tokens in the concatenated
        # program, so their grammar.end tokens are replaced with 0; the
        # action sequence, which comes last, keeps its end token.
        adjusted_token_sequences['trigger'] = tf.where(
            tf.equal(adjusted_token_sequences['trigger'], grammar.end),
            tf.zeros_like(adjusted_token_sequences['trigger']),
            adjusted_token_sequences['trigger'])
        adjusted_token_sequences['query'] = tf.where(
            tf.equal(adjusted_token_sequences['query'], grammar.end),
            tf.zeros_like(adjusted_token_sequences['query']),
            adjusted_token_sequences['query'])

        # Give the predicted function tokens an axis 1 so they can be
        # concatenated with the token sequences.
        adjusted_trigger = tf.expand_dims(adjusted_trigger, axis=1)
        adjusted_query = tf.expand_dims(adjusted_query, axis=1)
        adjusted_action = tf.expand_dims(adjusted_action, axis=1)

        # Program layout: top, then (function token, value tokens) per part.
        program_sequence = tf.concat(
            (adjusted_top, adjusted_trigger,
             adjusted_token_sequences['trigger'], adjusted_query,
             adjusted_token_sequences['query'], adjusted_action,
             adjusted_token_sequences['action']),
            axis=1)
        full_special_sequence = tf.concat(
            (adjusted_top, adjusted_special_sequence), axis=1)
        # The full special sequence is shorter than the program sequence, so
        # it is padded up to the program sequence's length along axis 1.
        full_special_sequence = pad_up_to(full_special_sequence,
                                          tf.shape(program_sequence)[1],
                                          rank=1)

        # Per example: use the program sequence when the predicted top token
        # is 'rule' (expressed relative to the control tokens, like
        # top_token), otherwise use the special sequence.
        rule_token = grammar.dictionary['rule'] - grammar.num_control_tokens
        full_sequence = tf.where(tf.equal(top_token, rule_token),
                                 program_sequence, full_special_sequence)

        return ThreePartAlignerResult(top_logits, part_logit_preds,
                                      part_logit_sequence_preds,
                                      logit_special_sequence, full_sequence)
Example #21
0
    def buildModel(self):
        """Build the attention seq2seq graph with a training decoder and a beam-search decoder.

        All hyper-parameters are read from ``self.args``. The encoder is a
        multi-layer bidirectional LSTM whose forward and backward outputs are
        summed. The training decoder is teacher-forced through a
        ``TrainingHelper``; the beam-search decoder reuses the same decoder
        cell, embedding and projection layer with a fixed beam width of 3.

        Returns:
            The 11-tuple ``(x, y, x_len, y_len, logits, loss, prediction,
            beam_decoder_result_ids, global_step, train_op, summaries)``.
            Note that ``logits`` holds softmax probabilities, not raw scores.
        """
        cfg = self.args
        src_steps = cfg.T_in
        tgt_steps = cfg.T_out
        src_vocab = cfg.D_in
        tgt_vocab = cfg.D_out
        embed_dim = cfg.embedding_dim
        hidden = cfg.hidden_dim
        sos_id = cfg.SOS
        eos_id = cfg.EOS
        pad_id = cfg.PAD
        beam_width = 3

        def _stacked_lstm():
            # One BasicLSTMCell per configured layer, stacked.
            layers = [
                rnn.BasicLSTMCell(num_units=hidden)
                for _ in range(cfg.layer_size)
            ]
            return rnn.MultiRNNCell(layers)

        # Input
        with tf.name_scope('input'):
            # (N, T_in) source token ids.
            src_ids = tf.placeholder(shape=(None, src_steps),
                                     dtype=tf.int32,
                                     name='encoder_inputs')
            # (N, T_out) target token ids.
            tgt_ids = tf.placeholder(shape=(None, tgt_steps),
                                     dtype=tf.int32,
                                     name='decoder_inputs')
            # (N,) source and target lengths.
            src_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            tgt_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            # The number of samples is only known at run time.
            batch_size = tf.shape(src_ids)[0]

            # One-column tensors filled with the special symbols.
            sos_col = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * sos_id
            # Not consumed below; retained so the graph is unchanged.
            _eos_col = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * eos_id
            pad_col = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * pad_id

            # Input masks (neither is consumed below; retained so the graph
            # is unchanged).
            _src_mask = tf.sequence_mask(src_len, src_steps, dtype=tf.float32)
            _tgt_with_sos_mask = tf.sequence_mask(tgt_len,
                                                  tgt_steps + 1,
                                                  dtype=tf.float32)

            # Targets: append one PAD column, then add EOS at index tgt_len.
            tgt_padded = tf.concat([tgt_ids, pad_col], axis=1)
            eos_at_end = tf.one_hot(
                tgt_len, depth=tgt_steps + 1, dtype=tf.int32) * eos_id
            tgt_with_eos = tgt_padded + eos_at_end

            # Decoder inputs: targets shifted right behind an SOS column.
            tgt_with_sos = tf.concat([sos_col, tgt_ids], axis=1)

        ## Embedding
        with tf.name_scope('embedding'):
            if cfg.use_pretrained:
                # Raw float32 dump reshaped to (-1, E); kept frozen.
                pretrained = np.fromfile(cfg.pretrained_file,
                                         dtype=np.float32).reshape(
                                             (-1, embed_dim))
                embedding = tf.Variable(pretrained, trainable=False)
            else:
                embedding = tf.get_variable(name='embedding',
                                            shape=(src_vocab, embed_dim),
                                            dtype=tf.float32,
                                            initializer=xavier_initializer())
            src_embedded = tf.nn.embedding_lookup(embedding, src_ids)
            tgt_embedded = tf.nn.embedding_lookup(embedding, tgt_with_sos)
            if cfg.mode == 'train':
                # Dropout is applied to the encoder inputs only.
                src_embedded = tf.nn.dropout(src_embedded, cfg.keep_prob)

        ## Encoder
        with tf.name_scope('encoder'):
            ## Multi-BiLSTM
            fw_cell = _stacked_lstm()
            bw_cell = _stacked_lstm()
            bi_outputs, bi_states = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                src_embedded,
                sequence_length=src_len,
                dtype=tf.float32,
                time_major=False,
                scope=None)
            # Sum the two directions' outputs.
            encoder_output = bi_outputs[0] + bi_outputs[1]
            # Only the first element of the state pair seeds the decoder.
            encoder_final_state = bi_states[0]

        ## Decoder
        with tf.name_scope('decoder'):
            decoder_cell = _stacked_lstm()
            # Every sample is decoded for T_out + 1 steps during training.
            decoder_lengths = tf.ones(shape=[batch_size],
                                      dtype=tf.int32) * (tgt_steps + 1)

            ## Training decoder
            with tf.variable_scope('attention'):
                train_attention = LuongAttention(
                    num_units=hidden,
                    memory=encoder_output,
                    memory_sequence_length=src_len,
                    name='attention_fn')
            projection_layer = Dense(units=tgt_vocab,
                                     kernel_initializer=xavier_initializer())

            train_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=train_attention,
                attention_layer_size=hidden)
            train_zero_state = train_cell.zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)
            train_init_state = train_zero_state.clone(
                cell_state=encoder_final_state)
            teacher_forcing = TrainingHelper(tgt_embedded,
                                             decoder_lengths,
                                             time_major=False)
            train_decoder = BasicDecoder(cell=train_cell,
                                         helper=teacher_forcing,
                                         initial_state=train_init_state,
                                         output_layer=projection_layer)
            train_result, _, _ = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=tgt_steps + 1)
            # N, T_out+1, D_out
            train_decoder_outputs = ln(train_result.rnn_output)

            ## Beam-search decoder: tile encoder-side tensors per beam.
            beam_memory = tile_batch(encoder_output, beam_width)
            beam_memory_state = tile_batch(encoder_final_state, beam_width)
            beam_memory_length = tile_batch(src_len, beam_width)

            with tf.variable_scope('attention', reuse=True):
                beam_attention = LuongAttention(
                    num_units=hidden,
                    memory=beam_memory,
                    memory_sequence_length=beam_memory_length,
                    name='attention_fn')
            # NOTE(review): attention_layer_size is None here but `hidden`
            # for the training wrapper - confirm the mismatch is intended.
            beam_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=beam_attention,
                attention_layer_size=None)
            beam_zero_state = beam_cell.zero_state(
                batch_size=batch_size * beam_width, dtype=tf.float32)
            beam_init_state = beam_zero_state.clone(
                cell_state=beam_memory_state)
            start_tokens = tf.ones((batch_size), dtype=tf.int32) * sos_id
            beam_decoder = BeamSearchDecoder(
                cell=beam_cell,
                embedding=embedding,
                start_tokens=start_tokens,
                end_token=eos_id,
                initial_state=beam_init_state,
                beam_width=beam_width,
                output_layer=projection_layer)
            beam_result, _, _ = dynamic_decode(
                beam_decoder,
                scope=tf.get_variable_scope(),
                maximum_iterations=tgt_steps + 1)
            beam_decoder_result_ids = beam_result.predicted_ids

        with tf.name_scope('loss'):
            # Despite the name these are probabilities; the Keras loss is
            # called with its default settings on them.
            logits = tf.nn.softmax(train_decoder_outputs)
            cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
                tgt_with_eos, logits)
            # Count tgt_len + 1 positions so the EOS step is included.
            loss_mask = tf.sequence_mask(tgt_len + 1,
                                         tgt_steps + 1,
                                         dtype=tf.float32)
            masked_total = tf.reduce_sum(cross_entropy * loss_mask)
            loss = masked_total / tf.cast(batch_size, dtype=tf.float32)
            prediction = tf.argmax(logits, 2)

        ## train_op
        with tf.name_scope('train'):
            global_step = tf.train.get_or_create_global_step()
            lr = noam_scheme(cfg.lr, global_step, cfg.warmup_steps)
            optimizer = tf.train.AdamOptimizer(lr)

            ## Clip gradients by global norm before applying them.
            params = tf.trainable_variables()
            raw_grads = tf.gradients(loss, params)
            clipped_grads, _ = tf.clip_by_global_norm(raw_grads,
                                                      cfg.gradient_clip_num)
            train_op = optimizer.apply_gradients(zip(clipped_grads, params),
                                                 global_step=global_step)

        # Summary
        with tf.name_scope('summary'):
            tf.summary.scalar('lr', lr)
            tf.summary.scalar('loss', loss)
            tf.summary.scalar('global_step', global_step)
            summaries = tf.summary.merge_all()

        return (src_ids, tgt_ids, src_len, tgt_len, logits, loss, prediction,
                beam_decoder_result_ids, global_step, train_op, summaries)
Example #22
0
    def build_model(self):
        """Build the attention-based seq2seq graph for the configured mode.

        Creates the input placeholders, the embedding, the encoder and an
        attention-wrapped decoder cell. Depending on ``self.mode``:

        * ``'train'``: builds ``self.decoder_logits_train``, ``self.loss``
          and ``self.train_op`` (Adam with global-norm gradient clipping).
        * ``'infer'``: builds ``self.infer_outputs`` with beam search when
          ``self.beam_search`` is set, otherwise with greedy decoding.

        Any other mode builds neither branch. ``self.saver`` is created over
        all global variables in every case.
        """
        print('building model... ...')
        with tf.variable_scope('seq2seq_placeholder'):
            self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                                 name="encoder_inputs")
            self.decoder_inputs = tf.placeholder(tf.int32, [None, None],
                                                 name="decoder_inputs")
            self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                                  name="decoder_targets")
            self.decoder_targets_masks = tf.placeholder(tf.float32,
                                                        [None, None],
                                                        name="mask")
            self.encoder_length = tf.placeholder(tf.int32, [None],
                                                 name="encoder_length")
            self.decoder_length = tf.placeholder(tf.int32, [None],
                                                 name="decoder_length")
            # Longest target in the batch; caps the training decode loop.
            self.max_target_sequence_length = tf.reduce_max(
                self.decoder_length, name='max_target_len')

        with tf.variable_scope('seq2seq_embedding'):
            self.embedding = self.init_embedding(self.vocab_size,
                                                 self.embedding_size)

        with tf.variable_scope('seq2seq_encoder'):
            encoder_outputs, encoder_states = build_encoder(
                self.embedding,
                self.encoder_inputs,
                self.encoder_length,
                self.enc_num_layers,
                self.enc_num_units,
                self.enc_cell_type,
                bidir=self.enc_bidir)

        with tf.variable_scope('seq2seq_decoder'):
            # Beam search is an inference-only technique. Tiling the encoder
            # results while training would make the decoder state beam_size
            # times larger than the batch fed by the TrainingHelper, so the
            # flag is only honoured in 'infer' mode.
            use_beam_search = bool(self.beam_search) and self.mode == 'infer'

            encoder_length = self.encoder_length
            if use_beam_search:
                print("use beamsearch decoding..")
                # Repeat every batch entry beam_size times so each beam has
                # its own copy of the encoder memory, state and length.
                encoder_outputs = tile_batch(encoder_outputs,
                                             multiplier=self.beam_size)
                encoder_states = tile_batch(encoder_states,
                                            multiplier=self.beam_size)
                encoder_length = tile_batch(encoder_length,
                                            multiplier=self.beam_size)

            attention_mechanism = BahdanauAttention(
                num_units=self.attn_num_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_length)

            decoder_cell = create_rnn_cell(self.dec_num_layers,
                                           self.dec_num_units,
                                           self.dec_cell_type)
            decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=self.dec_num_units,
                name='Attention_Wrapper')

            # The decoder state must match the (possibly tiled) batch size.
            batch_size = self.batch_size
            if use_beam_search:
                batch_size = self.batch_size * self.beam_size

            # Start from a zero attention state, seeded with the encoder's
            # final state as the wrapped cell's state.
            decoder_initial_state = decoder_cell.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=encoder_states)

            # Projects decoder outputs to vocabulary-sized logits.
            output_layer = tf.layers.Dense(self.vocab_size,
                                           use_bias=False,
                                           name='output_projection')

            if self.mode == 'train':
                decoder_inputs_embedded = tf.nn.embedding_lookup(
                    self.embedding, self.decoder_inputs)
                # The training helper feeds the given decoder inputs as the
                # input of the next time step, instead of the decoder's own
                # output from the previous step.
                training_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=decoder_inputs_embedded,
                    sequence_length=self.decoder_length,
                    name='training_helper')

                training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=training_helper,
                    initial_state=decoder_initial_state,
                    output_layer=output_layer)

                decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_sequence_length)

                self.decoder_logits_train = decoder_outputs.rnn_output

                # Loss over the logits, weighted by the target mask.
                self.loss = tf.contrib.seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets,
                    weights=self.decoder_targets_masks)

                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                trainable_params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, trainable_params)
                # Clip by global norm before applying the update.
                clip_gradients, _ = tf.clip_by_global_norm(
                    gradients, self.max_gradient_norm)
                self.train_op = optimizer.apply_gradients(
                    zip(clip_gradients, trainable_params))

            elif self.mode == 'infer':
                # start_tokens keeps the untiled batch_size; it does not
                # need to be replicated per beam here.
                start_tokens = tf.ones([
                    self.batch_size,
                ], tf.int32) * SOS_ID
                end_token = EOS_ID

                if use_beam_search:
                    inference_decoder = BeamSearchDecoder(
                        cell=decoder_cell,
                        embedding=self.embedding,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=decoder_initial_state,
                        beam_width=self.beam_size,
                        output_layer=output_layer)
                else:
                    decoding_helper = GreedyEmbeddingHelper(
                        embedding=self.embedding,
                        start_tokens=start_tokens,
                        end_token=end_token)

                    inference_decoder = BasicDecoder(
                        cell=decoder_cell,
                        helper=decoding_helper,
                        initial_state=decoder_initial_state,
                        output_layer=output_layer)

                decoder_outputs, _, _ = dynamic_decode(
                    decoder=inference_decoder,
                    maximum_iterations=self.infer_max_iter)
                if use_beam_search:
                    # [batch_size, decoder_targets_length, beam_size]
                    infer_outputs = decoder_outputs.predicted_ids
                    # -> [batch_size, beam_size, decoder_targets_length]
                    self.infer_outputs = tf.transpose(infer_outputs,
                                                      [0, 2, 1])
                else:
                    # [batch_size, decoder_targets_length]
                    self.infer_outputs = decoder_outputs.sample_id

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=self.max_to_keep)
    def add_prediction_op(self):
        """Build the encoder / attention-decoder graph and return its output.

        In training mode (``self.config.train`` is truthy) the decoder is
        driven by a ``TrainingHelper`` over the embedded ``self.labels[:, :-1]``.
        Otherwise a ``BeamSearchDecoder`` of width ``self.config.beam_width``
        is used, with the encoder results tiled per beam.

        Returns:
            A tensor named ``"prediction"``: ``decoder_outputs.rnn_output`` in
            training mode, ``decoder_outputs.predicted_ids`` otherwise.
        """
        # Two ids beyond the vocabulary are reserved: vocab_size is used as
        # the end token and vocab_size + 1 as the start token below.
        extended_vocab_size = self.config.vocab_size + 2

        encoder_embed_seq = embed_sequence(
            self.inputs,
            vocab_size=extended_vocab_size,
            embed_dim=self.config.embedding_size,
            scope='embed')

        # Decoder inputs are the labels without their last step; they share
        # the 'embed' scope with the encoder (reuse=True).
        decoder_input_embed_seq = embed_sequence(
            self.labels[:, :-1],
            vocab_size=extended_vocab_size,
            embed_dim=self.config.embedding_size,
            scope='embed',
            reuse=True)

        # Fetch the embedding matrix itself for lookups during beam search.
        with tf.variable_scope('embed', reuse=True):
            embeddings = tf.get_variable('embeddings')

        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            BasicLSTMCell(self.config.num_units, name="encoder"),
            encoder_embed_seq,
            dtype=tf.float32,
            sequence_length=self.lengths,
        )

        if self.config.train:
            tiled_encoder_outputs = encoder_outputs
            tiled_encoder_final_state = encoder_final_state
            tiled_sequence_length = self.lengths
            batch_size = self.config.batch_size
        else:
            # Beam search needs every batch entry repeated beam_width times.
            tiled_encoder_outputs = tile_batch(
                encoder_outputs, multiplier=self.config.beam_width)
            tiled_encoder_final_state = tile_batch(
                encoder_final_state, multiplier=self.config.beam_width)
            tiled_sequence_length = tile_batch(
                self.lengths, multiplier=self.config.beam_width)
            batch_size = self.config.batch_size * self.config.beam_width

        attention_mechanism = BahdanauAttention(
            num_units=self.config.num_units,
            memory=tiled_encoder_outputs,
            memory_sequence_length=tiled_sequence_length)

        # Integer division: a layer size must be an int. True division
        # ('/') yields a float on Python 3 and a fractional size when
        # num_units is odd.
        attn_cell = AttentionWrapper(
            BasicLSTMCell(self.config.num_units, name="decoder"),
            attention_mechanism,
            attention_layer_size=self.config.num_units // 2)

        # Zero attention state, seeded with the encoder's final state.
        decoder_initial_state = attn_cell.zero_state(dtype=tf.float32,
                                                     batch_size=batch_size)
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=tiled_encoder_final_state)

        output_layer = tf.layers.Dense(extended_vocab_size,
                                       use_bias=True,
                                       name='output_projection')

        max_decode_steps = self.config.max_sequence_length + 1

        if self.config.train:
            # NOTE(review): the decoder sequence length reuses self.lengths,
            # the same lengths given to the encoder -- confirm that targets
            # are expected to share the input lengths.
            training_helper = TrainingHelper(inputs=decoder_input_embed_seq,
                                             sequence_length=self.lengths,
                                             name='training_helper')

            decoder = BasicDecoder(cell=attn_cell,
                                   helper=training_helper,
                                   initial_state=decoder_initial_state,
                                   output_layer=output_layer)

            decoder_outputs, _, _ = dynamic_decode(
                decoder=decoder,
                impute_finished=True,
                maximum_iterations=max_decode_steps)
            pred_logits = tf.identity(decoder_outputs.rnn_output,
                                      name="prediction")
        else:

            def embed_and_input_proj(inputs):
                """Look up embeddings for the ids produced by beam search."""
                return tf.nn.embedding_lookup(embeddings, inputs)

            # One start token per (untiled) batch entry.
            start_tokens = tf.ones([
                self.config.batch_size,
            ], tf.int32) * (self.config.vocab_size + 1)
            decoder = BeamSearchDecoder(
                cell=attn_cell,
                embedding=embed_and_input_proj,
                start_tokens=start_tokens,
                end_token=self.config.vocab_size,
                initial_state=decoder_initial_state,
                beam_width=self.config.beam_width,
                output_layer=output_layer,
            )

            decoder_outputs, _, _ = dynamic_decode(
                decoder=decoder,
                impute_finished=False,
                maximum_iterations=max_decode_steps)
            pred_logits = tf.identity(decoder_outputs.predicted_ids,
                                      name="prediction")
        return pred_logits