def build_decoder_cell(self):
    """Create the attention-wrapped decoder cell and its initial state.

    With ``self.beam_search`` enabled, the encoder outputs, encoder state and
    input lengths are tiled ``self.beam_size`` times. The tiled outputs and
    state are stored back on ``self``, so this method mutates the instance.

    Returns:
        Tuple ``(decoder_cell, decoder_initial_state)``.
    """
    memory_lengths = self.encoder_inputs_length
    if self.beam_search:
        print("use beamsearch decoding..")
        self.encoder_outputs = tile_batch(
            self.encoder_outputs, multiplier=self.beam_size)

        def _tile(state_part):
            return tile_batch(state_part, self.beam_size)

        self.encoder_state = nest.map_structure(_tile, self.encoder_state)
        memory_lengths = tile_batch(memory_lengths,
                                    multiplier=self.beam_size)

    # Attention mechanism over the (possibly tiled) encoder outputs.
    attention = BahdanauAttention(
        num_units=self.rnn_size,
        memory=self.encoder_outputs,
        memory_sequence_length=memory_lengths)

    # Base decoder RNN cell, then wrapped with the attention mechanism.
    wrapped_cell = AttentionWrapper(
        cell=self.create_rnn_cell(),
        attention_mechanism=attention,
        attention_layer_size=self.rnn_size,
        name='Attention_Wrapper')

    if self.beam_search:
        batch_size = self.batch_size * self.beam_size
    else:
        batch_size = self.batch_size

    # Start from a zero state and substitute the encoder state as cell state.
    zero_state = wrapped_cell.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)
    return wrapped_cell, zero_state.clone(cell_state=self.encoder_state)
def _build_infer(self, config):
    """Build the beam-search inference graph.

    The original author's note says this is skipped for the flat baseline.
    Writes the predicted ids to ``self.preds`` and the final beam
    log-probabilities to ``self.scores``.

    Args:
        config: settings object; this method reads ``use_att``,
            ``beam_width``, ``decode_size``, ``test_batch_size``, ``GO``,
            ``EOS`` and ``max_seq_length`` from it.
    """
    beam = config.beam_width

    # Both configurations tile the context and its lengths first. Only the
    # attention branch actually consumes these two tensors.
    tiled_inputs = tile_batch(self.xx_context, multiplier=beam)
    tiled_sequence_length = tile_batch(self.x_seq_length, multiplier=beam)

    if config.use_att:
        tiled_first_attention = tile_batch(self.first_attention,
                                           multiplier=beam)
        attention_mechanism = BahdanauAttention(
            config.decode_size,
            memory=tiled_inputs,
            memory_sequence_length=tiled_sequence_length)
        tiled_final = tile_batch(self.xx_final, beam)
        cell = AttentionWrapper(self.lstm, attention_mechanism,
                                output_attention=False)
        zero_state = cell.zero_state(
            dtype=tf.float32,
            batch_size=config.test_batch_size * beam)
        # Both LSTM state slots are seeded with the same tiled tensor.
        initial_state = zero_state.clone(
            cell_state=rnn.LSTMStateTuple(tiled_final, tiled_final),
            attention=tiled_first_attention)
    else:
        tiled_final = tile_batch(self.xx_final, beam)
        cell = self.lstm
        initial_state = rnn.LSTMStateTuple(tiled_final, tiled_final)

    infer_decoder = BeamSearchDecoder(
        cell,
        embedding=self.label_embeddings,
        start_tokens=[config.GO] * config.test_batch_size,
        end_token=config.EOS,
        initial_state=initial_state,
        beam_width=beam,
        output_layer=self.output_l)
    outputs, final_state, _ = dynamic_decode(
        infer_decoder, maximum_iterations=config.max_seq_length)

    self.preds = outputs.predicted_ids
    self.scores = final_state.log_probs
def setup_decoder_cell(self, config, keep_prob, use_beam_search, init_state,
                       attention_states, attention_lengths):
    """Build the (multi-layer) attention decoder cell and its initial state.

    Args:
        config: settings object; reads ``attention_type``,
            ``use_attention_input_feeding``, ``hidden_size`` and
            ``top_attention``.
        keep_prob: forwarded to ``_setup_decoder_cell``.
        use_beam_search: when true, the attention memory, its lengths and
            ``init_state`` are tiled ``self.beam_width`` times; the same flag
            is also passed as ``alignment_history`` to the wrapper.
        init_state: encoder state used to seed the decoder.
        attention_states: memory tensor the attention mechanism reads.
        attention_lengths: lengths used to mask the memory.

    Returns:
        Tuple ``(cells, init_state)``.
    """
    batch_size = get_state_shape(init_state)[0]
    if use_beam_search:
        beam = self.beam_width
        attention_states = tile_batch(attention_states, multiplier=beam)
        init_state = nest.map_structure(
            lambda s: tile_batch(s, self.beam_width), init_state)
        attention_lengths = tile_batch(attention_lengths, multiplier=beam)
        batch_size = batch_size * beam

    attention_size = shape(attention_states, -1)
    # The mechanism class is looked up by name in tf.contrib.seq2seq.
    attention_cls = getattr(tf.contrib.seq2seq, config.attention_type)
    attention = attention_cls(attention_size, attention_states,
                              memory_sequence_length=attention_lengths)

    def cell_input_fn(inputs, attn):
        # Keeps the cell input dimension equal to hidden_size when attention
        # input feeding is on; otherwise the inputs pass through untouched.
        if config.use_attention_input_feeding:
            projection = tf.layers.Dense(config.hidden_size,
                                         dtype=tf.float32,
                                         name='attn_input_feeding',
                                         activation=self.activation)
            return projection(tf.concat([inputs, attn], axis=-1))
        return inputs

    # Arguments shared by both wrapper placements.
    wrapper_kwargs = dict(attention_mechanism=attention,
                          name="AttentionWrapper",
                          attention_layer_size=config.hidden_size,
                          alignment_history=use_beam_search,
                          cell_input_fn=cell_input_fn)

    cells = _setup_decoder_cell(config, keep_prob)
    if config.top_attention:
        # Attention is applied to the top decoder layer only.
        cells[-1] = AttentionWrapper(cells[-1],
                                     initial_cell_state=init_state[-1],
                                     **wrapper_kwargs)
        layer_states = list(init_state)
        layer_states[-1] = cells[-1].zero_state(batch_size=batch_size,
                                                dtype=tf.float32)
        init_state = tuple(layer_states)
        cells = MultiRNNCell(cells)
    else:
        # Attention wraps the whole layer stack.
        cells = AttentionWrapper(MultiRNNCell(cells),
                                 initial_cell_state=init_state,
                                 **wrapper_kwargs)
        zero_state = cells.zero_state(batch_size=batch_size,
                                      dtype=tf.float32)
        init_state = zero_state.clone(cell_state=init_state)
    return cells, init_state
def _create_decoder_cell(self):
    """Build the attention decoder cell and its initial state.

    Reads the encoder outputs, states and sequence lengths from ``self`` and
    the hyper-parameters from ``self.cfg``.

    Returns:
        Tuple ``(cells, dec_init_states)``.
    """
    enc_outputs = self.enc_outputs
    enc_states = self.enc_states
    enc_seq_len = self.enc_seq_len

    if self.use_beam_search:
        batch_size = self.batch_size * self.cfg.beam_size
    else:
        batch_size = self.batch_size

    with tf.variable_scope("attention"):
        # Luong attention on request; Bahdanau is the default.
        if self.cfg.attention == "luong":
            mechanism_cls = LuongAttention
        else:
            mechanism_cls = BahdanauAttention
        attention_mechanism = mechanism_cls(
            num_units=self.cfg.num_units,
            memory=enc_outputs,
            memory_sequence_length=enc_seq_len)

    def cell_input_fn(inputs, attn):
        # Keeps the cell input/output dimension the same when attention input
        # feeding is enabled.
        # reference: https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/AttentionWrapper
        if self.cfg.use_attention_input_feeding:
            input_project = tf.layers.Dense(self.cfg.num_units,
                                            dtype=tf.float32,
                                            name='attn_input_feeding')
            return input_project(tf.concat([inputs, attn], axis=-1))
        return inputs

    wrapper_kwargs = dict(attention_mechanism=attention_mechanism,
                          name="Attention_Wrapper",
                          attention_layer_size=self.cfg.num_units,
                          cell_input_fn=cell_input_fn)

    if self.cfg.top_attention:
        # Attention is applied to the top decoder layer only.
        layers = []
        for _ in range(self.cfg.num_layers):
            layers.append(self._create_rnn_cell())
        layers[-1] = AttentionWrapper(layers[-1],
                                      initial_cell_state=enc_states[-1],
                                      **wrapper_kwargs)
        layer_states = list(enc_states)
        layer_states[-1] = layers[-1].zero_state(batch_size=batch_size,
                                                 dtype=tf.float32)
        dec_init_states = tuple(layer_states)
        cells = MultiRNNCell(layers)
    else:
        stack = MultiRNNCell(
            [self._create_rnn_cell() for _ in range(self.cfg.num_layers)])
        cells = AttentionWrapper(stack,
                                 initial_cell_state=enc_states,
                                 **wrapper_kwargs)
        zero_state = cells.zero_state(batch_size=batch_size,
                                      dtype=tf.float32)
        dec_init_states = zero_state.clone(cell_state=enc_states)
    return cells, dec_init_states
def build_decoder_cell(self):
    """Build the decoder RNN cell with Bahdanau attention.

    In beam-search mode the encoder outputs, encoder state and input lengths
    are replicated ``self.beam_size`` times with ``tile_batch``; the tiled
    outputs and state overwrite ``self.encoder_outputs`` and
    ``self.encoder_state``.

    Returns:
        Tuple ``(decoder_cell, decoder_initial_state)``.
    """
    beam_search = self.beam_search
    lengths = self.encoder_inputs_length  # encoder input lengths

    if beam_search:
        print("use beamsearch decoding..")
        # tile_batch copies its first argument `multiplier` times; here that
        # is beam_size copies.
        self.encoder_outputs = tile_batch(self.encoder_outputs,
                                          multiplier=self.beam_size)
        # The encoder state can be a nested structure (an LSTM state has
        # both c and h), so the tiling is mapped over every element.
        self.encoder_state = nest.map_structure(
            lambda part: tile_batch(part, self.beam_size),
            self.encoder_state)
        lengths = tile_batch(lengths, multiplier=self.beam_size)

    # Bahdanau attention, see:
    #   Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
    #   "Neural Machine Translation by Jointly Learning to Align and
    #   Translate." ICLR 2015. https://arxiv.org/abs/1409.0473
    # A normalized variant is available by passing normalize=True, see:
    #   Tim Salimans, Diederik P. Kingma. "Weight Normalization: A Simple
    #   Reparameterization to Accelerate Training of Deep Neural Networks."
    #   https://arxiv.org/abs/1602.07868
    mechanism = BahdanauAttention(
        num_units=self.rnn_size,  # hidden-layer size
        memory=self.encoder_outputs,  # usually the encoder outputs
        # Mask for the memory: positions past the length are not attended.
        memory_sequence_length=lengths)

    # Plain decoder RNN cell first, then the attention wrapper around it.
    base_cell = self.create_rnn_cell()
    decoder_cell = AttentionWrapper(
        cell=base_cell,  # the RNN being wrapped
        attention_mechanism=mechanism,  # AttentionMechanism instance
        # TODO (original author's open question): is this the state size of
        # the wrapped RNN?
        attention_layer_size=self.rnn_size,
        name='Attention_Wrapper')

    # With beam search the effective batch is batch_size * beam_size.
    batch_size = self.batch_size
    if beam_search:
        batch_size = self.batch_size * self.beam_size

    # zero_state() yields an all-zero wrapper state; clone() then swaps in
    # the encoder state as the decoder's initial cell state.
    blank_state = decoder_cell.zero_state(batch_size=batch_size,
                                          dtype=tf.float32)
    decoder_initial_state = blank_state.clone(cell_state=self.encoder_state)
    return decoder_cell, decoder_initial_state
def create_attention(decoding_cell, encoding_op, encoding_st, fr_len):
    """Wrap ``decoding_cell`` with Luong attention over the encoder outputs.

    Reads the module-level ``args``, ``hidden_size`` and ``batch_size``.

    Args:
        decoding_cell: RNN cell to wrap.
        encoding_op: memory passed to the attention mechanism.
        encoding_st: encoder state used as the wrapper's initial cell state.
        fr_len: passed to ``LuongAttention`` as its third positional argument
            (presumably the memory sequence lengths; confirm against caller).

    Returns:
        Tuple ``(decoding_cell, attention_zero_state)``: the wrapped cell and
        its zero state with ``cell_state`` replaced by ``encoding_st``.

    Raises:
        ValueError: if ``args.attention_option`` is not ``"Luong"``. No other
            mechanism is implemented here; previously this case failed on an
            undefined ``attention_mechanism``.
    """
    # Compare by value. ``is`` tests object identity, which is not guaranteed
    # for equal strings (e.g. a value parsed from the command line).
    if args.attention_option != "Luong":
        raise ValueError(
            "Unsupported attention_option: %r" % (args.attention_option,))

    print("Attention is all I need.")
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        hidden_size, encoding_op, fr_len)

    decoding_cell = AttentionWrapper(decoding_cell, attention_mechanism,
                                     hidden_size)
    attention_zero_state = decoding_cell.zero_state(batch_size, tf.float32)
    attention_zero_state = attention_zero_state.clone(cell_state=encoding_st)
    return decoding_cell, attention_zero_state
def __graph__(self):
    """Assemble the encoder, the attention decoder and the mode-specific ops.

    In ``'train'`` mode this sets ``self.decoder_outputs``, ``self.loss``,
    ``self.summary_op`` and ``self.train_op``; in ``'decode'`` mode it sets
    ``self.decoder_predict_decode``.
    """
    # encoder
    encoder_outputs, encoder_state = self.encoder()

    # decoder
    with tf.variable_scope('decoder'):
        memory_lengths = self.encoder_inputs_length
        if self.beam_search:
            # Beam search needs the encoder results replicated beam_size
            # times, which is what tile_batch does.
            print("use beamsearch decoding..")
            encoder_outputs = tile_batch(encoder_outputs,
                                         multiplier=self.beam_size)
            encoder_state = nest.map_structure(
                lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size),
                encoder_state)
            memory_lengths = tile_batch(memory_lengths,
                                        multiplier=self.beam_size)

        # Attention mechanism to use.
        mechanism = BahdanauAttention(
            num_units=self.rnn_size,
            memory=encoder_outputs,
            memory_sequence_length=memory_lengths)

        # Decoder RNN cell, wrapped with attention.
        decoder_cell = AttentionWrapper(
            cell=self.create_rnn_cell(),
            attention_mechanism=mechanism,
            attention_layer_size=self.rnn_size,
            name='Attention_Wrapper')

        # With beam search the batch is batch_size * beam_size.
        if self.beam_search:
            batch_size = self.batch_size * self.beam_size
        else:
            batch_size = self.batch_size

        # The decoder starts from the encoder's final state.
        blank_state = decoder_cell.zero_state(batch_size=batch_size,
                                              dtype=tf.float32)
        decoder_initial_state = blank_state.clone(cell_state=encoder_state)

        projection_init = tf.truncated_normal_initializer(mean=0.0,
                                                          stddev=0.1)
        output_layer = tf.layers.Dense(self.vocab_size,
                                       kernel_initializer=projection_init)

        if self.mode == 'train':
            self.decoder_outputs = self.decoder_train(
                decoder_cell, decoder_initial_state, output_layer)

            # loss
            self.loss = sequence_loss(logits=self.decoder_outputs,
                                      targets=self.decoder_targets,
                                      weights=self.mask)

            # summary
            tf.summary.scalar('loss', self.loss)
            self.summary_op = tf.summary.merge_all()

            # optimizer, with gradients clipped by global norm
            optimizer = tf.train.AdamOptimizer(self.learing_rate)
            params = tf.trainable_variables()
            raw_gradients = tf.gradients(self.loss, params)
            clipped, _ = tf.clip_by_global_norm(raw_gradients,
                                                self.max_gradient_norm)
            self.train_op = optimizer.apply_gradients(zip(clipped, params))
        elif self.mode == 'decode':
            self.decoder_predict_decode = self.decoder_decode(
                decoder_cell, decoder_initial_state, output_layer)
def apply_attention(cell_dec, enc_hidden_states, enc_final_state,
                    input_length, batch_size, attention_probability_fn,
                    dropout, alignment_history=True):
    """Wrap a decoder cell with Luong attention, tanh activation and dropout.

    Args:
        cell_dec: decoder cell to wrap.
        enc_hidden_states: memory for the attention mechanism.
        enc_final_state: used as the wrapper's ``initial_cell_state``.
        input_length: lengths used to mask the memory.
        batch_size: batch size for the wrapper's zero state.
        attention_probability_fn: ``'softmax'``, ``'hardmax'`` or
            ``'sparsemax'``.
        dropout: passed as ``output_keep_prob`` to the dropout wrapper.
        alignment_history: forwarded to ``AttentionWrapper``.

    Returns:
        Tuple ``(cell_dec, enc_final_state)``: the wrapped cell and the
        attention wrapper's zero state.

    Raises:
        ValueError: for an unknown ``attention_probability_fn``.
    """
    if attention_probability_fn in ('softmax', 'hardmax'):
        score_mask_value = float('-inf')
        if attention_probability_fn == 'softmax':
            probability_fn = tf.nn.softmax
        else:
            probability_fn = tf.contrib.seq2seq.hardmax
    elif attention_probability_fn == 'sparsemax':

        def _checked_sparsemax(scores):
            # Sparsemax output guarded by range assertions (>= 0 and <= 1).
            scores = tf.contrib.sparsemax.sparsemax(scores)
            checks = [
                tf.assert_non_negative(scores),
                tf.assert_less_equal(scores, 1., summarize=60),
            ]
            with tf.control_dependencies(checks):
                return tf.identity(scores)

        probability_fn = _checked_sparsemax
        # Per the original author: sparsemax does not deal with -inf
        # properly and is numerically unstable with large magnitudes, hence
        # a finite mask value.
        score_mask_value = -1e+5
    else:
        raise ValueError("Invalid attention_probability_fn " +
                         str(attention_probability_fn))

    output_size = int(cell_dec.output_size)
    identity_init = tf.initializers.identity(dtype=tf.float32)
    with tf.variable_scope('attention', initializer=identity_init):
        attention = LuongAttention(output_size,
                                   enc_hidden_states,
                                   memory_sequence_length=input_length,
                                   probability_fn=probability_fn,
                                   score_mask_value=score_mask_value)

    def _passthrough_inputs(inputs, _):
        # The attention vector is not concatenated to the cell inputs.
        return inputs

    cell_dec = AttentionWrapper(cell_dec,
                                attention,
                                cell_input_fn=_passthrough_inputs,
                                attention_layer_size=output_size,
                                alignment_history=alignment_history,
                                initial_cell_state=enc_final_state)
    enc_final_state = cell_dec.zero_state(batch_size, dtype=tf.float32)

    cell_dec = ActivationWrapper(cell_dec, activation=tf.tanh)
    cell_dec = NotBrokenDropoutWrapper(cell_dec, output_keep_prob=dropout)
    return cell_dec, enc_final_state
def __init__(self, hidden_size, keep_prob, attention_mechanism=None,
             name="RNNEncoder"):
    """Create the forward and backward GRU cells of the encoder.

    Args:
        hidden_size: number of units in each GRU cell.
        keep_prob: input keep probability for the dropout wrappers.
        attention_mechanism: if given, each GRU cell is wrapped in an
            ``AttentionWrapper`` using it before dropout is applied.
        name: variable scope name, also stored on ``self.name``.
    """
    with vs.variable_scope(name):
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob

        def _make_cell():
            # GRU -> optional attention wrapper -> input dropout.
            cell = rnn_cell.GRUCell(self.hidden_size)
            if attention_mechanism is not None:
                cell = AttentionWrapper(cell, attention_mechanism)
            return DropoutWrapper(cell, input_keep_prob=self.keep_prob)

        self.rnn_cell_fw = _make_cell()
        self.rnn_cell_bw = _make_cell()
        self.name = name
def _build_train(self, config):
    """Build the training logits and store them in ``self.logits``.

    For the two flat models a single fully connected layer produces the
    logits. Otherwise an LSTM decoder (with attention when
    ``config.use_att`` is set) is unrolled with teacher forcing over
    ``self.yy``.
    """
    model_name = config.model_name
    if model_name in ("fasttext_flat", "RCNN_flat"):
        # The flat baselines differ only in which tensor is classified.
        if model_name == "fasttext_flat":
            features = self.first_attention
        else:
            features = self.xx_final
        self.logits = tf.contrib.layers.fully_connected(
            features, config.fn_classes, activation_fn=None)
        print("logits:", self.logits.get_shape())
        self.logits = tf.reshape(self.logits, [-1, config.fn_classes])
        return

    # Both LSTM state slots are seeded with the same encoder summary.
    encoder_state = rnn.LSTMStateTuple(self.xx_final, self.xx_final)
    if config.use_att:
        attention_mechanism = BahdanauAttention(
            config.decode_size,
            memory=self.xx_context,
            memory_sequence_length=self.x_seq_length)
        cell = AttentionWrapper(self.lstm, attention_mechanism,
                                output_attention=False)
        zero_state = cell.zero_state(dtype=tf.float32,
                                     batch_size=config.batch_size)
        initial_state = zero_state.clone(cell_state=encoder_state,
                                         attention=self.first_attention)
    else:
        cell = self.lstm
        initial_state = encoder_state

    train_helper = TrainingHelper(self.yy, self.y_seq_length)
    train_decoder = BasicDecoder(cell, train_helper, initial_state,
                                 output_layer=self.output_l)
    self.decoder_outputs_train, _, _ = dynamic_decode(train_decoder,
                                                      impute_finished=True)
    self.logits = self.decoder_outputs_train.rnn_output
    print("logits:", self.logits.get_shape())
def run_match_lstm(encoded_rep, masks, size):
    """Run an attention LSTM over the passage in both directions.

    Args:
        encoded_rep: pair ``(encoded_question, encoded_passage)``.
        masks: pair ``(masks_question, masks_passage)``; the question entry
            is passed as the attention ``memory_sequence_length`` and the
            passage entry is passed to ``_reverse``.
        size: number of units of the LSTM cell.

    Returns:
        Forward and backward outputs concatenated on the last axis; the
        original author annotates the shape as (-1, P, 2*H).
    """
    question, passage = encoded_rep
    question_mask, passage_mask = masks

    def _concat_input_and_attention(curr_input, state):
        return tf.concat([curr_input, state], axis=-1)

    query_depth = question.get_shape()[-1]

    with tf.variable_scope("match_lstm_attender"):
        attention = BahdanauAttention(query_depth, question,
                                      memory_sequence_length=question_mask)
        lstm = tf.contrib.rnn.BasicLSTMCell(num_units=size,
                                            state_is_tuple=True)
        # output_attention=False: emit the cell output, not the attention
        # values.
        attender = AttentionWrapper(lstm, attention,
                                    output_attention=False,
                                    cell_input_fn=_concat_input_and_attention)

        # Per the original author: the passage is not masked here because
        # masking of the memories is handled by the pointer network.
        reversed_passage = _reverse(passage, passage_mask, 1, 0)

        # The same cell and scope are used for both directions.
        forward_out, _ = tf.nn.dynamic_rnn(attender, passage,
                                           dtype=tf.float32, scope="rnn")
        backward_out, _ = tf.nn.dynamic_rnn(attender, reversed_passage,
                                            dtype=tf.float32, scope="rnn")
        # Flip the backward outputs back into passage order.
        backward_out = _reverse(backward_out, passage_mask, 1, 0)

    return tf.concat([forward_out, backward_out], axis=-1)
def __init__(self,
             memory,
             memory_sequence_length=None,
             cell=None,
             cell_dropout_mode=None,
             vocab_size=None,
             output_layer=None,
             # TODO(zhiting): an `attention_layer` argument is only valid
             # for tf>=1.0, so it is not exposed here.
             cell_input_fn=None,
             hparams=None):
    """Build the attention mechanism and wrap the decoder cell with it.

    Args:
        memory: memory for the attention mechanism.
        memory_sequence_length: optional lengths used with the memory.
        cell, vocab_size, output_layer, cell_dropout_mode, hparams:
            forwarded to ``RNNDecoderBase.__init__``.
        cell_input_fn: forwarded to ``AttentionWrapper``.
    """
    RNNDecoderBase.__init__(self, cell, vocab_size, output_layer,
                            cell_dropout_mode, hparams)

    attn_hparams = self._hparams['attention']
    attn_kwargs = attn_hparams['kwargs'].todict()

    # Resolve 'probability_fn' to a callable when it is given by name.
    if 'probability_fn' in attn_kwargs:
        prob_fn = attn_kwargs['probability_fn']
        needs_lookup = prob_fn is not None and not callable(prob_fn)
        if needs_lookup:
            search_modules = [
                'tensorflow.nn',
                'tensorflow.contrib.sparsemax',
                'tensorflow.contrib.seq2seq',
            ]
            prob_fn = utils.get_function(prob_fn, search_modules)
        attn_kwargs['probability_fn'] = prob_fn

    attn_kwargs["memory_sequence_length"] = memory_sequence_length
    attn_kwargs["memory"] = memory
    self._attn_kwargs = attn_kwargs

    attn_modules = ['tensorflow.contrib.seq2seq', 'texar.tf.custom']
    # Built inside the module's variable scope so that the trainable
    # variables created by the attention mechanism are collected.
    with tf.variable_scope(self.variable_scope):
        attention_mechanism = utils.check_or_get_instance(
            attn_hparams["type"],
            attn_kwargs,
            attn_modules,
            classtype=tf.contrib.seq2seq.AttentionMechanism)

    wrapper_keys = ("attention_layer_size", "alignment_history",
                    "output_attention")
    self._attn_cell_kwargs = {key: attn_hparams[key] for key in wrapper_keys}
    self._cell_input_fn = cell_input_fn

    # Same reasoning as above, for variables created by AttentionWrapper.
    with tf.variable_scope(self.variable_scope):
        self._cell = AttentionWrapper(self._cell,
                                      attention_mechanism,
                                      cell_input_fn=self._cell_input_fn,
                                      **self._attn_cell_kwargs)
def _get_beam_search_cell(self, beam_width):
    """Return the RNN cell to use for beam search decoding.

    A new attention mechanism is created over the memory (and memory
    lengths, when present) tiled ``beam_width`` times, and the inner cell of
    ``self._cell`` is wrapped with it. The result is also stored on
    ``self._beam_search_cell``.
    """
    with tf.variable_scope(self.variable_scope, reuse=True):
        # Shallow copy, so the stored kwargs keep the untiled tensors.
        tiled_kwargs = copy.copy(self._attn_kwargs)
        tiled_kwargs['memory'] = tile_batch(tiled_kwargs['memory'],
                                            multiplier=beam_width)

        lengths = tiled_kwargs['memory_sequence_length']
        if lengths is not None:
            tiled_kwargs['memory_sequence_length'] = tile_batch(
                lengths, beam_width)

        attn_modules = ['tensorflow.contrib.seq2seq', 'texar.tf.custom']
        mechanism = utils.check_or_get_instance(
            self._hparams.attention.type,
            tiled_kwargs,
            attn_modules,
            classtype=tf.contrib.seq2seq.AttentionMechanism)

        self._beam_search_cell = AttentionWrapper(
            self._cell._cell,
            mechanism,
            cell_input_fn=self._cell_input_fn,
            **self._attn_cell_kwargs)
        return self._beam_search_cell
def add_decoder_op(self, enc_final_state, enc_hidden_states,
                   output_embed_matrix, training):
    """Build the beam-search-optimization decoder and run it.

    Args:
        enc_final_state: final encoder state, repacked to match the decoder
            cell's state structure.
        enc_hidden_states: encoder outputs; the last dimension must equal
            the decoder output size (otherwise an assertion fails).
        output_embed_matrix: passed to ``BeamSearchOptimizationDecoder``.
        training: when true, the gold sequence and its length (+1) are fed
            to the decoder.

    Returns:
        The first element returned by ``dynamic_decode`` (time-major).

    Raises:
        NotImplementedError: if ``self.config.use_grammar_constraints`` is
            set.
    """
    layers = [self.make_rnn_cell(i, for_decoder=True)
              for i in range(self.config.rnn_layers)]
    cell_dec = tf.contrib.rnn.MultiRNNCell(layers)

    encoder_hidden_size = int(enc_hidden_states.get_shape()[-1])
    decoder_hidden_size = int(cell_dec.output_size)

    if encoder_hidden_size == decoder_hidden_size:
        # Flatten and repack the state into the decoder's structure.
        enc_final_state = nest.pack_sequence_as(
            cell_dec.state_size, nest.flatten(enc_final_state))
    else:
        # A projection layer for mismatched sizes follows, but the assertion
        # currently rejects this case before it is reached.
        assert False, (encoder_hidden_size, decoder_hidden_size)
        with tf.variable_scope('hidden_projection'):
            kernel = tf.get_variable(
                'kernel', (encoder_hidden_size, decoder_hidden_size),
                dtype=tf.float32)

            # A relu is applied to the projection.
            def _project(x):
                return tf.nn.relu(tf.matmul(x, kernel))

            enc_final_state = nest.map_structure(_project, enc_final_state)
            enc_hidden_states = tf.nn.relu(
                tf.tensordot(enc_hidden_states, kernel, [[2], [1]]))

    # Per the original author: attention would require tiling the final
    # encoder state / the memory, which conflicts with using cell_dec on
    # untiled inputs for the gold sequence. The branch is disabled with
    # `and False`.
    if self.config.apply_attention and False:
        attention = LuongAttention(decoder_hidden_size,
                                   enc_hidden_states,
                                   self.input_length_placeholder,
                                   probability_fn=tf.nn.softmax)
        cell_dec = AttentionWrapper(
            cell_dec,
            attention,
            cell_input_fn=lambda inputs, _: inputs,
            attention_layer_size=decoder_hidden_size,
            initial_cell_state=enc_final_state)
        enc_final_state = cell_dec.zero_state(self.batch_size,
                                              dtype=tf.float32)

    print('enc_final_state', enc_final_state)
    linear_layer = tf_core_layers.Dense(self.config.output_size)
    start_ids = tf.ones((self.batch_size,), dtype=tf.int32)
    go_vector = start_ids * self.config.grammar.start

    if training:
        gold_sequence = self.output_placeholder
        gold_sequence_length = self.output_length_placeholder + 1
    else:
        gold_sequence = None
        gold_sequence_length = None

    decoder = BeamSearchOptimizationDecoder(
        training,
        cell_dec,
        output_embed_matrix,
        go_vector,
        self.config.grammar.end,
        enc_final_state,
        beam_width=self.config.beam_size,
        output_layer=linear_layer,
        gold_sequence=gold_sequence,
        gold_sequence_length=gold_sequence_length)

    if self.config.use_grammar_constraints:
        raise NotImplementedError("Grammar constraints are not implemented for the beam search yet")

    # Per the original author: dynamic_decode fails with
    # output_time_major=False because it tries to transpose the loss vector.
    final_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder,
        output_time_major=True,
        maximum_iterations=self.config.max_length)
    return final_outputs
def attention_alignment(inputs, input_lengths, memory, memory_lengths,
                        n_layers, n_units, dropout_prob, cell_type=GRUCell,
                        attention_mechanism=BahdanauAttention,
                        is_training=True):
    """Performs alignment over inputs, attending over memory.

    Args:
        inputs (tensor): Input sequence, [batch x seq_length x dimension];
            all three dimensions are read from the static shape.
        input_lengths (tensor): Lengths of the input sequences.
        memory (tensor): Sequence to attend over.
        memory_lengths (tensor): Lengths of the memory sequences.
        n_layers (int): Number of RNN layers.
        n_units (int): Number of units per RNN cell.
        dropout_prob (float): Dropout rate; treated as 0 when not training.
        cell_type (method): RNN cell constructor, GRU by default.
        attention_mechanism (method): Attention constructor, Bahdanau by
            default.
        is_training (bool): Whether the model is training or testing.

    Returns:
        (tensor, tensor, tensor): the padded and reshaped decoder outputs,
        the final decoder state, and the stacked alignment history
        transposed with permutation [1, 0, 2].
    """
    batch_size, seq_length, dim = inputs.get_shape().as_list()

    # Attention over the memory.
    attention = attention_mechanism(n_units, memory,
                                    memory_sequence_length=memory_lengths,
                                    dtype=tf.float32)

    # Dropout is only active during training.
    keep_prob = 1 - (dropout_prob if is_training else 0)

    def _dropout_cell():
        return DropoutWrapper(cell_type(n_units), output_keep_prob=keep_prob)

    if n_layers > 1:
        attention_cell = MultiRNNCell(
            [_dropout_cell() for _ in range(n_layers)])
    else:
        attention_cell = _dropout_cell()

    # Wire the attention mechanism into the RNN; keep the alignments.
    a_cell = AttentionWrapper(attention_cell, attention,
                              alignment_history=True)

    # Initial state. Original TODO: do we ever feed an init state?
    attention_state = a_cell.zero_state(batch_size, dtype=tf.float32)

    # Read the inputs while attending over the memory.
    helper = TrainingHelper(inputs=inputs, sequence_length=input_lengths)
    decoder = BasicDecoder(a_cell, helper, attention_state)
    decoded, states, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=seq_length, impute_finished=True)

    # Pad the time axis back up to seq_length, then fix the static shape.
    time_padding = seq_length - tf.reduce_max(input_lengths)
    outputs = tf.pad(decoded.rnn_output,
                     [[0, 0], [0, time_padding], [0, 0]])
    outputs = tf.reshape(outputs, [batch_size, seq_length, dim])

    # Attention matrix, e.g. for visualizing a heatmap.
    aligned = tf.transpose(states.alignment_history.stack(), [1, 0, 2])
    return outputs, states, aligned
def build_decoder_cell(self, encoder_outputs, encoder_final_state,
                       hidden_size, cell_type, layer_size):
    """Build all layers of the decoder, wrapped with Bahdanau attention.

    In ``'decode'`` mode the encoder outputs, final state and input lengths
    are tiled ``self.beam_width`` times. The attention mechanism is stored
    on ``self.attention_mechanism``.

    :param encoder_outputs: memory for the attention mechanism
    :param encoder_final_state: used as the decoder's initial cell state
    :param hidden_size: units per layer (doubled when ``self.bidirection``)
    :param cell_type: forwarded to ``self.one_cell``
    :param layer_size: number of decoder layers
    :return: tuple ``(cell, decoder_initial_state)``
    """
    decoding = self.mode == 'decode'
    sequence_length = self.encoder_inputs_length

    if decoding:
        tile = tf.contrib.seq2seq.tile_batch
        encoder_outputs = tile(encoder_outputs, multiplier=self.beam_width)
        encoder_final_state = tile(encoder_final_state,
                                   multiplier=self.beam_width)
        sequence_length = tile(sequence_length, multiplier=self.beam_width)

    layer_units = hidden_size * 2 if self.bidirection else hidden_size
    cell = MultiRNNCell(
        [self.one_cell(layer_units, cell_type) for _ in range(layer_size)])

    # Attention mechanism.
    self.attention_mechanism = BahdanauAttention(
        num_units=self.hidden_size,
        memory=encoder_outputs,
        memory_sequence_length=sequence_length)

    def cell_input_fn(inputs, attention):
        # Project [inputs; attention] back to the cell's input size.
        mul = 2 if self.bidirection else 1
        attn_projection = layers.Dense(self.hidden_size * mul,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    cell = AttentionWrapper(cell=cell,
                            attention_mechanism=self.attention_mechanism,
                            attention_layer_size=self.hidden_size,
                            cell_input_fn=cell_input_fn,
                            name='Attention_Wrapper')

    if decoding:
        batch_size = self.batch_size * self.beam_width
    else:
        batch_size = self.batch_size
    zero_state = cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    decoder_initial_state = zero_state.clone(cell_state=encoder_final_state)
    return cell, decoder_initial_state
def Decoder_LSTM(inputs, sequence_length, attention_mechanism, is_training= False):
    '''
    Run the attention-wrapped zoneout LSTM stack through the decoder.

    Per the original author, inputs and sequence_length are ignored in
    inference.

    Returns the final outputs and final state of Decoder_Dynamic_Decode.
    '''
    lstm_Settings = hp.Decoder.LSTM
    cell_List = [
        ZoneoutLSTMCell(
            num_units= hp.Decoder.LSTM.Cell_Size,
            is_training= is_training,
            cell_zoneout_rate= hp.Decoder.LSTM.Zoneout_Rate,
            output_zoneout_rate= hp.Decoder.LSTM.Zoneout_Rate
            )
        for _ in range(lstm_Settings.Nums)
        ]

    attention_Wrapped_Cell = AttentionWrapper(
        cell= tf.nn.rnn_cell.MultiRNNCell(cell_List),
        attention_mechanism= attention_mechanism,
        attention_layer_size= None,
        alignment_history= True,
        cell_input_fn= None,
        output_attention= False,
        initial_cell_state= None,
        name= None,
        attention_layer= None
        )

    helper = Decoder_Helper(
        inputs= inputs,  # Mel
        sequence_length= sequence_length,  # Mel_length
        time_major= False,
        is_training= is_training,
        name= None
        )

    # The batch size for the zero state is taken from the inputs at run time.
    initial_State = attention_Wrapped_Cell.zero_state(tf.shape(inputs)[0], tf.float32)
    decoder = Decoder_Decoder(
        cell= attention_Wrapped_Cell,
        helper= helper,
        initial_state= initial_State
        )

    final_outputs, final_state, _ = Decoder_Dynamic_Decode(
        decoder= decoder,
        impute_finished= False
        )
    return final_outputs, final_state
def __call__(self, encoder_outputs, encoder_len, inputs, state):
    """Run one decoding step and project the RNN output.

    Args:
        encoder_outputs: memory handed to the attention mechanism.
        encoder_len: memory sequence lengths for the attention mechanism.
        inputs: ids looked up in ``self.embedding``.
        state: initial state for ``self.gru_cell``.

    Returns:
        Tuple ``(output, state)``: ``self.out_layer`` applied to the RNN
        output, and the new RNN state.
    """
    # Embed the ids and add a time axis of length 1 for dynamic_rnn.
    step_input = tf.expand_dims(
        tf.nn.embedding_lookup(self.embedding, inputs), axis=1)

    mechanism = BahdanauAttention(
        num_units=self.rnn_units,
        memory=encoder_outputs,
        memory_sequence_length=encoder_len)
    # NOTE(review): the attention-wrapped cell is constructed but never
    # used; dynamic_rnn below runs the bare GRU cell, so attention does not
    # influence the output. Confirm whether that is intended.
    attention_cell = AttentionWrapper(self.gru_cell, mechanism)

    rnn_output, new_state = tf.nn.dynamic_rnn(self.gru_cell,
                                              step_input,
                                              initial_state=state,
                                              dtype=tf.float32)
    return self.out_layer(rnn_output), new_state
def __init__(self, cell, prenets: Tuple[PreNet], attention_mechanism,
             trainable=True, name=None, **kwargs):
    """Wrap ``cell`` with pre-nets and attention, then concatenate outputs.

    Args:
        cell: RNN cell to wrap.
        prenets: pre-net layers handed to ``DecoderPreNetWrapper``.
        attention_mechanism: mechanism used by the ``AttentionWrapper``.
        trainable, name, **kwargs: forwarded to the parent constructor.
    """
    super(AttentionRNN, self).__init__(name=name, trainable=trainable,
                                       **kwargs)

    def _inputs_only(inputs, attention):
        # Disables concatenation of the inputs and the context vector.
        return inputs

    prenet_cell = DecoderPreNetWrapper(cell, prenets)
    attention_cell = AttentionWrapper(prenet_cell,
                                      attention_mechanism,
                                      cell_input_fn=_inputs_only,
                                      alignment_history=True,
                                      output_attention=False)
    self._cell = ConcatOutputAndAttentionWrapper(attention_cell)
def create_attention_cell(depth, memory, seq_len, cell,
                          alignment_history=False):
    """Wrap ``cell`` with normalized Bahdanau attention over ``memory``.

    Args:
        depth: first positional argument of ``BahdanauAttention``.
        memory: memory to attend over.
        seq_len: memory sequence lengths.
        cell: RNN cell to wrap.
        alignment_history: forwarded to ``AttentionWrapper``.

    Returns:
        The attention-wrapped cell.
    """
    mechanism = BahdanauAttention(
        depth,
        memory,
        memory_sequence_length=seq_len,
        normalize=True)
    return AttentionWrapper(cell, mechanism,
                            alignment_history=alignment_history)
def initialize(self, enc_input, sequence_length, dec_input, mel_target=None):
    """Build the encoder, attention decoder and (when training) the loss.

    Training mode is enabled when ``mel_target`` is given. Sets
    ``enc_input``, ``sequence_length``, ``dec_input``, ``mel_output``,
    ``alignment`` and ``mel_target`` on ``self``; in training mode also
    ``loss``, ``global_step`` and ``optimize``.
    """
    is_training = int(mel_target is not None)
    batch = enc_input.shape[0]

    # Encoder: embedding -> pre-net -> CBHG.
    embedded = Embedding(symbol_length, embedding_dim)(enc_input)
    enc_pre = pre_net(embedded, is_training)
    enc_out = CBHG(enc_pre, sequence_length, K=16, conv_dim=[128, 128])

    # Attention RNN over the pre-netted decoder input.
    dec_pre = pre_net(dec_input, is_training)
    mechanism = BahdanauAttention(decoder_dim, enc_out)
    attention_cell = AttentionWrapper(GRUCell(decoder_dim),
                                      mechanism,
                                      alignment_history=True,
                                      output_attention=False)
    attention_out, state = tf.nn.dynamic_rnn(ConcatWrapper(attention_cell),
                                             dec_pre,
                                             dtype=tf.float32)
    alignment = tf.transpose(state.alignment_history.stack(), [1, 2, 0])

    # Two GRU layers, each added back onto its own input.
    residual = Dense(decoder_dim)(attention_out)
    for _ in range(2):
        gru_layer = GRU(decoder_dim, return_sequences=True)
        residual = residual + gru_layer(residual)

    dec_out = Dense(mel_dim * reduction)(residual)
    mel_output = tf.reshape(dec_out, [batch, -1, mel_dim])

    self.enc_input = enc_input
    self.sequence_length = sequence_length
    self.dec_input = dec_input
    self.mel_output = mel_output
    self.alignment = alignment
    self.mel_target = mel_target

    if is_training:
        self.loss = tf.reduce_mean(MAE(self.mel_target, self.mel_output))
        self.global_step = tf.Variable(0)
        optimizer = tf.train.AdamOptimizer()
        grads_and_vars = optimizer.compute_gradients(self.loss)
        self.optimize = optimizer.apply_gradients(
            grads_and_vars, global_step=self.global_step)
def biLSTM_layer_op(self):
    """Build the attention bi-LSTM and the tag projection.

    Both directions share one Bahdanau attention mechanism whose memory is
    ``self.word_embeddings``. The per-token scores are stored in
    ``self.logits``.
    """
    hidden = self.hidden_dim

    with tf.variable_scope("bi-lstm"):
        shared_attention = BahdanauAttention(num_units=hidden,
                                             memory=self.word_embeddings)
        lstm_fw = LSTMCell(hidden)
        lstm_bw = LSTMCell(hidden)
        wrapped_fw = AttentionWrapper(cell=lstm_fw,
                                      attention_mechanism=shared_attention)
        wrapped_bw = AttentionWrapper(cell=lstm_bw,
                                      attention_mechanism=shared_attention)

        directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=wrapped_fw,
            cell_bw=wrapped_bw,
            inputs=self.word_embeddings,
            sequence_length=self.sequence_lengths,
            dtype=tf.float32)
        fw_seq, bw_seq = directional_outputs
        output = tf.concat([fw_seq, bw_seq], axis=-1)
        output = tf.nn.dropout(output, self.dropout_pl)

    with tf.variable_scope("proj"):
        W = tf.get_variable(
            name="W",
            shape=[2 * hidden, self.num_tags],
            initializer=tf.contrib.layers.xavier_initializer(),
            dtype=tf.float32)
        b = tf.get_variable(name="b",
                            shape=[self.num_tags],
                            initializer=tf.zeros_initializer(),
                            dtype=tf.float32)

        # Flatten to 2-D for the matmul, then restore the time axis from the
        # dynamic shape taken before flattening.
        dynamic_shape = tf.shape(output)
        flat = tf.reshape(output, [-1, 2 * hidden])
        scores = tf.matmul(flat, W) + b
        self.logits = tf.reshape(scores,
                                 [-1, dynamic_shape[1], self.num_tags])
def pointer_net(inputs, input_lengths, n_pointers, word_matrix, cell_type,
                n_layers, n_units, dropout_prob, is_training=True):
    """Pointer network.

    Encodes ``inputs`` with an RNN, then runs a greedy attention decoder and
    returns the recorded attention alignments.

    Args:
        inputs (tensor): Inputs to pointer network (typically output of
            previous RNN). Its static shape must have three dimensions; the
            first two are used as batch size and sequence length.
        input_lengths (tensor): Actual non-padded lengths of each input
            sequence.
        n_pointers (int): Number of pointers to generate (used as the
            maximum number of decoding iterations).
        word_matrix (tensor): Embedding matrix of word vectors.
        cell_type (method): Cell type to use.
        n_layers (int): Number of layers in RNN (same for encoder & decoder).
        n_units (int): Number of units in RNN cell (same for encoder &
            decoder).
        dropout_prob (float): Dropout probability.
        is_training (bool): Whether the model is training or testing.

    Returns:
        The stacked alignment history reshaped to
        ``[n_pointers, batch_size, seq_length]``.
    """
    batch_size, seq_length, _ = inputs.get_shape().as_list()
    vocab_size = word_matrix.get_shape().as_list()[0]

    # Dropout is only active during training.
    keep_prob = 1 - dropout_prob if is_training else 1

    def _single_cell():
        return DropoutWrapper(cell_type(n_units), output_keep_prob=keep_prob)

    def _stacked_cell():
        # A single layer is returned bare rather than wrapped in MultiRNNCell.
        if n_layers > 1:
            return MultiRNNCell([_single_cell() for _ in range(n_layers)])
        return _single_cell()

    # Encoder.
    enc_cell = _stacked_cell()
    encoded, _ = tf.nn.dynamic_rnn(
        enc_cell, inputs, input_lengths, dtype=tf.float32)

    attention = BahdanauAttention(
        n_units, encoded, memory_sequence_length=input_lengths)

    # TODO: find permanent solution (InferenceHelper?)
    start_tokens = tf.constant(START_TOKEN, shape=[batch_size], dtype=tf.int32)
    helper = GreedyEmbeddingHelper(word_matrix, start_tokens, END_TOKEN)

    # Decoder: attention-wrapped cell with an output projection to the
    # vocabulary size.
    dec_cell = _stacked_cell()
    attn_cell = AttentionWrapper(dec_cell, attention, alignment_history=True)
    out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, vocab_size)
    initial_state = attn_cell.zero_state(batch_size, tf.float32)
    decoder = BasicDecoder(out_cell, helper, initial_state)

    _, final_state, _ = dynamic_decode(
        decoder, maximum_iterations=n_pointers, impute_finished=True)

    alignments = final_state.alignment_history.stack()
    return tf.reshape(alignments, [n_pointers, batch_size, seq_length])
def decoder(x, decoder_inputs, keep_prob, sequence_length, memory,
            memory_length, first_attention):
    """Build the teacher-forced training decoder and the beam-search decoder.

    Both decoders share the label embeddings, the LSTM cell object and the
    output projection layer. Sizes and special tokens (``n_classes``,
    ``embedding_size``, ``n_hidden``, ``train_batch_size``,
    ``test_batch_size``, ``test_len``, ``beam_width``, ``GO``, ``EOS``) are
    read from the enclosing scope.

    Args:
        x: Tensor used as both components of the initial ``LSTMStateTuple``.
        decoder_inputs: Ids looked up in the label embedding table for
            teacher forcing.
        keep_prob: Dropout keep probability of the layer-norm LSTM cell.
        sequence_length: Lengths given to the ``TrainingHelper``.
        memory: Attention memory.
        memory_length: Lengths of the attention memory.
        first_attention: Initial attention vector placed into the decoder
            state.

    Returns:
        Tuple ``(decoder_outputs_train, decoder_outputs_infer,
        decoder_state_infer)``.
    """
    with tf.variable_scope("Decoder"):
        label_embeddings = tf.get_variable(
            name="embeddings",
            shape=[n_classes, embedding_size],
            dtype=tf.float32)
        train_inputs_embedded = tf.nn.embedding_lookup(
            label_embeddings, decoder_inputs)

        lstm = rnn.LayerNormBasicLSTMCell(n_hidden, dropout_keep_prob=keep_prob)
        output_l = layers_core.Dense(n_classes, use_bias=True)

        # ---- training decoder (teacher forcing) ----
        train_encoder_state = rnn.LSTMStateTuple(x, x)
        train_attention = BahdanauAttention(
            embedding_size,
            memory=memory,
            memory_sequence_length=memory_length)
        train_cell = AttentionWrapper(
            lstm, train_attention, output_attention=False)
        train_state = train_cell.zero_state(
            dtype=tf.float32, batch_size=train_batch_size)
        train_state = train_state.clone(
            cell_state=train_encoder_state, attention=first_attention)

        train_helper = TrainingHelper(train_inputs_embedded, sequence_length)
        train_decoder = BasicDecoder(
            train_cell, train_helper, train_state, output_layer=output_l)
        decoder_outputs_train, _, _ = dynamic_decode(
            train_decoder, impute_finished=True)

        # ---- beam-search decoder ----
        # Everything fed to the attention cell is tiled beam_width times.
        tiled_inputs = tile_batch(memory, multiplier=beam_width)
        tiled_sequence_length = tile_batch(memory_length, multiplier=beam_width)
        tiled_first_attention = tile_batch(first_attention, multiplier=beam_width)
        beam_attention = BahdanauAttention(
            embedding_size,
            memory=tiled_inputs,
            memory_sequence_length=tiled_sequence_length)

        tiled_x = tile_batch(x, beam_width)
        beam_encoder_state = rnn.LSTMStateTuple(tiled_x, tiled_x)
        beam_cell = AttentionWrapper(
            lstm, beam_attention, output_attention=False)
        beam_state = beam_cell.zero_state(
            dtype=tf.float32, batch_size=test_batch_size * beam_width)
        beam_state = beam_state.clone(
            cell_state=beam_encoder_state, attention=tiled_first_attention)

        infer_decoder = BeamSearchDecoder(
            beam_cell,
            embedding=label_embeddings,
            start_tokens=[GO] * test_len,
            end_token=EOS,
            initial_state=beam_state,
            beam_width=beam_width,
            output_layer=output_l)
        decoder_outputs_infer, decoder_state_infer, _ = dynamic_decode(
            infer_decoder, maximum_iterations=4)

        return decoder_outputs_train, decoder_outputs_infer, decoder_state_infer
def add_decoder_cell(self, encoder_outputs, encoder_states, hidden_size,
                     cell_type, num_layers):
    """Create the attention decoder cell and its initial state.

    In ``'decode'`` mode the encoder outputs, states and source lengths are
    tiled ``self.beam_size`` times and the initial state is sized for
    ``self.beam_size * self.batch_size`` rows.

    Args:
        encoder_outputs: Attention memory.
        encoder_states: Encoder final state, cloned into the decoder's
            initial state as its ``cell_state``.
        hidden_size: Attention layer size; doubled for the RNN cells when
            ``self.bidirection`` is truthy.
        cell_type: Forwarded to ``self.one_cell``.
        num_layers: Number of stacked decoder cells.

    Returns:
        Tuple ``(decoder_cell, decoder_initial_state)``.

    Side effects:
        Stores the attention mechanism on ``self.attention``.
    """
    decoding = self.mode == 'decode'
    encoder_seq_len = self.source_len

    if decoding:
        def _tile(value):
            return tf.contrib.seq2seq.tile_batch(
                value, multiplier=self.beam_size)

        encoder_outputs = _tile(encoder_outputs)
        encoder_states = _tile(encoder_states)
        encoder_seq_len = _tile(encoder_seq_len)

    if self.bidirection:
        cell_units = hidden_size * 2
    else:
        cell_units = hidden_size

    stacked_cells = [
        self.one_cell(cell_units, cell_type) for _ in range(num_layers)
    ]
    cell = MultiRNNCell(stacked_cells)

    self.attention = BahdanauAttention(
        self.hidden_size, encoder_outputs, encoder_seq_len)

    def cell_input_fn(inputs, attention):
        # Concatenate the step input with the previous attention and
        # project the result down to the cell width.
        projection = tf.layers.Dense(
            cell_units, dtype=tf.float32, use_bias=False, name='att_proj')
        merged = tf.concat([inputs, attention], axis=-1)
        return projection(merged)

    decoder_cell = AttentionWrapper(
        cell=cell,
        attention_mechanism=self.attention,
        attention_layer_size=hidden_size,
        cell_input_fn=cell_input_fn,
        name='attentionwrapper')

    if decoding:
        state_batch_size = self.beam_size * self.batch_size
    else:
        state_batch_size = self.batch_size

    zero_state = decoder_cell.zero_state(
        batch_size=state_batch_size, dtype=tf.float32)
    decoder_initial_state = zero_state.clone(cell_state=encoder_states)
    return decoder_cell, decoder_initial_state
def add_decoder_op(self, enc_final_state, enc_hidden_states,
                   output_embed_matrix, training):
    """Build the decoder cell (optionally with attention) and run decoding.

    Args:
        enc_final_state: Encoder final state; repacked to match the decoder
            cell's state structure.
        enc_hidden_states: Encoder outputs; the last dimension is the
            encoder hidden size and axis 2 is treated as the feature axis.
        output_embed_matrix: Forwarded to ``Seq2SeqDecoder.decode``.
        training: Forwarded to ``Seq2SeqDecoder.decode``.

    Returns:
        Whatever ``Seq2SeqDecoder.decode`` returns.

    Raises:
        AssertionError: If encoder and decoder hidden sizes differ (while
            assertions are enabled).
    """
    cell_dec = tf.contrib.rnn.MultiRNNCell([
        self.make_rnn_cell(i, True) for i in range(self.config.rnn_layers)
    ])

    encoder_hidden_size = int(enc_hidden_states.get_shape()[-1])
    decoder_hidden_size = int(cell_dec.output_size)

    # if encoder and decoder have different sizes, add a projection layer
    if encoder_hidden_size != decoder_hidden_size:
        # Mismatched sizes are currently rejected; the projection below only
        # runs when assertions are disabled (e.g. ``python -O``).
        assert False, (encoder_hidden_size, decoder_hidden_size)
        with tf.variable_scope('hidden_projection'):
            kernel = tf.get_variable(
                'kernel',
                (encoder_hidden_size, decoder_hidden_size),
                dtype=tf.float32)

            # apply a relu to the projection for good measure
            enc_final_state = nest.map_structure(
                lambda x: tf.nn.relu(tf.matmul(x, kernel)), enc_final_state)
            # Contract the encoder feature axis (axis 2) with the kernel's
            # input axis (axis 0). Contracting with axis 1 would pair the
            # encoder size with the decoder size, which differ in this
            # branch by construction.
            enc_hidden_states = tf.nn.relu(
                tf.tensordot(enc_hidden_states, kernel, [[2], [0]]))
    else:
        # flatten and repack the state
        enc_final_state = nest.pack_sequence_as(
            cell_dec.state_size, nest.flatten(enc_final_state))

    cell_dec = ParentFeedingCellWrapper(cell_dec, enc_final_state)

    if self.config.apply_attention:
        attention = LuongAttention(
            self.config.decoder_hidden_size,
            enc_hidden_states,
            self.input_length_placeholder,
            probability_fn=tf.nn.softmax)
        # The cell input ignores the previous attention vector.
        cell_dec = AttentionWrapper(
            cell_dec,
            attention,
            cell_input_fn=lambda inputs, _: inputs,
            attention_layer_size=self.config.decoder_hidden_size,
            initial_cell_state=enc_final_state)
        # The encoder state was handed to the wrapper as
        # initial_cell_state, so the wrapper's zero state is what gets
        # passed to the decoder as its initial state.
        enc_final_state = cell_dec.zero_state(self.batch_size, dtype=tf.float32)

    decoder = Seq2SeqDecoder(
        self.config,
        self.input_placeholder,
        self.input_length_placeholder,
        self.output_placeholder,
        self.output_length_placeholder,
        self.batch_number_placeholder)
    return decoder.decode(
        cell_dec,
        enc_final_state,
        self.config.grammar.output_size,
        output_embed_matrix,
        training)
def __init__(self, cell, prenets: Tuple[PreNet], attention_mechanism,
             trainable=True, name=None, dtype=None, **kwargs):
    """Compose the decoder cell: prenet -> attention -> concat.

    Args:
        cell: RNN cell wrapped by the attention wrapper.
        prenets: Pre-nets handed to ``DecoderPreNetWrapper``.
        attention_mechanism: Attention mechanism for ``AttentionWrapper``.
        trainable: Forwarded to the base class.
        name: Forwarded to the base class.
        dtype: Forwarded to the base class.
        **kwargs: Extra keyword arguments forwarded to the base class.
    """
    super(AttentionRNN, self).__init__(
        trainable=trainable, name=name, dtype=dtype, **kwargs)

    # Wrappers are applied inside-out: attention around the raw cell, the
    # pre-net wrapper around that, and the concat wrapper outermost.
    self._cell = ConcatOutputAndAttentionWrapper(
        DecoderPreNetWrapper(
            AttentionWrapper(
                cell,
                attention_mechanism,
                alignment_history=True,
                output_attention=False),
            prenets))
def __graph__(self): # encoder encoder_outputs, encoder_state = self.encoder() # decoder with tf.variable_scope('decoder'): ##作用域,'/' encoder_inputs_length = self.encoder_inputs_length if self.beam_search: # 如果使用beam_search,则需要将encoder的输出进行tile_batch,其实就是复制beam_size份。 print("use beamsearch decoding..") encoder_outputs = tile_batch(encoder_outputs, multiplier=self.beam_size) encoder_state = nest.map_structure(lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size), encoder_state) encoder_inputs_length = tile_batch(encoder_inputs_length, multiplier=self.beam_size) # 定义要使用的attention机制。 attention_mechanism = BahdanauAttention(num_units=self.rnn_size, memory=encoder_outputs, memory_sequence_length=encoder_inputs_length) # 定义decoder阶段要是用的RNNCell,然后为其封装attention wrapper decoder_cell = self.create_rnn_cell() decoder_cell = AttentionWrapper(cell=decoder_cell, attention_mechanism=attention_mechanism, attention_layer_size=self.rnn_size, name='Attention_Wrapper') # 如果使用beam_seach则batch_size = self.batch_size * self.beam_size batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size # 定义decoder阶段的初始化状态,直接使用encoder阶段的最后一个隐层状态进行赋值 decoder_initial_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32).clone(cell_state=encoder_state) output_layer = tf.layers.Dense(self.vocab_size, kernel_initializer=tf.truncated_normal_initializer( mean=0.0,9 stddev=0.1)) if self.mode == 'train':
def __init__(self, cell, prenets: Tuple[PreNet], attention_mechanism,
             trainable=True, name=None, **kwargs):
    """Wrap ``cell`` with the decoder pre-nets, then with attention.

    Args:
        cell: RNN cell wrapped by ``DecoderPreNetWrapper``.
        prenets: Pre-nets handed to ``DecoderPreNetWrapper``.
        attention_mechanism: Attention mechanism for ``AttentionWrapper``.
        trainable: Forwarded to the base class.
        name: Forwarded to the base class.
        **kwargs: Extra keyword arguments forwarded to the base class.
    """
    super(AttentionRNN, self).__init__(name=name, trainable=trainable, **kwargs)

    # The pre-net wrapper sits inside the attention wrapper here.
    prenet_cell = DecoderPreNetWrapper(cell, prenets)
    self._cell = AttentionWrapper(
        prenet_cell,
        attention_mechanism,
        alignment_history=True,
        output_attention=False)
def decoding_layer(decoding_embed_inp, embeddings, encoding_op, encoding_st,
                   v_size, fr_len, en_len, max_en_len, rnn_cell_size,
                   word2int, dropout_prob, batch_size, n_layers):
    """Build the attention decoder and return training and inference logits.

    Args:
        decoding_embed_inp: Embedded decoder inputs for the training decoder.
        embeddings: Embedding matrix for the inference decoder.
        encoding_op: Encoder outputs, used as attention memory.
        encoding_st: Encoder states; element ``0`` seeds the decoder's
            cell state.
        v_size: Vocabulary size of the output projection.
        fr_len: Lengths of the attention memory sequences.
        en_len: Target lengths passed to the training decoder.
        max_en_len: Maximum target length.
        rnn_cell_size: Size used for the RNN cell, the attention mechanism
            and the attention layer.
        word2int: Mapping providing the ``"TOKEN_GO"`` and ``"TOKEN_EOS"``
            ids.
        dropout_prob: Passed to ``get_rnn_cell`` and as ``input_keep_prob``
            of the ``DropoutWrapper``.
            NOTE(review): despite the name, this is used as a keep
            probability here; confirm against callers.
        batch_size: Batch size of the decoder's initial state.
        n_layers: Number of per-layer variable scopes to create.

    Returns:
        Tuple ``(logits_tr, logits_inf)``.
    """
    # NOTE(review): each iteration rebinds ``decoding_cell``, so only the
    # cell from the last iteration is wrapped with attention below; the
    # layers are not stacked.
    for layer_idx in range(n_layers):
        with tf.variable_scope('decs_rnn_layer_{}'.format(layer_idx)):
            gru = get_rnn_cell(rnn_cell_size, dropout_prob)
            decoding_cell = tf.contrib.rnn.DropoutWrapper(
                gru, input_keep_prob=dropout_prob)

    out_l = Dense(
        v_size,
        kernel_initializer=tf.truncated_normal_initializer(
            mean=0.0, stddev=0.1))

    attention = BahdanauAttention(
        rnn_cell_size,
        encoding_op,
        fr_len,
        normalize=False,
        name='BahdanauAttention')
    # The attention layer size is the cell size; the previously referenced
    # ``rnn_len`` is not defined in this function.
    decoding_cell = AttentionWrapper(
        decoding_cell, attention, attention_layer_size=rnn_cell_size)

    attention_zero_state = decoding_cell.zero_state(batch_size, tf.float32)
    attention_zero_state = attention_zero_state.clone(
        cell_state=encoding_st[0])

    with tf.variable_scope("decoding_layer"):
        logits_tr = training_decoding_layer(
            decoding_embed_inp,
            en_len,
            decoding_cell,
            attention_zero_state,
            out_l,
            v_size,
            max_en_len)

    # Reuse the training variables for inference.
    with tf.variable_scope("decoding_layer", reuse=True):
        logits_inf = inference_decoding_layer(
            embeddings,
            word2int["TOKEN_GO"],
            word2int["TOKEN_EOS"],
            decoding_cell,
            attention_zero_state,
            out_l,
            max_en_len,
            batch_size)

    return logits_tr, logits_inf