def apply(self, inputs):
    '''
    :param inputs: shape = [batch_size, max_num_causes], the word ids of the input causes
    sequence_length = [batch_size,]
    :return: shape = [batch_size, max_num_causes, embedding_size]
    '''
    max_num_causes = int(inputs.shape[1])
    max_word_num = int(self._cause_id_table.shape[1])
    # shape = [batch_size, max_num_causes, max_word_num]
    inputs_word_ids = gen_array_ops.gather_v2(self._cause_id_table, inputs, axis=0)
    # shape = [batch_size, max_num_causes, max_word_num, embedding_size]; flattened below
    embedded_inputs = gen_array_ops.gather_v2(self._word_embeddings, inputs_word_ids, axis=0)
    embedded_inputs = tf.reshape(embedded_inputs, [-1, max_word_num, self._embedding_size])
    # shape = [batch_size, max_num_causes]
    inputs_word_length = gen_array_ops.gather_v2(self._cause_id_table_length, inputs, axis=0)
    inputs_word_length = tf.reshape(inputs_word_length, [-1])
    lstm_zero_state = self._lstm_cell.zero_state(tf.shape(inputs_word_length)[0], tf.float32)
    outputs, state = tf.nn.dynamic_rnn(self._lstm_cell,
                                       embedded_inputs,
                                       inputs_word_length,
                                       lstm_zero_state,
                                       scope='lstm_cause_encoder')
    # state is an LSTMStateTuple (c, h); each part has
    # shape = [batch_size * max_num_causes, embedding_size]
    state = tf.reshape(state[0], [-1, max_num_causes, self._embedding_size])
    return state
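# Hedged usage sketch (not part of the original code): it only illustrates the
# shape contract of apply() above, assuming a CauseEncoder built from the same
# params dict used elsewhere in this file.
#
#   encoder = CauseEncoder(word_embeddings=embeddings_word, params=params)
#   cause_ids = tf.constant([[3, 7], [1, 0]], dtype=tf.int32)  # [batch=2, max_num_causes=2]
#   cause_vecs = encoder.apply(cause_ids)                      # -> [2, 2, embedding_size]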
def __call__(self, input):
    '''
    transform word ids into embeddings
    :param input: shape = [-1, sen_len]
    :return: shape = [-1, sen_len, embedding_dimension]
    '''
    return gen_array_ops.gather_v2(self._embedding_matrix, input, axis=0)
def _mask_outputs_by_lable(self, outputs, last_choice):
    """outputs shape = [batch_size, num_classes]"""
    vocab_size = array_ops.shape(outputs)[1]
    # classes permitted after last_choice, from the hierarchy lookup table
    next_choices = gen_array_ops.gather_v2(params=self.lookup_table,
                                           indices=last_choice,
                                           axis=0)
    # build the [batch_size, vocab_size] mask of permitted classes
    mask = math_ops.reduce_sum(array_ops.one_hot(indices=next_choices,
                                                 depth=vocab_size,
                                                 dtype=dtypes.int32),
                               axis=1)
    mask = math_ops.cast(mask, dtype=dtypes.bool)
    # forbidden entries get the smallest representable logit so they never win
    finished_probs = array_ops.fill(dims=array_ops.shape(outputs),
                                    value=outputs.dtype.min)
    return array_ops.where(mask, outputs, finished_probs)
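# Illustrative sketch of the mask construction above, with a made-up 4-class
# hierarchy (class 3 acts as the terminal): row i of lookup_table lists the
# classes allowed after class i, and one_hot + reduce_sum turns each row into
# a multi-hot vector over the vocabulary.
#
#   lookup_table = tf.constant([[1, 2], [3, 3], [3, 3], [3, 3]])
#   last_choice = tf.constant([0, 1])
#   next_choices = tf.gather(lookup_table, last_choice)   # [[1, 2], [3, 3]]
#   mask = tf.reduce_sum(tf.one_hot(next_choices, depth=4, dtype=tf.int32), axis=1)
#   # mask == [[0, 1, 1, 0], [0, 0, 0, 2]]; casting to bool marks the allowed
#   # classes, and every other logit is replaced with outputs.dtype.min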
def _match_model_fn_v6(features, labels, mode, params):
    '''
    This version uses the original seq2seq, but merges the cause and word
    embedding tables with an LSTM, and uses the input embedding as the
    attention query.
    '''
    '''set parameters'''
    with tf.device('/gpu:0'), tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope:
        # set hyper parameters
        embedding_size = params['embedding_size']
        num_units = params['num_units']
        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_keep_prob = params['dropout_keep_prob']
        else:
            dropout_keep_prob = 1
        beam_width = params['beam_width']
        EOS = params['EOS']
        SOS = params['SOS']
        # set training parameters
        max_sequence_length = params['max_sequence_length']
        max_cause_length = params['max_cause_length']
        vocab_size = params['vocab_size']
        num_causes = EOS + 1

        '''process input and target'''
        # input layer
        input = tf.reshape(features['content'], [-1, max_sequence_length])
        batch_size = tf.shape(input)[0]
        input_length = tf.reshape(features['content_length'], [batch_size])
        cause_label = tf.reshape(labels['cause_label'], [batch_size, max_cause_length])
        cause_length = tf.reshape(labels['cause_length'], [batch_size])
        # necessary casts
        input = tf.cast(input, dtype=tf.int32)
        input_length = tf.cast(input_length, dtype=tf.int32)
        cause_label = tf.cast(cause_label, dtype=tf.int32)
        cause_length = tf.cast(cause_length, dtype=tf.int32)

        # word embedding layer
        embeddings_word = load_embedding(params['word2vec_model'], vocab_size, embedding_size)
        embedded_input = gen_array_ops.gather_v2(embeddings_word, input, axis=0)
        # cause-label embedding layer
        cause_encoder = CauseEncoder(word_embeddings=embeddings_word, params=params)
        embedded_cause = cause_encoder.apply(cause_label)
        # cause lookup table
        cause_table = tf.constant(params['cause_table'], dtype=tf.int32)

        encoder_output = encoders(embedded_input, input_length, params, mode)

        '''hierarchical multi-label decoder'''
        # build an lstm cell with attention
        lstm = rnn.LayerNormBasicLSTMCell(num_units=num_units,
                                          reuse=tf.AUTO_REUSE,
                                          dropout_keep_prob=dropout_keep_prob)
        attention_mechanism = MyBahdanauAttention(
            num_units=embedding_size,
            memory=encoder_output.attention_values,
            memory_sequence_length=encoder_output.attention_values_length)
        initial_state = rnn.LSTMStateTuple(encoder_output.initial_state,
                                           encoder_output.initial_state)
        cell = MyAttentionWrapper_v2(lstm,
                                     attention_mechanism,
                                     sot=SOS,
                                     output_attention=False,
                                     name="MyAttentionWrapper")
        cell_state = cell.zero_state(dtype=tf.float32, batch_size=batch_size)
        cell_state = cell_state.clone(cell_state=initial_state,
                                      attention=encoder_output.final_state)
        # extra dense layer projecting an rnn output into class logits
        project_dense = Dense(num_causes,
                              _reuse=tf.AUTO_REUSE,
                              _scope='project_dense_scope',
                              name='project_dense')
        # train decoder
        train_helper = MyTrainingHelper(embedded_cause, cause_label, cause_length)
        train_decoder = MyBasicDecoder(cell,
                                       train_helper,
                                       cell_state,
                                       lookup_table=cause_table,
                                       output_layer=project_dense,
                                       hie=params['hie'])
        decoder_output_train, decoder_state_train, decoder_len_train = dynamic_decode(
            train_decoder,
            maximum_iterations=max_cause_length - 1,
            parallel_iterations=64,
            scope='decoder')

        # tile the encoder outputs across the beam for inference
        tiled_memory_sequence_length = tile_batch(
            encoder_output.attention_values_length, multiplier=beam_width)
        tiled_memory = tile_batch(encoder_output.attention_values,
                                  multiplier=beam_width)
        tiled_encoder_output_initial_state = tile_batch(
            encoder_output.initial_state, multiplier=beam_width)
        tiled_initial_state = rnn.LSTMStateTuple(tiled_encoder_output_initial_state,
                                                 tiled_encoder_output_initial_state)
        tiled_first_attention = tile_batch(encoder_output.final_state,
                                           multiplier=beam_width)
        attention_mechanism = MyBahdanauAttention(
            num_units=embedding_size,
            memory=tiled_memory,
            memory_sequence_length=tiled_memory_sequence_length)
        cell = MyAttentionWrapper_v2(lstm,
                                     attention_mechanism,
                                     sot=SOS,
                                     output_attention=False,
                                     name="MyAttentionWrapper")
        cell_state = cell.zero_state(dtype=tf.float32,
                                     batch_size=batch_size * beam_width)
        cell_state = cell_state.clone(cell_state=tiled_initial_state,
                                      attention=tiled_first_attention)
        infer_decoder = MyBeamSearchDecoder(cell,
                                            embedding=cause_encoder,
                                            sots=tf.fill([batch_size], SOS),
                                            start_tokens=tf.fill([batch_size], SOS),
                                            end_token=EOS,
                                            initial_state=cell_state,
                                            beam_width=beam_width,
                                            output_layer=project_dense,
                                            lookup_table=cause_table,
                                            length_penalty_weight=0.7,
                                            hie=params['hie'])
        cause_output_infer, cause_state_infer, cause_length_infer = dynamic_decode(
            infer_decoder,
            parallel_iterations=64,
            maximum_iterations=max_cause_length - 1,
            scope='decoder')

        # loss: mask out the padded steps of each target sequence
        mask_for_cause = tf.sequence_mask(cause_length - 1,
                                          max_cause_length - 1,
                                          dtype=tf.float32)
        # pad the train logits out to max_cause_length - 1 before the loss
        tmp_padding = tf.pad(
            decoder_output_train.rnn_output,
            [[0, 0],
             [0, max_cause_length - 1 - tf.shape(decoder_output_train.rnn_output)[1]],
             [0, 0]],
            constant_values=0)
        loss = _compute_loss(tmp_padding, cause_label, mask_for_cause, batch_size)

        # predicted_ids: [batch_size, max_cause_length, beam_width]
        predicted_and_cause_ids = tf.transpose(cause_output_infer.predicted_ids,
                                               perm=[0, 2, 1],
                                               name='predicted_cause_ids')
        # append the gold labels for monitoring
        cause_label_expanded = tf.reshape(cause_label[:, 1:],
                                          [-1, 1, max_cause_length - 1])
        predicted_and_cause_ids = tf.pad(
            predicted_and_cause_ids,
            [[0, 0], [0, 0],
             [0, max_cause_length - 1 - tf.shape(predicted_and_cause_ids)[2]]],
            constant_values=EOS)
        predicted_and_cause_ids = tf.concat(
            [predicted_and_cause_ids, cause_label_expanded],
            axis=1,
            name='predicted_and_cause_ids')
        predicted_and_cause_ids = tf.reshape(
            predicted_and_cause_ids, [-1, beam_width + 1, max_cause_length - 1])
        predicted_and_cause_ids_train = tf.concat(
            [decoder_output_train.sample_id, cause_label[:, 1:]],
            axis=1,
            name='predicted_and_cause_ids_train')

        predictions = {
            'predicted_and_cause_ids': predicted_and_cause_ids,
        }
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # alternative: transformer-style warm-up learning-rate schedule
            # warm_up_constant = params['warm_up_steps'] ** (-1.5)
            # embedding_constant = embedding_size ** (-0.5)
            # global_step = tf.to_float(tf.train.get_global_step())
            # learning_rate = tf.minimum(1 / tf.sqrt(global_step),
            #                            warm_up_constant * global_step) * embedding_constant
            # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
            #                                    beta1=0.9, beta2=0.98, epsilon=1e-9)
            optimizer = tf.train.AdamOptimizer()
            # gradient clipping: clip each gradient's values to [-0.1, 0.1]
            grads_and_vars = optimizer.compute_gradients(loss)
            clipped_gvs = [
                ele if ele[0] is None else
                (tf.clip_by_value(ele[0], -0.1, 0.1), ele[1])
                for ele in grads_and_vars
            ]
            train_op = optimizer.apply_gradients(
                clipped_gvs, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        # predicted_cause_ids shape = [batch_size, cause_length]
        # cause_label shape = [batch_size, cause_length]
        # select the predicted cause with the highest probability
        # todo: evaluation
        # bi_predicted_cause_ids = binarizer(predicted_cause_ids[:, 0, :], num_causes)
        # bi_cause_label = binarizer(cause_label, num_causes)
        # todo: for now the evaluation has to be done outside the estimator
        eval_metric_ops = {
            'predicted_and_cause_ids':
                tf.contrib.metrics.streaming_concat(predicted_and_cause_ids),
            # 'precision': tf.metrics.precision(bi_cause_label, bi_predicted_cause_ids),
            # 'recall': tf.metrics.recall(bi_cause_label, bi_predicted_cause_ids),
            # 'f1-score': f_score(bi_cause_label, bi_predicted_cause_ids),
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)
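# Hedged sketch of wiring this model_fn into an Estimator (hypothetical
# model_dir, param values, and train_input_fn; the keys mirror the ones read
# at the top of _match_model_fn_v6):
#
#   estimator = tf.estimator.Estimator(
#       model_fn=_match_model_fn_v6,
#       model_dir='/tmp/match_model_v6',
#       params={'embedding_size': 300, 'num_units': 256, 'beam_width': 5,
#               'dropout_keep_prob': 0.8, ...})
#   estimator.train(input_fn=train_input_fn)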
def apply(self, inputs):
    # direct lookup: each cause id maps to its embedding vector
    embedding_state = gen_array_ops.gather_v2(self._cause_embeddings, inputs, axis=0)
    return embedding_state
def apply(self, inputs):
    # concatenate the LSTM-encoded cause state with the direct cause embedding
    lstm_state = super(CauseEncoder_v2, self).apply(inputs)
    embedding_state = gen_array_ops.gather_v2(self._cause_embeddings, inputs, axis=0)
    state = tf.concat([lstm_state, embedding_state], axis=2)
    return state
def step(self, time, inputs, state, name=None):
    """Perform a decoding step.

    Args:
      time: scalar `int32` tensor.
      inputs: A (structure of) input tensors.
      state: A (structure of) state tensors and TensorArrays.
      name: Name scope for any created operations.

    Returns:
      `(outputs, next_state, next_inputs, finished)`.
    """
    batch_size = self._batch_size
    beam_width = self._beam_width
    end_token = self._end_token
    length_penalty_weight = self._length_penalty_weight

    with ops.name_scope(name, "BeamSearchDecoderStep", (time, inputs, state)):
        cell_state = state.cell_state
        inputs = nest.map_structure(
            lambda inp: self._merge_batch_beams(inp, s=inp.shape[2:]), inputs)
        cell_state = nest.map_structure(self._maybe_merge_batch_beams,
                                        cell_state, self._cell.state_size)
        cell_outputs, next_cell_state = self._cell(inputs, cell_state)

        # cell_state.last_choice shape = [batch_size * beam_width]
        next_choices = gen_array_ops.gather_v2(self.lookup_table,
                                               cell_state.last_choice, axis=0)
        not_finished = tf.not_equal(next_choices[:, 0], end_token)
        # two-step lookahead: a beam "will finish" when its next choice is not
        # EOS but the choice after that can only be EOS
        next_next_choices = gen_array_ops.gather_v2(self.lookup_table,
                                                    next_choices[:, 0], axis=0)
        will_finish = tf.logical_and(
            not_finished, tf.equal(next_next_choices[:, 0], end_token))

        def move(will_finish, last_choice, cell_outputs):
            # for beams about to finish, fold the attention score for the
            # last choice into the logits
            attention_score = self._step_method(last_choice)
            attention_score = attention_score + cell_outputs
            return tf.where(will_finish, attention_score, cell_outputs)

        if self._output_layer is not None:
            cell_outputs = self._output_layer(cell_outputs)
        cell_outputs = tf.cond(
            tf.reduce_any(will_finish),
            false_fn=lambda: cell_outputs,
            true_fn=lambda: move(will_finish, cell_state.last_choice, cell_outputs))
        if self.hie:
            # cell_state.last_choice shape = [batch_size * beam_width,]
            cell_outputs = self._mask_outputs_by_lable(cell_outputs,
                                                       cell_state.last_choice)

        cell_outputs = nest.map_structure(
            lambda out: self._split_batch_beams(out, out.shape[1:]), cell_outputs)
        next_cell_state = nest.map_structure(self._maybe_split_batch_beams,
                                             next_cell_state, self._cell.state_size)

        beam_search_output, beam_search_state = _beam_search_step(
            time=time,
            logits=cell_outputs,
            next_cell_state=next_cell_state,
            beam_state=state,
            batch_size=batch_size,
            beam_width=beam_width,
            end_token=end_token,
            length_penalty_weight=length_penalty_weight)

        finished = beam_search_state.finished
        # replace the father ids with the ids sampled at this step
        sample_ids = beam_search_output.predicted_ids
        next_cell_state = beam_search_state.cell_state
        next_cell_state = next_cell_state._replace(last_choice=sample_ids)
        beam_search_state = beam_search_state._replace(cell_state=next_cell_state)

        # sample_ids shape = [batch_size, beam_width]
        next_inputs = control_flow_ops.cond(
            math_ops.reduce_all(finished), lambda: self._start_inputs,
            lambda: self._embedding_fn(sample_ids))

        return (beam_search_output, beam_search_state, next_inputs, finished)
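# Reference note on the merge/split dance in step() above (B = batch_size,
# W = beam_width, D = feature depth); conceptually it is two reshapes around
# the cell call:
#
#   merged = tf.reshape(inputs, [B * W, D])    # run the cell once per beam
#   outputs, merged_state = cell(merged, merged_state)
#   split = tf.reshape(outputs, [B, W, D])     # restore the per-beam layout
#                                              # expected by _beam_search_step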