def build_lstm_cell(inps, num_layers, num_units):
    """Run a stacked LSTM over time-major inputs and return its outputs.

    Args:
        inps: Input tensor handed to ``tf.nn.dynamic_rnn`` with
            ``time_major=True``; its second dimension is used as the
            batch size of the zero initial state.
        num_layers: Number of LSTM layers to stack.
        num_units: Number of units in every layer.

    Returns:
        The output tensor produced by ``tf.nn.dynamic_rnn`` (the final
        state is discarded).
    """
    layer_cells = []
    for _ in range(num_layers):
        layer_cells.append(LSTMCell(num_units, dtype=tf.float32))
    stacked_cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    batch_size = tf.shape(inps)[1]
    initial_state = stacked_cell.zero_state(batch_size, tf.float32)

    outputs, _final_state = tf.nn.dynamic_rnn(
        stacked_cell,
        inps,
        initial_state=initial_state,
        parallel_iterations=1024,
        time_major=True)
    return outputs
def cal_loss_logit(embedded, keep_prob, reuse=True, scope="loss"):
    """Compute attention-BiLSTM logits and the mean cross-entropy loss.

    ``self``, ``W``, ``W_fc`` and ``b_fc`` are not parameters; they are
    resolved from the enclosing scope, exactly as before.

    Args:
        embedded: Embedded input sequence fed to the bidirectional RNN.
        keep_prob: Keep probability for the dropout applied to the
            attended sentence representation.
        reuse: Accepted for interface compatibility; the variable scope is
            always opened with ``tf.AUTO_REUSE`` and this value is ignored.
        scope: Name of the variable scope to build in.

    Returns:
        A ``(y_hat, loss)`` pair: the logits from the fully connected layer
        and the mean sparse softmax cross-entropy against ``self.label``.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        rnn_outputs, _ = bi_rnn(LSTMCell(self.hidden_size),
                                LSTMCell(self.hidden_size),
                                inputs=embedded,
                                dtype=tf.float32)

        # Attention
        H = tf.add(rnn_outputs[0], rnn_outputs[1])  # fw + bw
        M = tf.tanh(H)

        # One unnormalised score per time step: (batch * max_len, 1).
        scores = tf.matmul(tf.reshape(M, [-1, self.hidden_size]),
                           tf.reshape(W, [-1, 1]))
        # Normalise across the time steps of each example.  Applying softmax
        # directly to the (batch * max_len, 1) tensor would normalise over an
        # axis of size one and make every attention weight exactly 1.0.
        alpha = tf.nn.softmax(tf.reshape(scores, [-1, self.max_len]))

        r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                      tf.reshape(alpha, [-1, self.max_len, 1]))
        # Drop only the trailing singleton axis so a batch of one example
        # keeps its batch dimension.
        r = tf.squeeze(r, axis=2)
        h_star = tf.tanh(r)
        drop = tf.nn.dropout(h_star, keep_prob)

        # Fully connected layer (dense layer)
        y_hat = tf.nn.xw_plus_b(drop, W_fc, b_fc, name='logits')
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=y_hat, labels=self.label))
        return y_hat, loss
def add_cell_lstm(inps, state, num_units, input_dim, init_parameter):
    """Run a multi-layer LSTM (concatenated state) over time-major inputs.

    Args:
        inps: Input tensor passed to ``tf.nn.dynamic_rnn`` with
            ``time_major=True``; its second dimension is the batch size
            used when a zero state has to be created.
        state: Candidate initial state, forwarded to ``prepare_init_state``.
        num_units: Iterable with the number of units of each layer.
        input_dim: Forwarded to ``compute_stddevs``.
        init_parameter: Forwarded to ``compute_stddevs``.

    Returns:
        ``(output, state)`` as returned by ``tf.nn.dynamic_rnn``.
    """
    stddevs = compute_stddevs(num_units, input_dim, init_parameter)

    layer_cells = []
    for layer_units, layer_stddev in zip(num_units, stddevs):
        layer_cells.append(
            LSTMCell(layer_units,
                     dtype=tf.float32,
                     state_is_tuple=False,
                     initializer=tf.truncated_normal_initializer(
                         stddev=layer_stddev)))
    stacked_cell = tf.contrib.rnn.MultiRNNCell(layer_cells,
                                               state_is_tuple=False)

    state = prepare_init_state(state, inps, stacked_cell, 'cell')
    if state is None:
        # No usable state was supplied: start from zeros.
        state = stacked_cell.zero_state(tf.shape(inps)[1], tf.float32)

    output, state = tf.nn.dynamic_rnn(stacked_cell,
                                      inps,
                                      initial_state=state,
                                      parallel_iterations=1024,
                                      time_major=True)
    return output, state
def _create_model(self, mode, input_ids, input_mask, segment_ids, labels,
                  slot_labels, labels_mask, drop_keep_prob, entity_type_ids,
                  sequence_lengths):
    """Creates a LaserTagger model.

    Builds BERT, adds a weighted entity-type embedding to its sequence
    output, runs a bidirectional LSTM on top and projects to intent logits
    (from the final cell states) and slot logits (from the per-token
    outputs).

    Returns:
        A tuple ``(loss, per_example_slot_loss, pred_intent, pred_slot,
        batch_size, entity_type_emb, impact_weight_matrix,
        entity_type_ids_matrix, final_layer, slot_logits)``.  ``loss`` and
        ``per_example_slot_loss`` are ``None`` in PREDICT mode.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm scope boundaries against the original file.
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = modeling.BertModel(
        config=self._config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=self._use_one_hot_embeddings)

    final_layer = model.get_sequence_output()
    if is_training:
        final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

    # Combine with the entity information.
    batch_size, seq_length = modeling.get_shape_list(input_ids)
    self.entity_type_embedding = tf.get_variable(
        name="entity_type_embedding",
        shape=(self.entity_type_num, self._config.hidden_size),
        dtype=tf.float32,
        trainable=True,
        initializer=tf.random_uniform_initializer(
            -self._config.initializer_range * 100,
            self._config.initializer_range * 100,
            seed=20))

    with tf.init_scope():
        impact_weight_init = tf.constant(1.0 / self.entity_type_num,
                                         dtype=tf.float32,
                                         shape=(1, self.entity_type_num))
    # Impact weight of each entity type.
    self.impact_weight = tf.Variable(impact_weight_init,
                                     dtype=tf.float32,
                                     name="impact_weight")

    impact_weight_matrix = tf.tile(self.impact_weight,
                                   multiples=[batch_size * seq_length, 1])
    entity_type_ids_matrix1 = tf.cast(
        tf.reshape(entity_type_ids,
                   [batch_size * seq_length, self.entity_type_num]),
        dtype=tf.float32)
    entity_type_ids_matrix = tf.multiply(entity_type_ids_matrix1,
                                         impact_weight_matrix)
    entity_type_emb = tf.matmul(entity_type_ids_matrix,
                                self.entity_type_embedding)

    # TODO: concatenating the entity embedding instead of adding it was
    # tried in an earlier revision (the old note mentions the constant
    # 0.7071067811865476, i.e. sqrt(2) / 2).
    final_layer = final_layer + tf.reshape(
        entity_type_emb, [batch_size, seq_length, self._config.hidden_size])

    if is_training:
        final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

    rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=LSTMCell(self.lstm_hidden_size),
        cell_bw=LSTMCell(self.lstm_hidden_size),
        inputs=final_layer,
        sequence_length=sequence_lengths,
        dtype=tf.float32)
    output_fw_seq, output_bw_seq = rnn_outputs
    (c_fw, _h_fw), (c_bw, _h_bw) = rnn_states

    layer_matrix = tf.concat([output_fw_seq, output_bw_seq], axis=-1)
    # The intent head reads the concatenated final cell (c) states.
    final_hidden = tf.concat([c_fw, c_bw], axis=-1)
    layer_matrix = tf.contrib.layers.layer_norm(inputs=layer_matrix,
                                                begin_norm_axis=-1,
                                                begin_params_axis=-1)

    intent_logits = tf.layers.dense(
        final_hidden,
        self._num_tags,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
        name="output_projection")
    slot_logits = tf.layers.dense(
        layer_matrix,
        self.num_slot_tags,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
        name="slot_projection")

    with tf.variable_scope("loss"):
        loss = None
        per_example_intent_loss = None
        per_example_slot_loss = None
        if mode != tf.estimator.ModeKeys.PREDICT:
            per_example_intent_loss = (
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=intent_logits))
            slot_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=slot_labels, logits=slot_logits)
            # Token losses are summed per example and divided by the number
            # of positions flagged in labels_mask.
            per_example_slot_loss = tf.truediv(
                tf.reduce_sum(slot_loss, axis=1),
                tf.cast(tf.reduce_sum(labels_mask, axis=1), tf.float32))
            loss = tf.reduce_mean(
                self.intent_ratio * per_example_intent_loss +
                self.slot_ratio * per_example_slot_loss)

        pred_intent = tf.cast(tf.argmax(intent_logits, axis=-1), tf.int32)
        pred_slot = tf.cast(tf.argmax(slot_logits, axis=-1), tf.int32)
        return (loss, per_example_slot_loss, pred_intent, pred_slot,
                batch_size, entity_type_emb, impact_weight_matrix,
                entity_type_ids_matrix, final_layer, slot_logits)
with tf.name_scope('Embedding Layer'): embedding = tf.Variable(tf.random_uniform([vocabulary_size, embed_size], -1, 1)) embeded = tf.nn.embedding_lookup(embedding, batch_ph) tf.summary.histogram('embedding', embedding) ''' ##RNN layers lstm_size = 32 lstm_layers = 3 output = batch_ph for i in range(lstm_layers): with tf.variable_scope('BiLSTM_Layer_{}'.format(i)): lstm_fw = LSTMCell( lstm_size ) #, initializer=tf.truncated_normal_initializer(-0.1, 0.1, seed=2) lstm_bw = LSTMCell(lstm_size) cell_fw = tf.contrib.rnn.DropoutWrapper( lstm_fw, output_keep_prob=keep_prob_ph_rnn) cell_bw = tf.contrib.rnn.DropoutWrapper( lstm_bw, output_keep_prob=keep_prob_ph_rnn) (output_fw, output_bw), final_state = BiRNN(cell_fw, cell_bw, output, dtype=tf.float32) output = tf.concat((output_fw, output_bw), 2) ##Attention + Dropout with tf.variable_scope('BiLSTM_Layer_{}'.format(lstm_layers)):
def build_graph(self):
    """
    Build the main architecture of the graph.

    Embeds the input (ELMo and/or GloVe with optional POS-tag embeddings),
    applies up to two recurrent/convolutional layers and pools the sequence
    with either learned attention or a plain average.

    Sets ``self.x_elmo``, ``self.shape``, ``self.alpha``,
    ``self.best_example`` and ``self.h_star``; depending on the flags also
    ``self.pos_embedding``, ``self.pos_embedded``, ``self.dense`` and
    ``self.pool``.

    Raises:
        ValueError: if neither ``self.glove_include`` nor ``self.elmo`` is
            set, since there is then no input representation.
    """
    random.seed(310)
    tf.set_random_seed(902)
    print("building graph")
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm scope boundaries against the original file.
    with tf.variable_scope('model', reuse=self.reuse):
        ### Lookup ELMo Embedding ###
        self.x_elmo = layers.Lambda(
            lambda inputs: ElmoEmbedding(inputs, elmo_model),
            output_shape=(1024, ))(self.x_elmo_input)
        shape = tf.shape(self.x_elmo)
        self.shape = shape

        # `hid` is the feature width of the current layer.  It is kept as a
        # plain Python int throughout so the reshape in front of the
        # attention dense layer has a statically known last dimension.
        if self.glove_include:
            ### Lookup Glove Vectors ###
            batch_embedded = tf.nn.embedding_lookup(self.glove, self.x)
            # Keep the trailing positions so the width matches ELMo.
            batch_embedded = batch_embedded[:, -shape[1]:, :]

            ### Include POS ###
            if self.pos_include:
                ### POS-TAG Embedding ###
                embeddings_var = tf.Variable(tf.random_uniform(
                    [12, self.pos_dimensions], -1.0, 1.0),
                                             trainable=True)
                self.pos_embedding = tf.nn.embedding_lookup(
                    embeddings_var, self.pos)
                self.pos_embedded = self.pos_embedding[:, -shape[1]:, :]
                batch_embedded = tf.concat(
                    [batch_embedded, self.pos_embedded], axis=2)

            if self.layer_1_include:
                hid = 2 * self.hidden_size
                if self.layer_1 == 'lstm':
                    rnn_outputs, _ = bi_rnn(
                        LSTMCell(self.hidden_size,
                                 use_peepholes=self.peephole_1),
                        LSTMCell(self.hidden_size,
                                 use_peepholes=self.peephole_2),
                        inputs=batch_embedded,
                        dtype=tf.float32,
                        scope='rnn_1')
                    fw_outputs, bw_outputs = rnn_outputs
                    layer = tf.concat([fw_outputs, bw_outputs], axis=2)
                elif self.layer_1 == 'gru':
                    rnn_outputs, _ = bi_rnn(GRUCell(self.hidden_size),
                                            GRUCell(self.hidden_size),
                                            inputs=batch_embedded,
                                            dtype=tf.float32,
                                            scope='rnn_1')
                    fw_outputs, bw_outputs = rnn_outputs
                    layer = tf.concat([fw_outputs, bw_outputs], axis=2)
                else:
                    layer = tf.layers.conv1d(inputs=batch_embedded,
                                             filters=self.hidden_size * 2,
                                             kernel_size=self.kernel_size,
                                             strides=1,
                                             padding="same",
                                             activation=tf.nn.relu)
            else:
                layer = batch_embedded
                hid = self.hidden_size
                if self.pos_include:
                    hid += self.pos_dimensions
            print(self.hidden_size)

        # FLAGS Including ELMO and Glove
        if self.glove_include and self.elmo:
            H_1 = tf.concat([layer, self.x_elmo], axis=2)
            hid += 1024
        elif self.glove_include:
            H_1 = layer
        elif self.elmo:
            H_1 = self.x_elmo
            hid = 1024
        else:
            raise ValueError(
                "build_graph needs glove_include and/or elmo to be enabled")

        if self.layer_2 == 'lstm':
            rnn_outputs_2, _ = bi_rnn(
                LSTMCell(hid, use_peepholes=self.peephole_3),
                LSTMCell(hid, use_peepholes=self.peephole_4),
                inputs=H_1,
                dtype=tf.float32,
                scope='rnn_2')
            fw_outputs_2, bw_outputs_2 = rnn_outputs_2
            H = tf.concat([fw_outputs_2, bw_outputs_2], axis=2)
            hid *= 2  # forward and backward outputs are concatenated
        elif self.layer_2 == 'gru':
            rnn_outputs_2, _ = bi_rnn(GRUCell(hid),
                                      GRUCell(hid),
                                      inputs=H_1,
                                      dtype=tf.float32,
                                      scope='rnn_2')
            fw_outputs_2, bw_outputs_2 = rnn_outputs_2
            H = tf.concat([fw_outputs_2, bw_outputs_2], axis=2)
            hid *= 2  # forward and backward outputs are concatenated
        elif self.layer_2 == 'conv':
            # `filters=hid` keeps the width, so `hid` is left unchanged.
            H = tf.layers.conv1d(inputs=H_1,
                                 filters=hid,
                                 kernel_size=self.kernel_size,
                                 strides=1,
                                 padding="same",
                                 activation=tf.nn.relu)
        else:
            H = H_1

        ### Ask whether there is a sequence with length 0 ###
        condition = tf.equal(tf.reduce_min(self.seq_len), 0)

        def mask_padding(alpha):
            """Zero the weights outside the trailing `seq_len` positions.

            The mask is skipped for the whole batch when any sequence has
            length 0 (`condition`).
            """
            flipped = tf.reverse(alpha, axis=[1])
            mask = tf.sequence_mask(self.seq_len)
            mask = tf.to_float(mask)
            flipped = tf.cond(condition, lambda: flipped,
                              lambda: flipped * mask)
            return tf.reverse(flipped, axis=[1])

        def pool_with_alpha():
            """Record the top-weighted token and return the alpha-weighted sum of H."""
            pos = tf.argmax(self.alpha, axis=1)
            sparse_tensor = tf.string_split(self.x_elmo_input)
            dense_tensor = tf.sparse_tensor_to_dense(sparse_tensor, '')
            rg = tf.range(0, shape[0])
            indices = tf.transpose([rg, tf.cast(pos, tf.int32)], [1, 0])
            self.best_example = tf.gather_nd(dense_tensor, indices)
            weighted = tf.matmul(tf.transpose(H, [0, 2, 1]),
                                 tf.reshape(self.alpha, [-1, shape[1], 1]))
            return tf.squeeze(weighted, axis=2)

        ### FLAG Including attention ###
        if self.attention:
            with tf.variable_scope('attention', reuse=self.reuse):
                M = tf.tanh(H)
                dropout_layer_attention = tf.layers.dropout(
                    inputs=tf.reshape(M, [-1, hid]),
                    rate=self.attention_prob,
                    training=self.is_training,
                    seed=847)
                self.dense = tf.layers.dense(inputs=dropout_layer_attention,
                                             units=self.num_attention,
                                             use_bias=False)

                ### Pool - Max or Mean ###
                if self.pool_mean:
                    self.pool = tf.reduce_mean(self.dense, axis=1)
                else:
                    self.pool = tf.reduce_max(self.dense, axis=1)

                self.alpha = tf.exp(tf.reshape(self.pool, [-1, shape[1]]))

                ### Masking the sequences ###
                if self.mask:
                    with tf.variable_scope('mask', reuse=self.reuse):
                        self.alpha = mask_padding(self.alpha)

                #### Softmax ####
                self.alpha = self.alpha / tf.expand_dims(
                    tf.reduce_sum(self.alpha, axis=1), 1)

                ### Highest-attention word and weighted average ###
                r = pool_with_alpha()
        else:
            with tf.variable_scope('rnn_average', reuse=self.reuse):
                ### Take a simple mean of all the words ###
                if self.mask:
                    with tf.variable_scope('mask', reuse=self.reuse):
                        # Divide by the true length, or by the padded
                        # length when any sequence is empty.
                        self.alpha = tf.cond(
                            condition,
                            lambda: tf.tile(tf.expand_dims(shape[1], 0),
                                            tf.expand_dims(shape[0], 0)),
                            lambda: self.seq_len)
                        self.alpha = tf.reciprocal(tf.to_float(self.alpha))
                        self.alpha = tf.tile(tf.expand_dims(self.alpha, 1),
                                             [1, shape[1]])
                        self.alpha = mask_padding(self.alpha)
                else:
                    # Uniform weights over every position, padding included.
                    self.alpha = tf.tile(tf.expand_dims(shape[1], 0),
                                         tf.expand_dims(shape[0], 0))
                    self.alpha = tf.reciprocal(tf.to_float(self.alpha))
                    self.alpha = tf.tile(tf.expand_dims(self.alpha, 1),
                                         [1, shape[1]])

                # best_example is computed for interface parity only; with
                # averaging weights it carries no attention meaning.
                r = pool_with_alpha()

        self.h_star = tf.tanh(r)
def _build_forward(self):
    """Build the forward graph: embeddings, contextual encoders, attention
    and the two answer-pointer logit heads.

    Stores intermediate tensors in ``self.tensor_dict`` (``x``, ``q``,
    ``xx``, ``qq``, ``u``, ``h``, ``g1``, ``g2``) and sets ``self.logits``,
    ``self.logits2``, ``self.yp``, ``self.yp2`` and ``self.wyp``.  When
    ``config.na`` is set it also sets ``self.concat_logits``,
    ``self.concat_logits2`` and ``self.na_prob``.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm scope boundaries against the original file.
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = (
        config.batch_size, config.max_num_sents, config.max_sent_size,
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size,
        config.hidden_size, config.max_word_size)
    # The sentence/question extents are taken from the actual inputs.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = (config.char_emb_size, config.word_emb_size,
                   config.char_out_size)

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = [
                    int(size) for size in config.out_channel_dims.split(',')
                ]
                heights = [
                    int(height) for height in config.filter_heights.split(',')
                ]
                assert sum(filter_sizes) == dco, (filter_sizes, dco)

                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # The question reuses the context CNN variables.
                        tf.get_variable_scope().reuse_variables()
                        question_conv_scope = "xx"
                    else:
                        question_conv_scope = "qq"
                    qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob,
                                      scope=question_conv_scope)
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                word_emb_kwargs = {"dtype": 'float', "shape": [VW, dw]}
                if config.mode == 'train':
                    word_emb_kwargs["initializer"] = (
                        tf.random_normal_initializer)
                word_emb_mat = tf.get_variable("word_emb_mat",
                                               **word_emb_kwargs)
                if config.use_glove_for_unk:
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq

            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])
                qq = tf.concat(axis=2, values=[qq, Aq])
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    def make_cell_pair():
        """Return (fw, bw, dropout-wrapped fw, dropout-wrapped bw) cells."""
        fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        wrapped_fw = SwitchableDropoutWrapper(
            fw, self.is_train, input_keep_prob=config.input_keep_prob)
        wrapped_bw = SwitchableDropoutWrapper(
            bw, self.is_train, input_keep_prob=config.input_keep_prob)
        return fw, bw, wrapped_fw, wrapped_bw

    cell_fw, cell_bw, d_cell_fw, d_cell_bw = make_cell_pair()
    cell2_fw, cell2_bw, d_cell2_fw, d_cell2_bw = make_cell_pair()
    cell3_fw, cell3_bw, d_cell3_fw, d_cell3_bw = make_cell_pair()
    cell4_fw, cell4_bw, d_cell4_fw, d_cell4_bw = make_cell_pair()

    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # The context encoder reuses the question encoder variables.
            tf.get_variable_scope().reuse_variables()
            context_scope = 'u1'
        else:
            context_scope = 'h1'
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
            cell_fw, cell_bw, xx, x_len, dtype='float', scope=context_scope)
        h = tf.concat(axis=3, values=[fw_h, bw_h])
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])

            def attend(cell):
                """Wrap `cell` so every step attends over the question."""
                return AttentionCell(
                    cell, u, mask=q_mask, mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)

            first_cell_fw = attend(cell2_fw)
            first_cell_bw = attend(cell2_bw)
            second_cell_fw = attend(cell3_fw)
            second_cell_bw = attend(cell3_bw)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float',
            scope='g0')
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float',
            scope='g1')
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        logits = get_logits([g1, p0], d, True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask,
                            is_train=self.is_train,
                            func=config.answer_func,
                            scope='logits1')
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw, d_cell4_bw,
            tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
            x_len, dtype='float', scope='g2')
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        logits2 = get_logits([g2, p0], d, True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train,
                             func=config.answer_func,
                             scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        if config.na:
            # Prepend a learned "no answer" logit before normalising.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1])

            concat_flat_logits = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(
                tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1])
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

            concat_flat_logits2 = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(
                tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX], name="yp")
        yp2 = tf.reshape(flat_yp2, [-1, M, JX], name="yp2")
        wyp = tf.nn.sigmoid(logits2, name="wyp")

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp
dtype=tf.int32, name='encoder_inputs_length') decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets') embeddings = tf.get_variable('embedding', shape=(vocab_size, input_embedding_size), dtype=tf.float32, initializer=tf.initializers.random_uniform( -1.0, 1.0)) encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs) # encoder encoder_cell = LSTMCell(encoder_hidden_units) (encoder_fw_outputs, encoder_bw_outputs), ( encoder_fw_final_state, encoder_bw_final_state) = tf.nn.bidirectional_dynamic_rnn( cell_fw=encoder_cell, cell_bw=encoder_cell, inputs=encoder_inputs_embedded, sequence_length=encoder_inputs_length, dtype=tf.float32, time_major=True) # 融合双向 LSTM 的状态 encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), axis=2) encoder_final_state_c = tf.concat( (encoder_fw_final_state.c, encoder_bw_final_state.c), axis=1) encoder_final_state_h = tf.concat(
# y=tf.unstack(y)
# tf.reset_default_graph()

# Each placeholder gets its own graph name.  Previously all three were
# created as 'encode_input', so TensorFlow silently renamed the later two.
encode_input = tf.placeholder(shape=[None, None],
                              dtype=tf.int32,
                              name='encode_input')
decode_target = tf.placeholder(shape=[None, None],
                               dtype=tf.int32,
                               name='decode_target')
decode_input = tf.placeholder(shape=[None, None],
                              dtype=tf.int32,
                              name='decode_input')

# Embedding table: the first dimension is the number of symbols, the second
# is the embedding size.
embedding = tf.Variable(tf.random_uniform([4, 10], -1.0, 1.0),
                        dtype=tf.float32)
encode_embedding = tf.nn.embedding_lookup(embedding, encode_input)
decode_embedding = tf.nn.embedding_lookup(embedding, decode_input)

# Encoder: only its final state is used, to initialise the decoder.
lstm_cell = LSTMCell(4)
outputs, states = dynamic_rnn(lstm_cell, encode_embedding, dtype=tf.float32)
print('states is ', states)

# Decoder: starts from the encoder's final state.
lstm_cell2 = LSTMCell(num_units=4)
logit, states2 = dynamic_rnn(lstm_cell2,
                             decode_embedding,
                             dtype=tf.float32,
                             initial_state=states,
                             scope='decode_output')
print('2')

# NOTE(review): y_target is not defined in this part of the file; confirm it
# is provided earlier (it is not the decode_target placeholder above).
la = tf.one_hot(y_target, depth=4, dtype=tf.float32)
print(la)
pre = tf.nn.softmax(logit)
print('logit is ', logit)
def _build_lm_graph(hparams, inputs, mode, freeze_bdlm=False, scope=None):
    """Build the bidirectional language-model graph.

    A dense + Conv1D front end embeds the input, a stack of forward and
    backward LSTMs runs over it, and a dense layer produces per-position
    logits.  Weight and activity regularisers are attached to the LSTM
    cells through ``add_loss``.

    Args:
        hparams: Hyper-parameter object (filter sizes, unit counts, dropout
            rates, ``l2_*`` coefficients, ``num_gpus``, ``batch_size`` ...).
        inputs: Tuple ``(ids, lens, seq_in, phyche, seq_out)``.
        mode: A ``tf.contrib.learn.ModeKeys`` value.
        freeze_bdlm: When true, the layers are created non-trainable.
        scope: Optional variable scope name used in place of the default
            ``bdlm_cnn_embed`` / ``bdlm_rnn`` scopes.

    Returns:
        ``(outputs, logits, loss, metrics, update_ops)`` where ``outputs``
        holds one tensor per layer (front end first) with the trimmed
        forward and backward features concatenated on the last axis.
        ``metrics`` and ``update_ops`` are empty outside EVAL mode.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm scope boundaries against the original file.
    ids, lens, seq_in, phyche, seq_out = inputs
    is_training = mode == tf.contrib.learn.ModeKeys.TRAIN

    # The recurrent layers run over `extra_steps` additional positions,
    # which are trimmed again when the two directions are joined.
    extra_steps = hparams.filter_size + 1
    padded_lens = lens + tf.constant(extra_steps, dtype=tf.int32)

    def l2_weight_reg(weights):
        """L2 weight penalty scaled by hparams.l2_lambda."""
        return hparams.l2_lambda * tf.nn.l2_loss(weights)

    seq_dense = tf.layers.dense(inputs=seq_in,
                                units=25,
                                use_bias=False,
                                trainable=not freeze_bdlm,
                                name="bdlm_seq_dense")
    x = tf.concat([seq_dense, phyche], axis=-1)

    # _outputs[k] is the [forward, backward] activation pair of layer k.
    _outputs = []
    with tf.variable_scope(scope or "bdlm_cnn_embed", dtype=tf.float32):
        cnn_embed = tf.layers.Conv1D(filters=hparams.num_filters,
                                     kernel_size=hparams.filter_size,
                                     activation=tf.nn.relu,
                                     kernel_regularizer=l2_weight_reg,
                                     trainable=not freeze_bdlm)
        embed_proj = tf.layers.Dense(units=hparams.num_units,
                                     kernel_regularizer=l2_weight_reg,
                                     trainable=not freeze_bdlm)
        z_0 = tf.layers.dropout(inputs=cnn_embed(x),
                                rate=hparams.dropout,
                                training=is_training)
        z_0 = embed_proj(z_0)
        _outputs.append([z_0, z_0])

    with tf.variable_scope(scope or "bdlm_rnn", dtype=tf.float32):

        def _get_cell(name):
            return LSTMCell(name=name,
                            num_units=hparams.num_lm_units,
                            num_proj=hparams.num_units,
                            trainable=not freeze_bdlm)

        def _drop_wrap(cell):
            # Recurrent dropout is only active while training.
            return tf.nn.rnn_cell.DropoutWrapper(
                cell=cell,
                state_keep_prob=(1.0 - hparams.recurrent_state_dropout
                                 if is_training else 1.0),
                input_keep_prob=(1.0 - hparams.recurrent_input_dropout
                                 if is_training else 1.0),
                variational_recurrent=True,
                input_size=tf.TensorShape([1]),
                dtype=tf.float32)

        # split fw and bw between GPUs when exactly two are available
        split_across_gpus = hparams.num_gpus == 2
        fw_dev = "/device:GPU:0"
        bw_dev = "/device:GPU:1" if split_across_gpus else "/device:GPU:0"

        fw_cells = []
        bw_cells = []
        # keep track of unwrapped cells so we can get their weights later
        unwrapped_fw_cells = []
        unwrapped_bw_cells = []
        for i in range(hparams.num_lm_layers):
            fw_cell = _get_cell("lstm_fw_%d" % (i))
            bw_cell = _get_cell("lstm_bw_%d" % (i))
            unwrapped_fw_cells.append(fw_cell)
            unwrapped_bw_cells.append(bw_cell)
            fw_cell = _drop_wrap(fw_cell)
            bw_cell = _drop_wrap(bw_cell)
            # create a residual connection around 1st layer
            if i == 0:
                fw_cell = tf.nn.rnn_cell.ResidualWrapper(fw_cell)
                bw_cell = tf.nn.rnn_cell.ResidualWrapper(bw_cell)
            if split_across_gpus:
                fw_cell = tf.nn.rnn_cell.DeviceWrapper(fw_cell, fw_dev)
                bw_cell = tf.nn.rnn_cell.DeviceWrapper(bw_cell, bw_dev)
            fw_cells.append(fw_cell)
            bw_cells.append(bw_cell)

        # reverse the bw inputs, then reverse all _outputs after dynamic_rnn
        _outputs[0][1] = tf.reverse_sequence(_outputs[0][1],
                                             seq_lengths=padded_lens,
                                             seq_axis=1)

        for i in range(hparams.num_lm_layers):
            with tf.name_scope("bdlm_layer_%d" % (i)):
                # get fw / bw _outputs for each layer
                input_fw = _outputs[-1][0]
                input_bw = _outputs[-1][1]

                with tf.device(fw_dev):
                    output_fw, _ = tf.nn.dynamic_rnn(
                        cell=fw_cells[i],
                        inputs=input_fw,
                        sequence_length=padded_lens,
                        dtype=tf.float32)
                    # add weight reg
                    unwrapped_fw_cells[i].add_loss(
                        tf.multiply(hparams.l2_lambda,
                                    tf.nn.l2_loss(
                                        unwrapped_fw_cells[i].weights[0]),
                                    name="fw_%d_l2w" % (i)))

                with tf.device(bw_dev):
                    output_bw, _ = tf.nn.dynamic_rnn(
                        cell=bw_cells[i],
                        inputs=input_bw,
                        sequence_length=padded_lens,
                        dtype=tf.float32)
                    unwrapped_bw_cells[i].add_loss(
                        tf.multiply(hparams.l2_lambda,
                                    tf.nn.l2_loss(
                                        unwrapped_bw_cells[i].weights[0]),
                                    name="bw_%d_l2w" % (i)))

                _outputs.append([output_fw, output_bw])

        outputs = []
        for layer_pair in _outputs:
            # reverse the backward outputs; trim the extra steps from fw/bw
            # and concat
            layer_pair[1] = tf.reverse_sequence(layer_pair[1],
                                                seq_lengths=padded_lens,
                                                seq_axis=1)
            trimmed_fw = layer_pair[0][:, :-extra_steps, :]
            trimmed_bw = layer_pair[1][:, extra_steps:, :]
            outputs.append(tf.concat([trimmed_fw, trimmed_bw], axis=-1))

        # Forward / backward activations of the top layer, used by the
        # activity regulariser below.  Indexing `outputs[-1][0]` and
        # `outputs[-1][1]` would select batch items 0 and 1 of the
        # concatenated tensor rather than the two directions.
        output_fw = trimmed_fw
        output_bw = trimmed_bw

    with tf.variable_scope("bdlm_out", dtype=tf.float32):
        rnn_out = outputs[-1]
        rnn_out = tf.layers.dropout(inputs=rnn_out,
                                    rate=hparams.dropout,
                                    training=is_training)
        logits = tf.layers.dense(inputs=rnn_out,
                                 units=hparams.num_labels,
                                 kernel_regularizer=l2_weight_reg,
                                 trainable=not freeze_bdlm)

    # mask out entries longer than target sequence length
    mask = tf.sequence_mask(lens, dtype=tf.float32)

    # add activity reg to last layer
    with tf.name_scope("l2_act_reg"):

        def l2_act_loss(act):
            # ignore the loss contributed by time steps longer than the
            # sequence length
            per_example = tf.reduce_sum(
                hparams.l2_alpha * tf.square(act) *
                tf.expand_dims(mask, axis=-1),
                axis=[1, 2]) / tf.cast(lens, tf.float32)
            return tf.reduce_sum(per_example)

        fw_act_loss = l2_act_loss(output_fw)
        bw_act_loss = l2_act_loss(output_bw)
        unwrapped_fw_cells[-1].add_loss(fw_act_loss, inputs=input_fw)
        unwrapped_bw_cells[-1].add_loss(bw_act_loss, inputs=input_bw)

    crossent = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                          labels=seq_out,
                                                          name="crossent")
    # Length-normalised per example, then averaged over the configured
    # batch size.
    seq_loss = tf.reduce_sum(
        tf.reduce_sum(crossent * mask, axis=1) /
        tf.cast(lens, tf.float32)) / tf.cast(hparams.batch_size, tf.float32)

    reg_loss = tf.add_n(tf.losses.get_regularization_losses(),
                        name="reg_loss")

    if (hparams.l2_alpha == 0. and hparams.l2_lambda == 0.
            and hparams.l2_beta == 0.):
        loss = seq_loss
    else:
        loss = seq_loss + reg_loss

    metrics = []
    update_ops = []
    if mode == tf.contrib.learn.ModeKeys.EVAL:
        # mean eval loss
        loss, loss_update = tf.metrics.mean(values=loss)
        seq_loss, seq_loss_update = tf.metrics.mean(values=seq_loss)
        tf.summary.scalar("eval_seq_loss", seq_loss, collections=["eval"])
        reg_loss, reg_loss_update = tf.metrics.mean(values=reg_loss)
        tf.summary.scalar("eval_reg_loss", reg_loss, collections=["eval"])

        predictions = tf.argmax(input=logits, axis=-1)
        tgt_labels = tf.argmax(input=seq_out, axis=-1)
        acc, acc_update = tf.metrics.accuracy(predictions=predictions,
                                              labels=tgt_labels,
                                              weights=mask)

        # confusion matrix
        targets_flat = tf.reshape(tgt_labels, [-1])
        predictions_flat = tf.reshape(predictions, [-1])
        mask_flat = tf.reshape(mask, [-1])
        cm, cm_update = streaming_confusion_matrix(
            labels=targets_flat,
            predictions=predictions_flat,
            num_classes=hparams.num_labels,
            weights=mask_flat)
        tf.add_to_collection("eval", cm_summary(cm, hparams.num_labels))

        metrics = [acc, cm]
        update_ops = [
            loss_update, seq_loss_update, reg_loss_update, acc_update,
            cm_update
        ]

    return outputs, logits, loss, metrics, update_ops
def __init__(self, batch_size, inputs, outputs, num_units, cell_type):
    """Build the graph of a recurrent predictor driven by one LSTM cell.

    The cell is first unrolled over every tensor in ``inputs``.  It is then
    run for ``len(outputs)`` further steps; each of those steps is fed the
    linear projection (``enc_weight`` / ``enc_bias``) of the previous cell
    output, and the projections are collected as the prediction.

    Args:
        batch_size: leading dimension of the all-zero initial states.
        inputs: list of per-step input tensors.  The second static
            dimension of ``inputs[0]`` is used as both the input and the
            output feature size.
        outputs: list of per-step target tensors; its length is the number
            of predicted steps.
        num_units: hidden sizes, one per LSTM layer.  More than one entry
            produces a ``MultiRNNCell``; the last entry sizes the
            projection matrix.
        cell_type: not read by this constructor; LSTM cells are always
            built.
    """
    self.batch_size = batch_size
    feature_dim = inputs[0].get_shape().as_list()[1]
    self.num_inputs = feature_dim
    self.num_outputs = feature_dim
    top_units = num_units[-1]
    self.last = inputs[-1]

    stacked = len(num_units) > 1
    if stacked:
        self._lstm_cell = MultiRNNCell(
            [LSTMCell(num_units=n) for n in num_units])
    else:
        self._lstm_cell = LSTMCell(top_units)

    # Stacked per-step tensors are time-major; this permutation swaps the
    # first two axes.
    swap_time_and_batch = [1, 0, 2]

    with tf.compat.v1.variable_scope('encoder') as scope:
        proj_w = tf.Variable(
            tf.random.truncated_normal(
                [top_units, self.num_outputs], dtype=tf.float32),
            name='enc_weight')
        proj_b = tf.Variable(
            tf.random.truncated_normal(
                [self.num_outputs], dtype=tf.float32),
            name='enc_bias')

        # One zero (c, h) pair per layer; c and h are the same tensor.
        layer_states = []
        for n in num_units:
            zeros = tf.zeros((batch_size, n))
            layer_states.append(tf.contrib.rnn.LSTMStateTuple(zeros, zeros))
        # A lone cell takes its state directly, a stack takes the tuple.
        state = tuple(layer_states) if stacked else layer_states[0]

        # Read phase: consume the given inputs.
        for step, frame in enumerate(inputs):
            if step > 0:
                scope.reuse_variables()
            cell_out, state = self._lstm_cell(frame, state)

        # Predict phase: feed each projection back in as the next input.
        step_predictions = []
        for _ in range(len(outputs)):
            projected = tf.matmul(cell_out, proj_w) + proj_b
            step_predictions.append(projected)
            cell_out, state = self._lstm_cell(projected, state)

        self.prediction = tf.transpose(
            tf.stack(step_predictions), swap_time_and_batch,
            name='prediction')

    self.target = tf.transpose(
        tf.stack(outputs), swap_time_and_batch, name='target')
    self.input_ = tf.transpose(tf.stack(inputs), swap_time_and_batch)

    # Only the first feature channel of prediction and target is kept.
    self.prediction = self.prediction[:, :, 0]
    self.target = self.target[:, :, 0]
    self.enc_W = proj_w
    self.enc_b = proj_b
def __init__(self, batch_size, inputs, outputs, num_units, cell_type):
    """Build an encoder-decoder recurrent graph with GRU or LSTM cells.

    The encoder cell is unrolled over ``inputs``.  The decoder cell starts
    from the final encoder state and from the projection of the last
    encoder output, and is unrolled for ``len(outputs)`` steps, feeding
    each projected output back in as the next input.

    Args:
        batch_size: leading dimension of the all-zero initial states.
        inputs: list of per-step input tensors.  The second static
            dimension of ``inputs[0]`` is used as both the input and the
            output feature size.
        outputs: list of per-step target tensors; its length is the number
            of decoded steps.
        num_units: hidden sizes, one per recurrent layer.  More than one
            entry produces a ``MultiRNNCell``; the last entry sizes the
            projection matrices.
        cell_type: ``'GRU'`` selects ``GRUCell``; any other value selects
            ``LSTMCell``.
    """
    self.batch_size = batch_size
    self.num_inputs = inputs[0].get_shape().as_list()[1]
    self.num_outputs = self.num_inputs
    num_hidden = num_units[-1]
    use_gru = cell_type == 'GRU'
    cell_cls = GRUCell if use_gru else LSTMCell
    stacked = len(num_units) > 1

    def build_cell():
        # Always create new cell objects.  Wrapping one list of cells in
        # two MultiRNNCells would make encoder and decoder share the same
        # cell instances (and hence weights), unlike the single-layer
        # path, which has always used two independent cells.
        layers = [cell_cls(num_units=n) for n in num_units]
        return MultiRNNCell(layers) if stacked else layers[0]

    self._enc_cell = build_cell()
    self._dec_cell = build_cell()

    with tf.compat.v1.variable_scope('encoder') as es:
        enc_W = tf.Variable(
            tf.random.truncated_normal(
                [num_hidden, self.num_outputs], dtype=tf.float32),
            name='enc_weight')
        enc_b = tf.Variable(
            tf.random.truncated_normal(
                [self.num_outputs], dtype=tf.float32),
            name='enc_bias')

        # Zero initial state per layer: a plain tensor for GRU, a (c, h)
        # tuple (both the same zero tensor) for LSTM.
        init_states = []
        for n in num_units:
            zeros = tf.zeros((batch_size, n))
            if use_gru:
                init_states.append(zeros)
            else:
                init_states.append(
                    tf.contrib.rnn.LSTMStateTuple(zeros, zeros))
        # A lone cell takes its state directly, a stack takes the tuple.
        enc_state = tuple(init_states) if stacked else init_states[0]

        for step, enc_input in enumerate(inputs):
            if step > 0:
                es.reuse_variables()
            enc_output, enc_state = self._enc_cell(enc_input, enc_state)
            # y_hat = W * h + b.  Only the projection of the final step is
            # consumed below, as the first decoder input.
            enc_prediction = tf.matmul(enc_output, enc_W) + enc_b

    with tf.compat.v1.variable_scope('decoder') as vs:
        dec_W = tf.Variable(
            tf.random.truncated_normal(
                [num_hidden, self.num_outputs], dtype=tf.float32),
            name='dec_weight')
        dec_b = tf.Variable(
            tf.random.truncated_normal(
                [self.num_outputs], dtype=tf.float32),
            name='dec_bias')

        dec_input = enc_prediction
        dec_state = enc_state
        dec_outputs = []
        for step in range(len(outputs)):
            if step > 0:
                vs.reuse_variables()
            dec_output, dec_state = self._dec_cell(dec_input, dec_state)
            # The projected output is both recorded and fed back in.
            dec_input = tf.matmul(dec_output, dec_W) + dec_b
            dec_outputs.append(dec_input)

        self.prediction = tf.transpose(
            tf.stack(dec_outputs), [1, 0, 2], name='prediction')

    self.input_ = tf.transpose(tf.stack(inputs), [1, 0, 2])
    self.target = tf.transpose(tf.stack(outputs), [1, 0, 2], name='target')

    # Only the first feature channel of prediction and target is kept.
    self.prediction = self.prediction[:, :, 0]
    self.target = self.target[:, :, 0]
    self.enc_W = enc_W
    self.enc_b = enc_b
    self.dec_W = dec_W
    self.dec_b = dec_b
def _LSTMCells(unit_list, act_fn_list):
    """Stack one LSTMCell per (unit, activation) pair into a MultiRNNCell.

    Pairs are formed positionally with ``zip``, so surplus entries in the
    longer of the two lists are ignored.
    """
    layers = []
    for unit, act_fn in zip(unit_list, act_fn_list):
        layers.append(LSTMCell(unit, activation=act_fn))
    return MultiRNNCell(layers)
# The recurrent layer below reads from `output`, which starts as the
# embedded input (`embeded` is defined outside this fragment).
output = embeded
# Disabled code: the triple-quoted block below is a bare string expression
# and is never executed.  It holds a loop that would stack `lstm_layers`
# dropout-wrapped bidirectional LSTM layers.
'''
for i in range(lstm_layers):
    with tf.variable_scope('BiLSTM_Layer_{}'.format(i)):
        lstm_fw = LSTMCell(lstm_size) #, initializer=tf.truncated_normal_initializer(-0.1, 0.1, seed=2)
        lstm_bw = LSTMCell(lstm_size)
        cell_fw = tf.contrib.rnn.DropoutWrapper(lstm_fw, output_keep_prob=keep_prob_ph_rnn)
        cell_bw = tf.contrib.rnn.DropoutWrapper(lstm_bw, output_keep_prob=keep_prob_ph_rnn)
        (output_fw, output_bw), final_state = BiRNN(cell_fw, cell_bw, output, dtype=tf.float32)
        output = tf.concat((output_fw, output_bw), 2)
'''
##Attention + Dropout
# A single bidirectional LSTM layer; its scope name uses `lstm_layers` as
# the index.  `BiRNN` and `Attention` are defined outside this fragment.
with tf.variable_scope('BiLSTM_Layer_{}'.format(lstm_layers)):
    lstm_fw = LSTMCell(lstm_size)
    lstm_bw = LSTMCell(lstm_size)
    (output_fw, output_bw), final_state = BiRNN(lstm_fw, lstm_bw, output, dtype=tf.float32)
    # Join forward and backward outputs along axis 2.
    output = tf.concat((output_fw, output_bw), 2)
    attention = Attention(output)
    # NOTE(review): the second positional argument of tf.nn.dropout is a
    # keep probability in TF 1.x; confirm `keep_prob_ph` is fed accordingly.
    drop = tf.nn.dropout(attention, keep_prob_ph)
    tf.summary.histogram('RNN_output', output)
##FC layers
# First dense layer: 64 units with ReLU on the dropped-out attention output.
with tf.name_scope('Fully_connected_Layers_0'):
    fc_output = tf.contrib.layers.fully_connected(drop, 64, activation_fn=tf.nn.relu)