def loss(self):
    # Calculate mean loss: CRF negative log-likelihood when self.viterbi is
    # set, otherwise masked mean cross-entropy.
    with tf.name_scope("loss"):
        labels = tf.cast(self.input_y, 'int32')
        if self.viterbi:
            log_likelihood, transition_params = crf.crf_log_likelihood(
                self.unflat_scores, labels, self.flat_sequence_lengths,
                transition_params=self.transition_params)
            # self.transition_params = transition_params
            loss = tf.reduce_mean(-log_likelihood)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.unflat_scores, labels=labels)
            # Zero out losses at padding positions, then average over real tokens.
            masked_losses = tf.multiply(losses, self.input_mask)
            loss = tf.div(tf.reduce_sum(masked_losses),
                          tf.reduce_sum(self.input_mask))
        loss += self.l2_penalty * self.l2_loss
        # Penalize divergence between scores computed with and without dropout.
        drop_loss = tf.nn.l2_loss(
            tf.subtract(self.unflat_scores, self.unflat_no_dropout_scores))
        loss += self.drop_penalty * drop_loss
    return loss
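
# A minimal numeric check (an assumption, not part of the snippet above) of
# the masked mean used in the non-CRF branch: per-token losses are zeroed at
# padding positions, then averaged over real tokens only.
import tensorflow as tf

losses = tf.constant([[0.5, 1.0, 2.0],
                      [1.5, 0.0, 0.0]])  # per-token cross-entropy
mask = tf.constant([[1.0, 1.0, 1.0],
                    [1.0, 0.0, 0.0]])    # 1 = real token, 0 = padding
loss = tf.reduce_sum(losses * mask) / tf.reduce_sum(mask)  # 5.0 / 4 = 1.25
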
def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
    assert sample_weight is not None, "your model has to support masking"
    # If labels arrive one-hot encoded, reduce them to sparse tag indices.
    if len(y_true.shape) == 3:
        y_true = tf.argmax(y_true, axis=-1)
    # Recover per-sequence lengths from the 0/1 padding mask.
    sequence_lengths = tf.math.count_nonzero(sample_weight, axis=1)
    y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
    log_likelihood, self.crf.transitions = crf_log_likelihood(
        y_pred,
        tf.cast(y_true, dtype=tf.int32),
        sequence_lengths,
        transition_params=self.crf.transitions)
    # Maximize the likelihood by minimizing the mean negative log-likelihood.
    return tf.reduce_mean(-log_likelihood)
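
# Usage sketch (an assumption, not from the snippet above): calling such a
# loss object directly on dummy tensors. `crf_loss` is a hypothetical
# instance of the class defined above; its constructor is not shown here.
import tensorflow as tf

batch, steps, num_tags = 2, 5, 4
y_pred = tf.random.normal([batch, steps, num_tags])                # unary scores
y_true = tf.random.uniform([batch, steps], 0, num_tags, tf.int32)  # gold tags
mask = tf.constant([[1, 1, 1, 0, 0],
                    [1, 1, 1, 1, 1]])                              # 0 = padding
# loss_value = crf_loss(y_true, y_pred, sample_weight=mask)  # scalar tensor
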
def __init__(self, name, batch, config, v_shape):
    words, pos, gazetteer, chars, len_chars, labels, len_words = batch
    n_words, n_pos, n_categories, n_chars, n_tags = v_shape
    batch_size = tf.shape(words)[0]
    max_words = tf.shape(words)[1]

    # Character-level embeddings, convolved and max-pooled into one vector per word.
    embeddings = char_embeddings(chars, len_chars, n_chars, config)
    embedding_pool = tf.reshape(
        conv_max_pool(embeddings, len_words, config),
        [batch_size, max_words, config.pool_size])

    fw = tf.nn.rnn_cell.MultiRNNCell(
        [tf.nn.rnn_cell.LSTMCell(config.h_size) for _ in range(config.depth)])
    bw = tf.nn.rnn_cell.MultiRNNCell(
        [tf.nn.rnn_cell.LSTMCell(config.h_size) for _ in range(config.depth)])

    self.dropout = tf.placeholder(tf.float32, [])
    features = tf.nn.dropout(
        tf.concat([words, pos, gazetteer, embedding_pool], axis=2),
        keep_prob=1 - self.dropout)

    output, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw,
                                                cell_bw=bw,
                                                inputs=features,
                                                sequence_length=len_words,
                                                dtype=tf.float32)
    output = tf.concat(output, axis=2)
    output = tf.layers.dense(tf.nn.dropout(output, keep_prob=1 - self.dropout),
                             units=n_tags, name="output")

    log_likelihood, transition = crf.crf_log_likelihood(
        output, tag_indices=labels, sequence_lengths=len_words)

    # Viterbi decode
    self.predict, self.score = crf.crf_decode(output,
                                              transition_params=transition,
                                              sequence_length=len_words)

    # Mean negative log-likelihood loss
    self.loss = tf.reduce_mean(-log_likelihood, name="loss")

    tvars = tf.trainable_variables()
    gradients, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          clip_norm=5.0)
    optimizer = tf.train.AdamOptimizer(config.learning_rate, epsilon=0.1)
    self.train = optimizer.apply_gradients(zip(gradients, tvars))
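
# Training-loop sketch (an assumption, not part of the class above): since
# `batch` arrives as tensors (e.g. from a tf.data iterator), only the dropout
# rate needs feeding at run time. `model` and `n_steps` are hypothetical.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(n_steps):
        _, loss_val = sess.run([model.train, model.loss],
                               feed_dict={model.dropout: 0.5})
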
def loss_layer(self, project_logits, lengths, name=None):
    """
    Calculate CRF loss.
    :param project_logits: [batch_size, num_steps, num_tags]
    :return: scalar loss
    """
    with tf.variable_scope("crf_loss" if not name else name):
        small = -1000.0
        # Logits for an artificial start step: only the start tag is plausible.
        start_logits = tf.concat(
            [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
             tf.zeros(shape=[self.batch_size, 1, 1])],
            axis=-1)
        # Append a (highly unlikely) start-tag column to every real time step.
        pad_logits = tf.cast(
            small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
        logits = tf.concat([project_logits, pad_logits], axis=-1)
        logits = tf.concat([start_logits, logits], axis=1)
        # Prepend the start tag to targets: its id is num_tags, e.g. 13 when
        # the original tags are 0-12.
        targets = tf.concat(
            [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32),
             self.targets],
            axis=-1)

        # The tag set gains one start tag, hence num_tags + 1.
        self.trans = tf.get_variable(
            "transitions",
            shape=[self.num_tags + 1, self.num_tags + 1],
            initializer=self.initializer)
        # Likewise, each sequence gained a start tag, so use lengths + 1.
        log_likelihood, self.trans = crf_log_likelihood(
            inputs=logits,
            tag_indices=targets,
            transition_params=self.trans,
            sequence_lengths=lengths + 1)
        # Minimize the negative log-likelihood, averaged over the batch.
        return tf.reduce_mean(-log_likelihood)
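
# Shape check (an assumption, not part of the method above): with
# num_tags = 13, the padding trick yields one extra time step and one extra
# tag column, matching the [14, 14] transition matrix.
import tensorflow as tf

batch_size, num_steps, num_tags = 2, 4, 13
small = -1000.0
project_logits = tf.zeros([batch_size, num_steps, num_tags])
start_logits = tf.concat([small * tf.ones([batch_size, 1, num_tags]),
                          tf.zeros([batch_size, 1, 1])], axis=-1)
pad_logits = small * tf.ones([batch_size, num_steps, 1])
logits = tf.concat([start_logits,
                    tf.concat([project_logits, pad_logits], axis=-1)], axis=1)
print(logits.shape)  # (2, 5, 14)
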
def __init__(self, max_seq_len, max_word_len, char_dim, char_rnn_dim,
             char_bidirect, word_dim, rnn_dim, word_bidirect, cap_dim,
             pos_dim, load_path, num_word, num_char, num_cap, num_pos,
             num_tag):
    self.word_ids = tf.placeholder(tf.int32, [None, max_seq_len],
                                   name="word_ids")
    self.seq_lengths = tf.placeholder(tf.int64, [None],
                                      name="seq_lengths")  # number of valid words
    self.char_for_ids = tf.placeholder(tf.int32,
                                       [None, max_seq_len, max_word_len],
                                       name="char_for_ids")
    self.char_rev_ids = tf.placeholder(tf.int32,
                                       [None, max_seq_len, max_word_len],
                                       name="char_rev_ids")
    self.word_lengths = tf.placeholder(tf.int32, [None, max_seq_len],
                                       name="word_lengths")
    self.tag_ids = tf.placeholder(tf.int32, [None, max_seq_len],
                                  name='tag_ids')
    self.cap_ids = tf.placeholder(tf.int32, [None, max_seq_len],
                                  name='cap_ids')
    self.pos_ids = tf.placeholder(tf.int32, [None, max_seq_len],
                                  name='pos_ids')
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")
    self.word_dim = word_dim
    self.char_dim = char_dim
    self.cap_dim = cap_dim
    self.pos_dim = pos_dim
    self.char_bidirect = char_bidirect
    initializer = tf.contrib.layers.xavier_initializer(uniform=True,
                                                       seed=None,
                                                       dtype=tf.float32)
    inputs = []
    input_dim = 0
    with tf.device("/gpu:2"):
        if word_dim:
            word_embedding = tf.get_variable('word_embedding',
                                             [num_word, word_dim],
                                             initializer=initializer)
            word_embedded = tf.nn.embedding_lookup(word_embedding,
                                                   self.word_ids,
                                                   name="word_layer")
            inputs.append(word_embedded)
            input_dim += word_dim
        if char_dim:
            char_embedding = tf.get_variable('char_embedding',
                                             [num_char, char_dim],
                                             initializer=initializer)
            word_lengths = tf.reshape(self.word_lengths, [-1])
            with tf.variable_scope('char_forward_rnn'):
                char_for_embedded = tf.reshape(
                    tf.nn.embedding_lookup(char_embedding, self.char_for_ids),
                    [-1, max_word_len, char_dim])
                char_for_state = self.rnn(char_for_embedded, char_rnn_dim,
                                          word_lengths)
                char_for_out = tf.reshape(char_for_state,
                                          [-1, max_seq_len, char_rnn_dim])
            inputs.append(char_for_out)
            input_dim += char_rnn_dim
            if char_bidirect:
                with tf.variable_scope('char_backward_rnn'):
                    char_rev_embedded = tf.reshape(
                        tf.nn.embedding_lookup(char_embedding,
                                               self.char_rev_ids),
                        [-1, max_word_len, char_dim])
                    char_rev_state = self.rnn(char_rev_embedded, char_rnn_dim,
                                              word_lengths)
                    char_rev_out = tf.reshape(char_rev_state,
                                              [-1, max_seq_len, char_rnn_dim])
                inputs.append(char_rev_out)
                input_dim += char_rnn_dim
        if cap_dim:
            cap_embedding = tf.get_variable('cap_embedding',
                                            [num_cap, cap_dim],
                                            initializer=initializer)
            cap_embedded = tf.nn.embedding_lookup(cap_embedding, self.cap_ids,
                                                  name='cap_layer')
            inputs.append(cap_embedded)
            input_dim += cap_dim
        if pos_dim:
            pos_embedding = tf.get_variable('pos_embedding',
                                            [num_pos, pos_dim],
                                            initializer=initializer)
            pos_embedded = tf.nn.embedding_lookup(pos_embedding, self.pos_ids,
                                                  name='pos_layer')
            inputs.append(pos_embedded)
            input_dim += pos_dim

        inputs = tf.concat(inputs, axis=2)
        inputs = tf.nn.dropout(inputs, self.dropout_keep_prob)

        with tf.variable_scope('forward_rnn'):
            word_for_output = self.rnn(inputs, rnn_dim, None)
        if word_bidirect:
            # Run the backward RNN on time-reversed inputs, then reverse its
            # outputs back so both directions align per time step.
            inputs_rev = tf.reverse_sequence(inputs, self.seq_lengths,
                                             seq_dim=1, batch_dim=None)
            with tf.variable_scope('backward_rnn'):
                word_rev_output = self.rnn(inputs_rev, rnn_dim, None)
            word_rev_output = tf.reverse_sequence(word_rev_output,
                                                  self.seq_lengths,
                                                  seq_dim=1, batch_dim=None)
            final_output = tf.concat([word_for_output, word_rev_output],
                                     axis=2)
            final_output = self.hidden_layer(final_output, 2 * rnn_dim,
                                             rnn_dim, "tanh_layer",
                                             initializer,
                                             activation=tf.tanh)
        else:
            final_output = word_for_output

        self.tag_scores = self.hidden_layer(
            final_output, rnn_dim, num_tag, 'final_layer', initializer,
            activation=None)  # [batch_size, max_seq_len, num_tag]

        # Compute the log-likelihood of the gold sequences and keep the
        # transition params for inference at test time.
        self.transitions = tf.get_variable("transitions", [num_tag, num_tag])
        log_likelihood, _ = crf.crf_log_likelihood(self.tag_scores,
                                                   self.tag_ids,
                                                   self.seq_lengths,
                                                   self.transitions)
        self.loss = tf.reduce_mean(-log_likelihood)
        self.likelihood = tf.exp(log_likelihood)

    tvars = tf.trainable_variables()
    max_grad_norm = 5
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      max_grad_norm)
    optimizer = tf.train.AdamOptimizer(1e-3)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    config.log_device_placement = False
    self.session = tf.Session(config=config)
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    if load_path:
        self.saver.restore(self.session, load_path)
    else:
        self.session.run(tf.global_variables_initializer())
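
# Inference sketch (an assumption, not part of the class above): run the
# graph once to get per-step tag scores and the learned transition matrix,
# then Viterbi-decode each sentence on the host. `model`, `feed`, and
# `lengths` are hypothetical stand-ins for a trained instance and its batch.
from tensorflow.contrib import crf

scores, trans = model.session.run([model.tag_scores, model.transitions],
                                  feed_dict=feed)
for sent_scores, length in zip(scores, lengths):
    path, _ = crf.viterbi_decode(sent_scores[:length], trans)  # best tag ids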