def prepare_model(self): with tf.variable_scope("LSTMTDNN"): self.char_inputs = [] self.word_inputs = [] self.cnn_outputs = [] if self.use_char: char_W = tf.get_variable( "char_embed", [self.char_vocab_size, self.char_embed_dim]) else: word_W = tf.get_variable( "word_embed", [self.word_vocab_size, self.word_embed_dim]) with tf.variable_scope("CNN") as scope: self.char_inputs = tf.placeholder( tf.int32, [self.batch_size, self.seq_length, self.max_word_length]) self.word_inputs = tf.placeholder( tf.int32, [self.batch_size, self.seq_length]) char_indices = tf.split(1, self.seq_length, self.char_inputs) word_indices = tf.split(1, self.seq_length, tf.expand_dims(self.word_inputs, -1)) for idx in xrange(self.seq_length): char_index = tf.reshape(char_indices[idx], [-1, self.max_word_length]) word_index = tf.reshape(word_indices[idx], [-1, 1]) if idx != 0: scope.reuse_variables() if self.use_char: # [batch_size x word_max_length, char_embed] char_embed = tf.nn.embedding_lookup(char_W, char_index) char_cnn = TDNN(char_embed, self.char_embed_dim, self.feature_maps, self.kernels) if self.use_word: word_embed = tf.nn.embedding_lookup( word_W, word_index) cnn_output = tf.concat(1, char_cnn.output, word_embed) else: cnn_output = char_cnn.output else: cnn_output = tf.squeeze( tf.nn.embedding_lookup(word_W, word_index)) if self.use_batch_norm: bn = batch_norm() norm_output = bn( tf.expand_dims(tf.expand_dims(cnn_output, 1), 1)) cnn_output = tf.squeeze(norm_output) if highway: #cnn_output = highway(input_, input_dim_length, self.highway_layers, 0) cnn_output = highway(cnn_output, cnn_output.get_shape()[1], self.highway_layers, 0) self.cnn_outputs.append(cnn_output) with tf.variable_scope("LSTM") as scope: self.cell = rnn_cell.BasicLSTMCell(self.rnn_size) self.stacked_cell = rnn_cell.MultiRNNCell([self.cell] * self.layer_depth) outputs, _ = rnn.rnn(self.stacked_cell, self.cnn_outputs, dtype=tf.float32) self.lstm_outputs = [] self.true_outputs = tf.placeholder( tf.float32, [self.batch_size, self.seq_length, self.word_vocab_size]) loss = 0 true_outputs = tf.split(1, self.seq_length, self.true_outputs) for idx, (top_h, true_output) in enumerate(zip(outputs, true_outputs)): if self.dropout_prob > 0: top_h = tf.nn.dropout(top_h, self.dropout_prob) if self.hsm > 0: self.lstm_outputs.append(top_h) else: if idx != 0: scope.reuse_variables() proj = rnn_cell.linear(top_h, self.word_vocab_size, 0) log_softmax = tf.log(tf.nn.softmax(proj)) self.lstm_outputs.append(log_softmax) loss += tf.nn.softmax_cross_entropy_with_logits( self.lstm_outputs[idx], tf.squeeze(true_output)) self.loss = tf.reduce_mean(loss) / self.seq_length tf.scalar_summary("loss", self.loss) tf.scalar_summary("perplexity", tf.exp(self.loss))
def __init__(self, config):
    """Build model(define computational blocks).

    Args:
      config: an instance of Config class.
    """
    self.config = config
    self.embvec = config.embvec
    self.wrd_vocab_size = len(self.embvec.wrd_embeddings)
    self.wrd_dim = config.wrd_dim
    self.word_length = config.word_length
    self.chr_vocab_size = len(self.embvec.chr_vocab)
    self.chr_dim = config.chr_dim
    self.pos_vocab_size = len(self.embvec.pos_vocab)
    self.pos_dim = config.pos_dim
    self.chk_vocab_size = len(self.embvec.chk_vocab)
    self.chk_dim = config.chk_dim
    self.class_size = config.class_size
    self.use_crf = config.use_crf
    self.emb_class = config.emb_class
    self.is_training = config.is_training
    self.print_local_devices(self.is_training)

    """
    Input layer
    """
    self.is_train = tf.placeholder(tf.bool, name='is_train')
    self.sentence_length = tf.placeholder(tf.int32, name='sentence_length')
    self.keep_prob = tf.cond(self.is_train, lambda: config.keep_prob, lambda: 1.0)

    # pos embedding
    self.input_data_pos_ids = tf.placeholder(tf.int32, shape=[None, None],
                                             name='input_data_pos_ids')  # (batch_size, sentence_length)
    self.sentence_masks = self.__compute_sentence_masks(self.input_data_pos_ids)
    sentence_lengths = self.__compute_sentence_lengths(self.sentence_masks)
    self.sentence_lengths = tf.identity(sentence_lengths, name='sentence_lengths')
    masks = tf.to_float(tf.expand_dims(self.sentence_masks, -1))  # (batch_size, sentence_length, 1)
    self.pos_embeddings = self.__pos_embedding(self.input_data_pos_ids,
                                               keep_prob=self.keep_prob,
                                               scope='pos-embedding')

    # chk embedding
    self.input_data_chk_ids = tf.placeholder(tf.int32, shape=[None, None],
                                             name='input_data_chk_ids')  # (batch_size, sentence_length)
    self.chk_embeddings = self.__chk_embedding(self.input_data_chk_ids,
                                               keep_prob=self.keep_prob,
                                               scope='chk-embedding')

    # (large) word embedding data
    self.wrd_embeddings_init = tf.placeholder(tf.float32,
                                              shape=[self.wrd_vocab_size, self.wrd_dim],
                                              name='wrd_embeddings_init')
    self.wrd_embeddings = tf.Variable(self.wrd_embeddings_init,
                                      name='wrd_embeddings',
                                      trainable=False)

    # word embeddings
    self.input_data_word_ids = tf.placeholder(tf.int32, shape=[None, None],
                                              name='input_data_word_ids')  # (batch_size, sentence_length)
    self.word_embeddings = self.__word_embedding(self.input_data_word_ids,
                                                 keep_prob=self.keep_prob,
                                                 scope='word-embedding')

    # character embeddings
    self.input_data_wordchr_ids = tf.placeholder(tf.int32,
                                                 shape=[None, None, self.word_length],  # (batch_size, sentence_length, word_length)
                                                 name='input_data_wordchr_ids')
    if config.chr_conv_type == 'conv1d':
        self.wordchr_embeddings = self.__wordchr_embedding_conv1d(self.input_data_wordchr_ids,
                                                                  keep_prob=self.keep_prob,
                                                                  scope='wordchr-embedding-conv1d')
    else:
        self.wordchr_embeddings = self.__wordchr_embedding_conv2d(self.input_data_wordchr_ids,
                                                                  keep_prob=self.keep_prob,
                                                                  scope='wordchr-embedding-conv2d')

    if 'elmo' in self.emb_class:
        # elmo embeddings
        self.elmo_bilm = config.elmo_bilm
        elmo_keep_prob = tf.cond(self.is_train, lambda: config.elmo_keep_prob, lambda: 1.0)
        self.elmo_input_data_wordchr_ids = tf.placeholder(tf.int32,
                                                          shape=[None, None, self.word_length],  # (batch_size, sentence_length+2, word_length)
                                                          name='elmo_input_data_wordchr_ids')  # '+2' stands for '<S>', '</S>'
        self.elmo_embeddings = self.__elmo_embedding(self.elmo_input_data_wordchr_ids, masks,
                                                     keep_prob=elmo_keep_prob)

    if 'bert' in self.emb_class:
        # bert embeddings in subgraph
        self.bert_config = config.bert_config
        self.bert_init_checkpoint = config.bert_init_checkpoint
        self.bert_input_data_token_ids = tf.placeholder(tf.int32,
                                                        shape=[None, config.bert_max_seq_length],
                                                        name='bert_input_data_token_ids')
        self.bert_input_data_token_masks = tf.placeholder(tf.int32,
                                                          shape=[None, config.bert_max_seq_length],
                                                          name='bert_input_data_token_masks')
        self.bert_input_data_segment_ids = tf.placeholder(tf.int32,
                                                          shape=[None, config.bert_max_seq_length],
                                                          name='bert_input_data_segment_ids')
        bert_embeddings_subgraph = self.__bert_embedding(self.bert_input_data_token_ids,
                                                         self.bert_input_data_token_masks,
                                                         self.bert_input_data_segment_ids)
        self.bert_embeddings_subgraph = tf.identity(bert_embeddings_subgraph,
                                                    name='bert_embeddings_subgraph')
        # bert embedding at runtime
        self.bert_embeddings = tf.placeholder(tf.float32,
                                              shape=[None, config.bert_max_seq_length, config.bert_dim],
                                              name='bert_embeddings')
        bert_keep_prob = tf.cond(self.is_train, lambda: config.bert_keep_prob, lambda: 1.0)
        self.bert_embeddings = tf.nn.dropout(self.bert_embeddings, bert_keep_prob)

    concat_in = [self.word_embeddings, self.wordchr_embeddings,
                 self.pos_embeddings, self.chk_embeddings]
    if self.emb_class == 'elmo':
        concat_in = [self.word_embeddings, self.wordchr_embeddings, self.elmo_embeddings,
                     self.pos_embeddings, self.chk_embeddings]
    if self.emb_class == 'bert':
        concat_in = [self.word_embeddings, self.wordchr_embeddings, self.bert_embeddings,
                     self.pos_embeddings, self.chk_embeddings]
    if self.emb_class == 'bert+elmo':
        concat_in = [self.word_embeddings, self.wordchr_embeddings, self.bert_embeddings,
                     self.elmo_embeddings, self.pos_embeddings, self.chk_embeddings]
    self.input_data = tf.concat(concat_in, axis=-1, name='input_data')  # (batch_size, sentence_length, input_dim)

    # highway network
    if config.highway_used:
        input_dim = self.input_data.get_shape()[-1]
        self.input_data = tf.reshape(self.input_data, [-1, input_dim])
        self.input_data = highway(self.input_data, input_dim, num_layers=2, scope='highway')
        self.input_data = tf.reshape(self.input_data, [-1, self.sentence_length, input_dim])
        self.input_data = tf.nn.dropout(self.input_data, keep_prob=self.keep_prob)

    # masking (for confirmation)
    self.input_data *= masks

    """
    RNN layer
    """
    self.rnn_output = self.__bi_rnn(self.input_data)

    """
    Transformer layer
    """
    self.transformed_output = self.__transform(self.rnn_output, masks)

    """
    Projection layer
    """
    self.logits = self.__projection(self.transformed_output, self.class_size,
                                    scope='projection')  # (batch_size, sentence_length, class_size)

    """
    Output answer
    """
    self.output_data = tf.placeholder(tf.float32,
                                      shape=[None, None, self.class_size],  # (batch_size, sentence_length, class_size)
                                      name='output_data')
    self.output_data_indices = tf.argmax(self.output_data, axis=-1,
                                         output_type=tf.int32)  # (batch_size, sentence_length)

    """
    Prediction
    """
    self.prediction = self.__compute_prediction()
    self.logits_indices = tf.identity(self.prediction, name='logits_indices')
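# Hedged usage sketch: the graph above is driven entirely through the placeholders it
# defines. For the plain word/char/pos/chk configuration (no elmo/bert inputs), a
# minimal inference call might look like the function below. `sess` is assumed to be
# a tf.Session with variables already initialized, the `batch_*` arguments are assumed
# to be padded numpy arrays shaped as the comments in __init__ indicate, and only
# attributes defined above (the input placeholders, `sentence_length`, `is_train`,
# `logits_indices`) are referenced.
def predict_sketch(sess, model, batch_word_ids, batch_wordchr_ids,
                   batch_pos_ids, batch_chk_ids, max_sentence_length):
    feed_dict = {
        model.input_data_word_ids: batch_word_ids,        # (batch_size, sentence_length)
        model.input_data_wordchr_ids: batch_wordchr_ids,  # (batch_size, sentence_length, word_length)
        model.input_data_pos_ids: batch_pos_ids,          # (batch_size, sentence_length)
        model.input_data_chk_ids: batch_chk_ids,          # (batch_size, sentence_length)
        model.sentence_length: max_sentence_length,
        model.is_train: False,
    }
    # predicted tag indices, shape (batch_size, sentence_length)
    return sess.run(model.logits_indices, feed_dict=feed_dict)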
def prepare_model(self): with tf.variable_scope("LSTMTDNN"): self.char_inputs = [] self.word_inputs = [] self.cnn_outputs = [] if self.use_char: char_W = tf.get_variable("char_embed", [self.char_vocab_size, self.char_embed_dim]) if self.use_word: word_W = tf.get_variable("word_embed", [self.word_vocab_size, self.word_embed_dim]) with tf.variable_scope("CNN") as scope: self.char_inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_length, self.max_word_length]) self.word_inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_length]) char_indices = tf.split(1, self.seq_length, self.char_inputs) word_indices = tf.split(1, self.seq_length, tf.expand_dims(self.word_inputs, -1)) for idx in xrange(self.seq_length): char_index = tf.reshape(char_indices[idx], [-1, self.max_word_length]) word_index = tf.reshape(word_indices[idx], [-1, 1]) if idx != 0: scope.reuse_variables() if self.use_char: # [batch_size x word_max_length, char_embed] char_embed = tf.nn.embedding_lookup(char_W, char_index) char_cnn = TDNN(char_embed, self.char_embed_dim, self.feature_maps, self.kernels) if self.use_word: word_embed = tf.nn.embedding_lookup(word_W, word_index) cnn_output = tf.concat(1, [char_cnn.output, tf.squeeze(word_embed, [1])]) else: cnn_output = char_cnn.output else: cnn_output = tf.squeeze(tf.nn.embedding_lookup(word_W, word_index)) if self.use_batch_norm: bn = batch_norm() norm_output = bn(tf.expand_dims(tf.expand_dims(cnn_output, 1), 1)) cnn_output = tf.squeeze(norm_output) if highway: #cnn_output = highway(input_, input_dim_length, self.highway_layers, 0) cnn_output = highway(cnn_output, cnn_output.get_shape()[1], self.highway_layers, 0) self.cnn_outputs.append(cnn_output) with tf.variable_scope("LSTM") as scope: self.cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size) self.stacked_cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] * self.layer_depth) outputs, _ = tf.nn.rnn(self.stacked_cell, self.cnn_outputs, dtype=tf.float32) self.lstm_outputs = [] self.true_outputs = tf.placeholder(tf.int64, [self.batch_size, self.seq_length]) loss = 0 true_outputs = tf.split(1, self.seq_length, self.true_outputs) for idx, (top_h, true_output) in enumerate(zip(outputs, true_outputs)): if self.dropout_prob > 0: top_h = tf.nn.dropout(top_h, self.dropout_prob) if self.hsm > 0: self.lstm_outputs.append(top_h) else: if idx != 0: scope.reuse_variables() proj = tf.nn.rnn_cell._linear(top_h, self.word_vocab_size, 0) self.lstm_outputs.append(proj) loss += tf.nn.sparse_softmax_cross_entropy_with_logits(self.lstm_outputs[idx], tf.squeeze(true_output)) self.loss = tf.reduce_mean(loss) / self.seq_length tf.scalar_summary("loss", self.loss) tf.scalar_summary("perplexity", tf.exp(self.loss))
def _build(self, pretrained_word_embedding, bert_word_embedding):
    with tf.variable_scope("LSTMTDNN"):
        with tf.device('/cpu:0'):
            if self.use_char:
                self.char_embedding = tf.get_variable(
                    "char_matrix",
                    [self.char_vocab_size, self.c_emb_size],
                    initializer=tf.uniform_unit_scaling_initializer())
            if self.use_pts:
                self.postag_embedding = tf.get_variable(
                    "postag_matrix",
                    [self.postag_vocab_size, self.pt_emb_size],
                    initializer=tf.uniform_unit_scaling_initializer())
            if self.use_word:
                if pretrained_word_embedding is None:
                    self.word_embedding = tf.get_variable(
                        "word_matrix",
                        [self.word_vocab_size, self.w_emb_size],
                        initializer=tf.uniform_unit_scaling_initializer())
                else:
                    self.word_embedding = tf.get_variable(
                        "word_matrix",
                        [self.word_vocab_size, self.w_emb_size],
                        initializer=tf.constant_initializer(pretrained_word_embedding),
                        trainable=False)
            # char_vecs sentence_len x max_word_len x embedding_len
            if self.use_bert_word:
                if bert_word_embedding is None:
                    self.bio_bert_word_embedding = tf.get_variable(
                        "bio_bert_word_matrix",
                        [self.bert_vocab_size, self.bert_word_embedding_dim],
                        initializer=tf.uniform_unit_scaling_initializer())
                else:
                    self.bio_bert_word_embedding = tf.get_variable(
                        "bio_bert_word_matrix",
                        [self.bert_vocab_size, self.bert_word_embedding_dim],
                        initializer=tf.constant_initializer(bert_word_embedding),
                        trainable=False)

            if self.use_char:
                char_vecs = tf.nn.embedding_lookup(self.char_embedding, self.char_input)
            # word_vec sentence_len x embedding_len
            if self.use_word:
                word_vecs = tf.nn.embedding_lookup(self.word_embedding, self.word_input)
            if self.use_bert_word:
                bio_word_vecs = tf.nn.embedding_lookup(self.bio_bert_word_embedding,
                                                       self.bio_word_input)
            # postag_vec sentence_len x embedding_len
            if self.use_pts:
                pt_vecs = tf.nn.embedding_lookup(self.postag_embedding, self.pt_input)

        # char_embedding layer
        if self.use_char:
            char_cnn = TDNN(char_vecs,
                            feature_maps=self.feature_maps,
                            kernels=self.kernels,
                            embed_dim=self.c_emb_size)
            # if self.use_pts:
            #     combined_emb = tf.concat([pt_vecs, char_cnn.output], 1)
            # else:
            combined_emb = char_cnn.output

        if self.use_word:
            combined_emb = tf.concat([word_vecs, combined_emb], 1)
        if self.use_bert_word:
            combined_emb = tf.concat([bio_word_vecs, combined_emb], 1)

        combined_emb = tf.reshape(combined_emb, [-1, self.total_emb_size])
        if self.highway:
            combined_emb = highway(combined_emb, self.total_emb_size, layer_size=1)
            combined_emb = tf.reshape(combined_emb, [-1, self.total_emb_size])

        combined_emb = tf.expand_dims(combined_emb, 0)
        combined_emb = tf.nn.dropout(combined_emb, keep_prob=1 - self.drop_rate)

        if not self.padding:
            lstm_cell_fw_1 = tf.contrib.rnn.BasicLSTMCell(self.h_size)
            lstm_cell_bw_1 = tf.contrib.rnn.BasicLSTMCell(self.h_size)
            lstm_cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell_fw_1] * self.lstm_layers,
                                                       state_is_tuple=True)
            lstm_cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell_bw_1] * self.lstm_layers,
                                                       state_is_tuple=True)
            self.outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=lstm_cell_fw,
                cell_bw=lstm_cell_bw,
                inputs=combined_emb,
                dtype=tf.float32,
                sequence_length=self.__length(combined_emb))
            out = tf.concat([self.outputs[0], self.outputs[1]], 2)

            # two layer NN
            w_1 = tf.get_variable("w_1", [self.h_size * 2, self.h_size])
            b_1 = tf.get_variable("b_1", [self.h_size])
            linear1 = tf.matmul(tf.reshape(out, [-1, self.h_size * 2]), w_1) + b_1

            w_3 = tf.get_variable("w_3", [self.h_size, self.num_classes])
            b_3 = tf.get_variable("b_3", [self.num_classes])
            self.logits = tf.matmul(tf.tanh(linear1), w_3) + b_3
        else:
            line_layer = 200
            gram_cnn = n_gram(combined_emb,
                              embed_dim=self.total_emb_size,
                              max_seq_len=self.max_seq_len)
            # gram_cnn = fcn(combined_emb, embed_dim=self.total_emb_size, max_seq_len=self.max_seq_len)
            cnn_output = gram_cnn.output
            if self.use_pts:
                cnn_output = tf.concat([pt_vecs, cnn_output], 1)

            w_1 = tf.get_variable("w_1", [cnn_output.get_shape()[1], line_layer])
            b_1 = tf.get_variable("b_1", [line_layer])
            linear1 = tf.matmul(cnn_output, w_1) + b_1

            w_2 = tf.get_variable("w_2", [line_layer, self.num_classes])
            b_2 = tf.get_variable("b_2", [self.num_classes])
            self.logits = tf.matmul(tf.tanh(linear1), w_2) + b_2

        if not self.crf:
            # sparse_softmax_cross_entropy_with_logits must be called with named
            # arguments in TF 1.x
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.target))
            soft_max = tf.nn.softmax(self.logits)
            self.y_pred = tf.argmax(soft_max, axis=1)
        else:
            # use crf to do post processing
            unary_scores = tf.reshape(self.logits, [1, -1, self.num_classes])
            if not self.padding:
                log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
                    unary_scores, tf.reshape(self.target, [1, -1]),
                    self.__length(combined_emb))
            else:
                log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
                    unary_scores, tf.reshape(self.target, [1, -1]), self.s_len)
            self.loss = tf.reduce_mean(-log_likelihood)

        self.global_step = tf.Variable(
            0, name='global_step', trainable=False,
            collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
        self.learning_rate = tf.train.exponential_decay(
            0.002,                 # Base learning rate.
            self.global_step,      # Current index into the dataset.
            20 * self.train_size,  # Decay step.
            0.95,                  # Decay rate.
            staircase=True)
        self.opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)

        params = tf.trainable_variables()
        grads = []
        for grad in tf.gradients(self.loss, params):
            if grad is not None:
                grads.append(tf.clip_by_norm(grad, self.max_grad_norm))
            else:
                grads.append(grad)
        self.optim = self.opt.apply_gradients(zip(grads, params),
                                              global_step=self.global_step)
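# Hedged sketch: when self.crf is enabled, _build above only defines the CRF training
# loss; decoding the best tag sequence at test time is assumed to happen outside the
# graph with Viterbi over the learned transition matrix. A minimal version for this
# model's single-sentence batches could look like this; `sess` and `feed_dict` are
# assumed to be a live tf.Session and the usual input feeds for one sentence.
def crf_decode_sketch(sess, model, feed_dict):
    # unary_scores: [sentence_len, num_classes]
    # transition_params: [num_classes, num_classes]
    unary_scores, transition_params = sess.run(
        [model.logits, model.transition_params], feed_dict=feed_dict)
    # tf.contrib.crf.viterbi_decode runs in numpy on a single sequence
    viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
        unary_scores, transition_params)
    return viterbi_seq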