def encode(self, inputs, masks, is_train):
    context, question = inputs
    context_mask, question_mask = masks
    with tf.variable_scope("encode_context"):
        # pooled output: [batch_size, 2 * rnn_hidden_units]; the per-step
        # outputs are concatenated below to [batch_size, seq_len, 2 * rnn_hidden_units]
        lstm_pool_context, lstm_out_context = BiLSTM(
            context, context_mask, self.hidden_units,
            tf.cond(is_train, lambda: self.output_dropout_keep_prob, lambda: 1.0),
            tf.cond(is_train, lambda: self.input_dropout_keep_prob, lambda: 1.0),
            tf.cond(is_train, lambda: self.state_dropout_keep_prob, lambda: 1.0),
            n_layers=self.n_layers, residual=False, use_last=True,
            seed=self.seed, reuse=False)
        lstm_out_context = tf.concat(
            [lstm_out_context[0], lstm_out_context[1]], 2,
            name='lstm_out_context')
    with tf.variable_scope('encode_question'):
        lstm_pool_question, lstm_out_question = BiLSTM(
            question, question_mask, self.hidden_units,
            tf.cond(is_train, lambda: self.output_dropout_keep_prob, lambda: 1.0),
            tf.cond(is_train, lambda: self.input_dropout_keep_prob, lambda: 1.0),
            tf.cond(is_train, lambda: self.state_dropout_keep_prob, lambda: 1.0),
            n_layers=self.n_layers, residual=False, use_last=True,
            seed=self.seed, reuse=False)
        lstm_out_question = tf.concat(
            [lstm_out_question[0], lstm_out_question[1]], 2,
            name='lstm_out_question')
    return [lstm_out_context, lstm_pool_context], \
           [lstm_out_question, lstm_pool_question]
def run_match_lstm(self, context_out, question_out, context_len, question_len, is_train):
    qc_att = scaled_dot_product_attention(context_out, question_out,
                                          memory_len=question_len,
                                          hidden=self.hidden_units,
                                          keep_prob=self.keep_prob,
                                          is_train=is_train)
    lstm_out = BiLSTM(
        qc_att, context_len, self.hidden_units,
        output_dropout_keep_prob=tf.cond(is_train, lambda: self.output_keep_prob, lambda: 1.0),
        input_dropout_keep_prob=tf.cond(is_train, lambda: self.input_keep_prob, lambda: 1.0),
        state_dropout_keep_prob=tf.cond(is_train, lambda: self.state_keep_prob, lambda: 1.0),
        use_last=False, seed=self.seed, reuse=False)
    lstm_out = tf.concat([lstm_out[0], lstm_out[1]], 2, name='lstm_out')
    return lstm_out
def _gen_left_right_ctx(self):
    self.layers = {}
    self.attention_dims = 50
    self.rnn_size = 150
    self.layers['BiLSTM'] = BiLSTM(self.rnn_size)
    self.layers['att_weights'] = {
        'h_m': tf.Variable(tf.truncated_normal([self.args.word_dim, self.attention_dims], stddev=0.01)),
        'h1': tf.Variable(tf.truncated_normal([2 * self.rnn_size, self.attention_dims], stddev=0.01)),
        'h2': tf.Variable(tf.truncated_normal([self.attention_dims, 1], stddev=0.01)),
    }
    self.right_feature, _, _ = self.layers['BiLSTM'](self.ment_sent_right_ctx_embed)
    self.left_feature, _, _ = self.layers['BiLSTM'](self.ment_sent_left_ctx_embed)
    lstm_feature = tf.concat([self.right_feature, self.left_feature], 1)
    # additive attention over the concatenated left/right context states,
    # conditioned on the mention surface feature
    att_w_m = tf.einsum('aij,jk->aik',
                        tf.expand_dims(self.ment_surface_feature, 1),
                        self.layers['att_weights']['h_m'])
    att_w1 = tf.nn.tanh(tf.einsum('aij,jk->aik', lstm_feature,
                                  self.layers['att_weights']['h1']) + att_w_m)
    self.att_w2 = tf.nn.softmax(
        tf.einsum('aij,jk->aik', att_w1, self.layers['att_weights']['h2'])[:, :, 0], -1)
    att_w = tf.tile(tf.expand_dims(self.att_w2, -1), [1, 1, 2 * self.rnn_size])
    lstm_feature = tf.reduce_sum(tf.multiply(lstm_feature, att_w), 1)
    lstm_feature = tf.nn.dropout(lstm_feature, self.keep_prob)
    print('lstm_feature:', lstm_feature)
    return lstm_feature
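Shape-wise, the attention pooling above computes alpha = softmax(tanh(H W1 + m Wm) w2) and a weighted sum of H. The following standalone NumPy sketch is an illustration, not part of the original model; the sizes (batch 2, sequence length 7, word_dim 100) are assumptions.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

a, i, d, k = 2, 7, 300, 50            # batch, seq len, 2 * rnn_size, attention_dims
H = np.random.randn(a, i, d)          # lstm_feature
m = np.random.randn(a, 100)           # ment_surface_feature (word_dim = 100, hypothetical)
W_m = np.random.randn(100, k)
W_1 = np.random.randn(d, k)
w_2 = np.random.randn(k, 1)

scores = np.tanh(H @ W_1 + (m @ W_m)[:, None, :]) @ w_2   # (a, i, 1), broadcast over i
alpha = softmax(scores[:, :, 0], axis=-1)                 # (a, i) attention weights
pooled = (H * alpha[:, :, None]).sum(axis=1)              # (a, d) attended summary
assert pooled.shape == (a, d)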
def run_lstm(self, context_out, question_pool, context_len, is_train):
    # tile the pooled question representation and concat with the context
    q_rep = tf.expand_dims(question_pool, 1)  # (batch_size, 1, D)
    encoded_passage_shape = tf.shape(context_out)[1]
    q_rep = tf.tile(q_rep, [1, encoded_passage_shape, 1])
    q_c_rep = tf.concat([context_out, q_rep], axis=-1)
    with tf.variable_scope('lstm_') as scope:
        lstm_out = BiLSTM(
            q_c_rep, context_len, self.hidden_units,
            tf.cond(is_train, lambda: self.output_keep_prob, lambda: 1.0),
            tf.cond(is_train, lambda: self.input_keep_prob, lambda: 1.0),
            tf.cond(is_train, lambda: self.state_keep_prob, lambda: 1.0),
            use_last=False, seed=self.seed, reuse=False)
        lstm_out = tf.concat([lstm_out[0], lstm_out[1]], 2, name='lstm_out')
    return lstm_out
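Taken together, encode / run_match_lstm / run_lstm suggest the wiring below. This is a hypothetical sketch, not the original code: the placeholder names, the 300-dim embeddings, and the assumption that the BiLSTM "mask" arguments also accept sequence lengths are all mine.

# given `model`, an instance of the class defining the three methods above
context = tf.placeholder(tf.float32, [None, None, 300])   # embedded passage
question = tf.placeholder(tf.float32, [None, None, 300])  # embedded question
context_len = tf.placeholder(tf.int32, [None])
question_len = tf.placeholder(tf.int32, [None])
is_train = tf.placeholder(tf.bool, [])

(context_out, context_pool), (question_out, question_pool) = model.encode(
    [context, question], [context_len, question_len], is_train)

# question-aware passage states, then a second BiLSTM pass conditioned
# on the pooled question vector
match_out = model.run_match_lstm(context_out, question_out,
                                 context_len, question_len, is_train)
final_out = model.run_lstm(match_out, question_pool, context_len, is_train)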
def main_graph(self, trained_model, scope, emb_dim, cell, rnn_dim, rnn_num,
               drop_out=0.5, emb=None):
    if trained_model is not None:
        param_dic = {
            'nums_chars': self.nums_chars,
            'nums_tags': self.nums_tags,
            'crf': self.crf,
            'emb_dim': emb_dim,
            'cell': cell,
            'rnn_dim': rnn_dim,
            'rnn_num': rnn_num,
            'drop_out': drop_out,
            'buckets_char': self.buckets_char,
            'ngram': self.ngram,
            'is_space': self.is_space,
            'sent_seg': self.sent_seg,
            'emb_path': self.emb_path,
            'tag_scheme': self.tag_scheme
        }
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    batch_size_h = tf.placeholder(tf.int32, [], name='batch_size_holder')
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.batch_size_h = batch_size_h
    self.drop_out = dr
    self.drop_out_v = drop_out

    # pdb.set_trace()
    self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb,
                                    name='emb_layer')

    if self.ngram is not None:
        ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim,
                               weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))

    with tf.variable_scope('BiRNN'):
        if cell == 'gru':
            fw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  # forward
            bw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  # backward
        else:
            fw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim, state_is_tuple=True)

        if rnn_num > 1:
            fw_rnn_cell = tf.contrib.rnn.MultiRNNCell([fw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)
            bw_rnn_cell = tf.contrib.rnn.MultiRNNCell([bw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)

    output_wrapper = HiddenLayer(rnn_dim * 2, self.nums_tags,
                                 activation='linear', name='hidden')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            scope.reuse_variables()
        t1 = time()
        batch_size = self.real_batches[idx]
        input_v1 = tf.placeholder(tf.int32, [None, bucket], name='input_1' + str(bucket))
        input_v2 = tf.placeholder(tf.int32, [None, bucket], name='input_2' + str(bucket))
        self.input_v1.append([input_v1])
        self.input_v2.append([input_v2])
        #output = None
        output = []
        for i in range(self.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                # each GPU tower gets a batch_size_h-sized slice of the batch
                input_1 = input_v1[i * batch_size_h:(i + 1) * batch_size_h]
                input_2 = input_v2[i * batch_size_h:(i + 1) * batch_size_h]
                emb_set1 = []
                emb_set2 = []
                word_out1 = self.emb_layer(input_1)
                word_out2 = self.emb_layer(input_2)
                emb_set1.append(word_out1)
                emb_set2.append(word_out2)
                # if self.ngram is not None:
                #     for i in range(len(self.ngram)):
                #         input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                #         self.input_v[-1].append(input_g)
                #         gram_out = self.gram_layers[i](input_g)
                #         emb_set.append(gram_out)
                if len(emb_set1) > 1:
                    emb_out1 = tf.concat(axis=2, values=emb_set1)
                    emb_out2 = tf.concat(axis=2, values=emb_set2)
                else:
                    emb_out1 = emb_set1[0]
                    emb_out2 = emb_set2[0]
                emb_out1 = DropoutLayer(dr)(emb_out1)
                emb_out2 = DropoutLayer(dr)(emb_out2)
                rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell,
                                 p=dr, name='BiLSTM' + str(bucket),
                                 scope='BiRNN')(emb_out1, emb_out2, input_v1)
                output_g = output_wrapper(rnn_out)
                # if output == None:
                #     output = output_g
                # else:
                #     output = tf.concat([output, output_g], axis=0)
                #pdb.set_trace()
                output.append(output_g)
        self.output.append([output])
        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket - 1], name='tags' + str(bucket))
        ])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v1) == len(self.output)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
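The per-GPU slices above index a placeholder's batch dimension with the scalar batch_size_h placeholder, so tower i sees rows [i * batch_size_h, (i + 1) * batch_size_h). The same indexing in plain NumPy, with hypothetical sizes, just to make the sharding explicit:

import numpy as np

num_gpus = 2
batch_size_h = 3                                   # per-GPU batch size fed at run time
batch = np.arange(num_gpus * batch_size_h * 4).reshape(6, 4)  # (6, bucket=4)

shards = [batch[i * batch_size_h:(i + 1) * batch_size_h] for i in range(num_gpus)]
assert all(s.shape == (batch_size_h, 4) for s in shards)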
max_epochs = 30
lstm_dim = 250
arc_hidden_dim = 100
label_hidden_dim = 100

pc = dy.ParameterCollection()
# embeddings = Embeddings(pc, [(len(index[FORM])+1, 100), (len(index[XPOS])+1, 25)])
# input_dim = embeddings.dim
input_dim = 125
num_labels = len(index[DEPREL])

form_embeddings = pc.add_lookup_parameters((len(index[FORM]) + 1, 100))
pos_embeddings = pc.add_lookup_parameters((len(index[XPOS]) + 1, 25))
bilstm = BiLSTM(pc, input_dim, lstm_dim, 2)

arc_mlp = MLP()
label_mlp = MLP()
arc_mlp.WH = pc.add_parameters((arc_hidden_dim, lstm_dim))
arc_mlp.WD = pc.add_parameters((arc_hidden_dim, lstm_dim))
arc_mlp.b1 = pc.add_parameters((arc_hidden_dim))
arc_mlp.b2 = pc.add_parameters((1, arc_hidden_dim))
label_mlp.WH = pc.add_parameters((label_hidden_dim, lstm_dim))
label_mlp.WD = pc.add_parameters((label_hidden_dim, lstm_dim))
label_mlp.b1 = pc.add_parameters((label_hidden_dim))
label_mlp.b2 = pc.add_parameters((num_labels, label_hidden_dim))

def predict_arc(head, dep, h, WH, WD):
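The snippet breaks off at predict_arc. Given the parameter shapes defined above (WH, WD: arc_hidden_dim x lstm_dim; b2: 1 x arc_hidden_dim), a plausible body is the one-hidden-layer MLP scorer below. This completion is an assumption, not the original code; it reads arc_mlp.b1 and arc_mlp.b2 from the enclosing scope.

# Hypothetical completion of the truncated function above (assumption):
# score(head, dep) = b2 * tanh(WH * h[head] + WD * h[dep] + b1)
def predict_arc(head, dep, h, WH, WD):
    hidden = dy.tanh(WH * h[head] + WD * h[dep] + arc_mlp.b1)
    return arc_mlp.b2 * hidden   # 1-dimensional arc score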
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num,
               drop_out=0.5, rad_dim=30, emb=None, ng_embs=None, pixels=None,
               con_width=None, filters=None, pooling_size=None):
    if trained_model is not None:
        param_dic = {}
        param_dic['nums_chars'] = self.nums_chars
        param_dic['nums_tags'] = self.nums_tags
        param_dic['tag_scheme'] = self.tag_scheme
        param_dic['graphic'] = self.graphic
        param_dic['pic_size'] = self.pic_size
        param_dic['word_vec'] = self.word_vec
        param_dic['radical'] = self.radical
        param_dic['crf'] = self.crf
        param_dic['emb_dim'] = emb_dim
        param_dic['gru'] = gru
        param_dic['rnn_dim'] = rnn_dim
        param_dic['rnn_num'] = rnn_num
        param_dic['drop_out'] = drop_out
        param_dic['filter_size'] = con_width
        param_dic['filters'] = filters
        param_dic['pooling_size'] = pooling_size
        param_dic['font'] = self.font
        param_dic['buckets_char'] = self.buckets_char
        param_dic['ngram'] = self.ngram
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    if self.word_vec:
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim,
                                        weights=emb, name='emb_layer')

    if self.radical:
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

    if self.ngram is not None:
        if ng_embs is not None:
            assert len(ng_embs) == len(self.ngram)
        else:
            ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim,
                                                   weights=ng_embs[i],
                                                   name=str(i + 2) + 'gram_layer'))

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
        None, None, None, None, None, None

    if self.graphic:
        self.input_p = []
        assert pixels is not None and filters is not None \
               and pooling_size is not None and con_width is not None
        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))
        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'),
                                         name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'),
                                       name='wrapper_p1')
        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)
        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'),
                                         name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'),
                                       name='wrapper_p2')
        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)
        wrapper_dense = TimeDistributed(HiddenLayer(p_size_2 * p_size_2 * filters, 100,
                                                    activation='tanh', name='conv_dense'),
                                        name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)

    output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2, self.nums_tags[0],
                                                 activation='linear', name='hidden'),
                                     name='wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])
        emb_set = []
        if self.word_vec:
            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)
        if self.radical:
            input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_r)
            radical_out = self.radical_layer(input_r)
            emb_set.append(radical_out)
        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)
        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)
            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
            pix_out = tf.unpack(pix_out, axis=1)
            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)
            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)
            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unpack(pooling_out, axis=1)
            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)
            emb_set.append(graphic_out)
        if len(emb_set) > 1:
            emb_out = tf.concat(2, emb_set)
            emb_out = tf.unpack(emb_out)
        else:
            emb_out = emb_set[0]
        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)
        output = output_wrapper(rnn_out)
        output_c = tf.pack(output, axis=1)
        self.output.append([output_c])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v) == len(self.output) and \
           len(self.output) == len(self.output_) and \
           len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num,
               drop_out=0.5, emb=None):
    if trained_model is not None:
        param_dic = {'nums_chars': self.nums_chars,
                     'nums_tags': self.nums_tags,
                     'crf': self.crf,
                     'emb_dim': emb_dim,
                     'gru': gru,
                     'rnn_dim': rnn_dim,
                     'rnn_num': rnn_num,
                     'drop_out': drop_out,
                     'buckets_char': self.buckets_char,
                     'ngram': self.ngram,
                     'is_space': self.is_space,
                     'sent_seg': self.sent_seg,
                     'emb_path': self.emb_path,
                     'tag_scheme': self.tag_scheme}
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out
    self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb,
                                    name='emb_layer')

    if self.ngram is not None:
        ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim,
                                                   weights=ng_embs[i],
                                                   name=str(i + 2) + 'gram_layer'))

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)

    output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2, self.nums_tags,
                                                 activation='linear', name='hidden'),
                                     name='wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])
        emb_set = []
        word_out = self.emb_layer(input_v)
        emb_set.append(word_out)
        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)
        if len(emb_set) > 1:
            emb_out = tf.concat(2, emb_set)
        else:
            emb_out = emb_set[0]
        emb_out = DropoutLayer(dr)(emb_out)
        emb_out = tf.unpack(emb_out)
        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)
        output = output_wrapper(rnn_out)
        output_c = tf.pack(output, axis=1)
        self.output.append([output_c])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v) == len(self.output) and \
           len(self.output) == len(self.output_) and \
           len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def build_model(self):
    with tf.variable_scope("Input_Embedding_Layer"):
        with tf.variable_scope("Char_Embedding_Layer"):
            # char embedding
            ## lookup
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.contc_input),
                [-1, self.char_limit, self.char_dim])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.quesc_input),
                [-1, self.char_limit, self.char_dim])
            ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb)
            qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb)

            ## BiLSTM (weight-shared ??)
            ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_filters // 2,
                                    dropout=self.dropout_rnn, name='char_lstm')
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_filters])
            qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_filters])

        with tf.variable_scope("Word_Embedding_Layer"):
            # word embedding
            ## lookup
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input)
            c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb)
            q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb)

        # cove features
        ## re-embed the word embeddings (c_emb, q_emb) with CoVe
        if self.use_cove != 0:
            if self.use_cove == 2:
                self.cove_cont = tf.stop_gradient(self.cove_model(c_emb))  # [bs, c_len, 2, 600]
                self.cove_ques = tf.stop_gradient(self.cove_model(q_emb))  # [bs, q_len, 2, 600]
            with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE):
                cove_context_input = CoveCombineLayer(self.cove_cont, 'input')
                cove_question_input = CoveCombineLayer(self.cove_ques, 'input')
            c_emb = tf.concat([c_emb, cove_context_input], axis=-1)
            q_emb = tf.concat([q_emb, cove_question_input], axis=-1)

        # elmo features
        ## re-embed the word embeddings (c_emb, q_emb) with ELMo
        if self.use_elmo != 0:
            with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE):
                elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input')
                elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input')
                elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output')
                elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output')
            c_emb = tf.concat([c_emb, elmo_context_input], axis=-1)
            q_emb = tf.concat([q_emb, elmo_question_input], axis=-1)

        if self.use_feat:
            c_emb = tf.concat([c_emb, self.cont_feat], axis=-1)  ## concat [context_pos, context_ner, context_match]
            q_emb = tf.concat([q_emb, self.ques_feat], axis=-1)  ## concat [ques_pos, ques_ner, ques_match]

        # combine embedding feats
        ## concat word_embedding, char_embedding
        c_emb = tf.concat([c_emb, ch_emb], axis=-1)
        q_emb = tf.concat([q_emb, qh_emb], axis=-1)

    # BiLSTM Embedding (weight-shared ??)
    with tf.variable_scope("BiLSTM_Embedding_Layer"):
        c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2,
                              dropout=self.dropout_rnn, name='encoder')

    with tf.variable_scope("Iterative_Reattention_Aligner"):
        self.Lambda = tf.get_variable('Lambda', dtype=tf.float32,
                                      initializer=self.init_lambda)
        with tf.variable_scope("Aligning_Block1"):
            R, Z1, E, B = align_block(u=c_emb, v=q_emb,
                                      c_mask=self.c_mask, q_mask=self.q_mask,
                                      Lambda=self.Lambda, filters=self.filters,
                                      dropout=self.dropout_rnn)
            R = tf.nn.dropout(R, 1.0 - self.dropout_att)
        with tf.variable_scope("Aligning_Block2"):
            R, Z2, E, B = align_block(u=R, v=q_emb,
                                      c_mask=self.c_mask, q_mask=self.q_mask,
                                      E_0=E, B_0=B,
                                      Lambda=self.Lambda, filters=self.filters,
                                      dropout=self.dropout_rnn)
            R = tf.nn.dropout(R, 1.0 - self.dropout_att)
        with tf.variable_scope("Aligning_Block3"):
            R, Z3, E, B = align_block(u=R, v=q_emb,
                                      c_mask=self.c_mask, q_mask=self.q_mask,
                                      E_0=E, B_0=B, Z_0=[Z1, Z2],
                                      Lambda=self.Lambda, filters=self.filters,
                                      dropout=self.dropout_rnn)
            R = tf.nn.dropout(R, 1.0 - self.dropout_att)

    with tf.variable_scope("Answer_Pointer"):
        # logits
        if self.use_elmo != 0:
            elmo_output_feats = ElmoAttention(
                [elmo_context_output, elmo_question_output],
                self.c_maxlen, self.q_maxlen, self.q_mask, self.dropout)
            R = tf.concat([R, elmo_output_feats], axis=-1)
        s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask)
        s = tf.nn.dropout(s, 1 - self.dropout)
        logits1 = start_logits(R, s, mask=self.c_mask, filters=self.filters,
                               name='Start_Pointer')  # [bs, c_len]
        logits2 = end_logits(R, logits1, s, mask=self.c_mask, filters=self.filters,
                             name='End_Pointer')  # [bs, c_len]

    with tf.variable_scope("Loss_Layer"):
        # maximum-likelihood (ML) loss for dataset V2.0
        start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                                labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                              labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)

        # l2 loss
        if self.l2_norm is not None:
            decay_costs = []
            for var in tf.trainable_variables():
                decay_costs.append(tf.nn.l2_loss(var))
            self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs))

        # RL loss
        if self.use_rlloss:
            with tf.variable_scope("Reinforcement_Loss"):
                self.rl_loss, _, _ = rl_loss(logits1, logits2, self.y_start,
                                             self.y_end, self.c_maxlen)
                self.loss += (self.rlw * self.rl_loss)

    with tf.variable_scope('Output_Layer'):
        softmax_start_scores = tf.nn.softmax(logits1)
        softmax_end_scores = tf.nn.softmax(logits2)
        outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2),
                          tf.expand_dims(softmax_end_scores, axis=1))
        outer = tf.matrix_band_part(outer, 0, self.ans_limit)

        def position_encoding(x):
            import math
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    if j - i > 5:
                        x[i][j] = float(1.0 / math.log(j - i + 1))
            return x

        mask_mat = tf.ones((self.c_maxlen, self.c_maxlen))
        mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32),
                                  axis=0)
        mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1])
        outer_masked = outer * mask_mat
        self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2), axis=1)
        self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1), axis=1)
class MSTParser(object):
    __metaclass__ = ABCMeta  # Python 2 abstract base class declaration

    def __init__(self, model, **kwargs):
        self.pc = model.add_subcollection()
        self.kwargs = kwargs
        basename = kwargs.get("basename")
        index = read_index(basename)
        self._num_labels = len(index[DEPREL])
        lstm_num_layers = kwargs.get("lstm_num_layers", 2)
        lstm_dim = kwargs.get("lstm_dim", 250)
        self.embeddings = Embeddings.init_from_word2vec(self.pc, basename, FIELDS,
                                                        index=index)
        input_dim = self.embeddings.dim
        self.lstm = BiLSTM(self.pc, input_dim, lstm_dim, lstm_num_layers)
        # trailing comma is deliberate: dynet's save/load protocol expects
        # `spec` to be a tuple
        self.spec = kwargs,

    def transduce(self, feats):
        x = self.embeddings(feats)
        h = self.lstm(x)
        return h

    @abstractmethod
    def _predict_arc(self, head, dep, h):
        raise NotImplementedError()

    def predict_arcs(self, h):
        num_nodes = len(h)

        def _predict_heads(dep):
            scores = [
                self._predict_arc(head, dep, h) if head != dep else dy.zeros(1)
                for head in range(num_nodes)
            ]
            return dy.concatenate(scores)

        heads = [_predict_heads(dep) for dep in range(1, num_nodes)]
        return heads

    @abstractmethod
    def _predict_labels(self, head, dep, h):
        raise NotImplementedError()

    def predict_labels(self, heads, h):
        num_nodes = len(h)
        labels = [
            self._predict_labels(heads[dep - 1], dep, h)
            for dep in range(1, num_nodes)
        ]
        return labels

    def _parse_heads(self, heads, h):
        scores = self.predict_arcs(h)
        # weights[head][dep]; row 0 stands for the root as a dependent
        weights = np.transpose(
            np.vstack([np.zeros(len(h))] + [s.npvalue() for s in scores]))
        parse_nonprojective(weights, heads)

    def _parse_labels(self, heads, labels, h):
        scores = self.predict_labels(heads, h)
        labels[:] = [
            np.argmax(scores[i].npvalue()) + 1 for i in range(len(scores))
        ]

    def parse(self, feats):
        dy.renew_cg()
        x = self.embeddings(feats)
        h = self.lstm(x)
        tree = DepTree(len(x))
        self._parse_heads(tree.heads, h)
        self._parse_labels(tree.heads, tree.labels, h)
        return tree

    def disable_dropout(self):
        self.embeddings.disable_dropout()
        self.lstm.disable_dropout()

    def enable_dropout(self):
        self.embeddings.set_dropout(self.kwargs.get("input_dropout", 0))
        self.lstm.set_dropout(self.kwargs.get("lstm_dropout", 0))

    def param_collection(self):
        return self.pc
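MSTParser is abstract, so using it means instantiating a concrete subclass that supplies _predict_arc and _predict_labels. A hedged usage sketch (the subclass name, basename, and feats are hypothetical, not from the original source):

# Hypothetical usage: `MyParser` stands in for a concrete subclass that
# implements the two abstract scoring hooks.
pc = dy.ParameterCollection()
parser = MyParser(pc, basename="data/en", lstm_dim=250, lstm_num_layers=2)
parser.disable_dropout()        # inference mode
tree = parser.parse(feats)      # feats: one sentence's feature columns
print(tree.heads, tree.labels)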
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, fnn_dim,
               window_size, drop_out=0.5, rad_dim=30, emb=None, ng_embs=None,
               pixels=None, con_width=None, filters=None, pooling_size=None):
    if trained_model is not None:
        param_dic = {}
        param_dic['nums_chars'] = self.nums_chars
        param_dic['nums_tags'] = self.nums_tags
        param_dic['tag_scheme'] = self.tag_scheme
        param_dic['graphic'] = self.graphic
        param_dic['pic_size'] = self.pic_size
        param_dic['word_vec'] = self.word_vec
        param_dic['radical'] = self.radical
        param_dic['crf'] = self.crf
        param_dic['emb_dim'] = emb_dim
        param_dic['gru'] = gru
        param_dic['rnn_dim'] = rnn_dim
        param_dic['rnn_num'] = rnn_num
        param_dic['fnn_dim'] = fnn_dim
        param_dic['window_size'] = window_size
        param_dic['drop_out'] = drop_out
        param_dic['filter_size'] = con_width
        param_dic['filters'] = filters
        param_dic['pooling_size'] = pooling_size
        param_dic['font'] = self.font
        param_dic['buckets_char'] = self.buckets_char
        param_dic['ngram'] = self.ngram
        param_dic['mode'] = self.mode
        #print param_dic
        if self.metric == 'All':
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    #concat_emb_dim = emb_dim * 2
    concat_emb_dim = 0

    if self.word_vec:
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim,
                                        weights=emb, name='emb_layer')
        concat_emb_dim += emb_dim

    if self.radical:
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')
        concat_emb_dim += rad_dim

    if self.ngram is not None:
        if ng_embs is not None:
            assert len(ng_embs) == len(self.ngram)
        else:
            ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))
            concat_emb_dim += emb_dim

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2 = None, None, None
    wrapper_mp_2, wrapper_dense, wrapper_dr = None, None, None

    if self.graphic:
        self.input_p = []
        assert pixels is not None and filters is not None \
               and pooling_size is not None and con_width is not None
        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))
        wrapper_conv_1 = Convolution(con_width, 1, filters, name='conv_1')
        wrapper_mp_1 = Maxpooling(pooling_size, pooling_size, name='pooling_1')
        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)
        wrapper_conv_2 = Convolution(con_width, filters, filters, name='conv_2')
        wrapper_mp_2 = Maxpooling(pooling_size, pooling_size, name='pooling_2')
        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)
        wrapper_dense = HiddenLayer(p_size_2 * p_size_2 * filters, 100,
                                    activation='tanh', name='conv_dense')
        wrapper_dr = DropoutLayer(self.drop_out)
        concat_emb_dim += 100

    fw_rnn_cell, bw_rnn_cell = None, None

    if self.mode == 'RNN':
        with tf.variable_scope('BiRNN'):
            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            if rnn_num > 1:
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num,
                                                          state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num,
                                                          state_is_tuple=True)
        output_wrapper = HiddenLayer(rnn_dim * 2, self.nums_tags[0],
                                     activation='linear', name='out_wrapper')
        fnn_weights, fnn_bias = None, None
    else:
        with tf.variable_scope('FNN'):
            fnn_weights = tf.get_variable(
                'conv_w', [2 * window_size + 1, concat_emb_dim, 1, fnn_dim])
            fnn_bias = tf.get_variable('conv_b', [fnn_dim],
                                       initializer=tf.constant_initializer(0.1))
        output_wrapper = HiddenLayer(fnn_dim, self.nums_tags[0],
                                     activation='linear', name='out_wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])
        emb_set = []
        if self.word_vec:
            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)
        if self.radical:
            input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_r)
            radical_out = self.radical_layer(input_r)
            emb_set.append(radical_out)
        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)
        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)
            pix_out = tf.reshape(input_p, [-1, pixel_dim, pixel_dim, 1])
            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)
            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)
            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2,
                                     [-1, bucket, p_size_2 * p_size_2 * filters])
            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)
            emb_set.append(graphic_out)
        if len(emb_set) > 1:
            emb_out = tf.concat(axis=2, values=emb_set)
        else:
            emb_out = emb_set[0]
        if self.mode == 'RNN':
            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                             name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)
            output = output_wrapper(rnn_out)
        else:
            # pad the sentence, then slide a (2 * window_size + 1)-wide
            # convolution over the embeddings
            emb_out = tf.pad(emb_out, [[0, 0], [window_size, window_size], [0, 0]])
            emb_out = tf.reshape(emb_out, [-1, bucket + 2 * window_size, concat_emb_dim, 1])
            conv_out = tf.nn.conv2d(emb_out, fnn_weights, [1, 1, 1, 1],
                                    padding='VALID') + fnn_bias
            fnn_out = tf.nn.tanh(conv_out)
            fnn_out = tf.reshape(fnn_out, [-1, bucket, fnn_dim])
            output = output_wrapper(fnn_out)
        self.output.append([output])
        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))
        ])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) \
           and len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num,
               drop_out=0.5, rad_dim=30, emb=None, ngram_embedding=None,
               pixels=None, con_width=None, filters=None, pooling_size=None):
    """
    :param trained_model: path where the model is saved
    :param scope:
    :param emb_dim:
    :param gru:
    :param rnn_dim:
    :param rnn_num:
    :param drop_out:
    :param rad_dim: radical embedding dimension
    :param emb:
    :param ngram_embedding: pretrained ngram embedding file
    :param pixels:
    :param con_width:
    :param filters:
    :param pooling_size:
    :return:
    """
    if trained_model is not None:
        param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags,
                     'tag_scheme': self.tag_scheme, 'graphic': self.graphic,
                     'pic_size': self.pic_size, 'word_vec': self.word_vec,
                     'radical': self.radical, 'crf': self.crf, 'emb_dim': emb_dim,
                     'gru': gru, 'rnn_dim': rnn_dim, 'rnn_num': rnn_num,
                     'drop_out': drop_out, 'filter_size': con_width,
                     'filters': filters, 'pooling_size': pooling_size,
                     'font': self.font, 'buckets_char': self.buckets_char,
                     'ngram': self.ngram}
        print "RNN dimension is %d" % rnn_dim
        print "RNN number is %d" % rnn_num
        print "Character embedding size is %d" % emb_dim
        print "Ngram embedding dimension is %d" % emb_dim
        # save the model hyperparameters
        if self.metric == 'All':
            # rindex() returns the last occurrence of the substring,
            # used here to split off the model file name
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # character embedding layer
    # why add 500 to the number of characters?
    # emb_dim is the per-character feature dimension, settable on the command line
    # weights holds pretrained character embeddings, settable on the command line
    if self.word_vec:
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim,
                                        weights=emb, name='emb_layer')

    # radical embeddings
    # the Kangxi Dictionary lists 214 radicals; only those of common characters
    # are used, with rare and non-Chinese characters mapped to two extra special
    # symbols, giving 216 radicals in total
    if self.radical:
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

    if self.ngram is not None:
        if ngram_embedding is not None:
            assert len(ngram_embedding) == len(self.ngram)
        else:
            ngram_embedding = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim,
                                                   weights=ngram_embedding[i],
                                                   name=str(i + 2) + 'gram_layer'))

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
        None, None, None, None, None, None

    if self.graphic:
        # glyph (image) features require a CNN
        self.input_p = []
        assert pixels is not None and filters is not None \
               and pooling_size is not None and con_width is not None
        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))
        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'),
                                         name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'),
                                       name='wrapper_p1')
        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)
        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'),
                                         name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'),
                                       name='wrapper_p2')
        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)
        wrapper_dense = TimeDistributed(
            HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh',
                        name='conv_dense'),
            name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)

    # hidden layer on top of the concatenated forward and backward RNN outputs,
    # hence input dimension rnn_dim * 2; output dimension is the number of tags
    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'),
        name='wrapper')

    # define model for each bucket
    # sentences in different buckets have different lengths, so each bucket
    # gets its own model; bucket = sentence length within the bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # scope is tf.variable_scope("tagger", reuse=None, initializer=initializer);
            # reuse only needs to be set once, all later buckets then share variables
            scope.reuse_variables()
        t1 = time()
        # input sentences as one-hot id vectors, shape = (batch_size, sentence length)
        input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_sentences])
        emb_set = []
        if self.word_vec:
            # look up the character embeddings for the ids;
            # word_out: shape = (batch_size, sentence length, embedding dim (64))
            word_out = self.emb_layer(input_sentences)
            emb_set.append(word_out)
        if self.radical:
            # radical ids, shape = (batch_size, sentence length)
            input_radicals = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_radicals)
            radical_out = self.radical_layer(input_radicals)
            emb_set.append(radical_out)
        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)
        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)
            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)
            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)
            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unstack(pooling_out, axis=1)
            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)
            emb_set.append(graphic_out)
        if self.window_size > 1:
            # multi-width convolutions over the character embeddings,
            # followed by k-max selection and a highway layer
            padding_size = int(np.floor(self.window_size / 2))
            word_padded = tf.pad(word_out, [[0, 0], [padding_size, padding_size], [0, 0]],
                                 'CONSTANT')
            Ws = []
            for q in range(1, self.window_size + 1):
                Ws.append(tf.get_variable("W_%d" % q, shape=[q * emb_dim, self.filters_number]))
            b = tf.get_variable("b", shape=[self.filters_number])
            z = [None for _ in range(0, bucket)]
            for q in range(1, self.window_size + 1):
                for i in range(padding_size, bucket + padding_size):
                    low = i - int(np.floor((q - 1) / 2))
                    high = i + int(np.ceil((q + 1) / 2))
                    x = word_padded[:, low, :]
                    for j in range(low + 1, high):
                        x = tf.concat(values=[x, word_padded[:, j, :]], axis=1)
                    z_iq = tf.tanh(tf.nn.xw_plus_b(x, Ws[q - 1], b))
                    if z[i - padding_size] is None:
                        z[i - padding_size] = z_iq
                    else:
                        z[i - padding_size] = tf.concat([z[i - padding_size], z_iq], axis=1)
            z = tf.stack(z, axis=1)
            values, indices = tf.nn.top_k(z, sorted=False, k=emb_dim)
            # highway layer
            X = tf.unstack(word_out, axis=1)
            Conv_X = tf.unstack(values, axis=1)
            X_hat = []
            W_t = tf.get_variable("W_t", shape=[emb_dim, emb_dim])
            b_t = tf.get_variable("b_t", shape=[emb_dim])
            for x, conv_x in zip(X, Conv_X):
                # T_x is the transform gate: output = conv_x * T_x + x * (1 - T_x)
                T_x = tf.sigmoid(tf.nn.xw_plus_b(x, W_t, b_t))
                X_hat.append(tf.multiply(conv_x, T_x) + tf.multiply(x, 1 - T_x))
            X_hat = tf.stack(X_hat, axis=1)
            emb_set.append(X_hat)
        if len(emb_set) > 1:
            # concat all feature embeddings (characters, radicals, n-grams, glyphs, ...)
            emb_out = tf.concat(axis=2, values=emb_set)
        else:
            emb_out = emb_set[0]

        # rnn_out is the concatenation of the forward and backward RNN outputs
        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket),
                         scope='BiRNN')(self.highway(emb_out, "tag"), input_sentences)

        # final fully-connected layer, Wx + b
        output = output_wrapper(rnn_out)

        # why [output] rather than output?
        self.output.append([output])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx

        # language model
        lm_rnn_dim = rnn_dim
        with tf.variable_scope('LM-BiRNN'):
            if gru:
                lm_fw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                lm_bw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
            else:
                lm_fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
            if rnn_num > 1:
                lm_fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_fw_rnn_cell] * rnn_num,
                                                             state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_bw_rnn_cell] * rnn_num,
                                                             state_is_tuple=True)
        lm_rnn_output = BiLSTM(lm_rnn_dim, fw_cell=lm_fw_rnn_cell, bw_cell=lm_bw_rnn_cell,
                               p=dr, name='LM-BiLSTM' + str(bucket),
                               scope='LM-BiRNN')(self.highway(emb_set[0]), input_sentences)
        lm_output_wrapper = TimeDistributed(
            HiddenLayer(lm_rnn_dim * 2, self.nums_chars + 2, activation='linear',
                        name='lm_hidden'),
            name='lm_wrapper')
        lm_final_output = lm_output_wrapper(lm_rnn_output)
        self.lm_predictions.append([lm_final_output])
        self.lm_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket],
                                                     name='lm_targets' + str(bucket))])

        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert \
        len(self.input_v) == len(self.output) and \
        len(self.output) == len(self.output_) and \
        len(self.lm_predictions) == len(self.lm_groundtruthes) and \
        len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def build_model(self):
    with tf.variable_scope("Input_Embedding_Layer"):
        with tf.variable_scope("Char_Embedding_Layer"):
            # char embedding
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.contc_input),
                [-1, self.char_limit, self.char_dim])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.quesc_input),
                [-1, self.char_limit, self.char_dim])
            ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb)
            qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb)
            ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_dim // 2,
                                    dropout=self.dropout_rnn, name='char_lstm',
                                    return_state=True)
            ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_dim])
            qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_dim])

        with tf.variable_scope("Word_Embedding_Layer"):
            # word embedding
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input)
            c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb)
            q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb)

        # cove features
        if self.use_cove != 0:
            if self.use_cove == 2:
                self.cove_cont = tf.stop_gradient(self.cove_model(c_emb))  # [bs, c_len, 2, 600]
                self.cove_ques = tf.stop_gradient(self.cove_model(q_emb))  # [bs, q_len, 2, 600]
            with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE):
                cove_context_input = CoveCombineLayer(self.cove_cont, 'input')
                cove_question_input = CoveCombineLayer(self.cove_ques, 'input')
            c_emb = tf.concat([c_emb, cove_context_input], axis=-1)
            q_emb = tf.concat([q_emb, cove_question_input], axis=-1)

        # elmo features
        if self.use_elmo != 0:
            with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE):
                elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input')
                elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input')
                elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output')
                elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output')
            c_emb = tf.concat([c_emb, elmo_context_input], axis=-1)
            q_emb = tf.concat([q_emb, elmo_question_input], axis=-1)

        if self.use_feat:
            c_emb = tf.concat([c_emb, self.cont_feat], axis=-1)
            q_emb = tf.concat([q_emb, self.ques_feat], axis=-1)

        # combine embedding feats
        c_emb = tf.concat([c_emb, ch_emb], axis=-1)
        q_emb = tf.concat([q_emb, qh_emb], axis=-1)

    # BiLSTM Embedding
    with tf.variable_scope("BiLSTM_Embedding_Layer"):
        c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2,
                              dropout=self.dropout_rnn, name='encoder')

    with tf.variable_scope("Iterative_Reattention_Aligner"):
        self.Lambda = tf.get_variable('Lambda', dtype=tf.float32,
                                      initializer=self.init_lambda)
        with tf.variable_scope("Aligning_Block1"):
            R, Z1, E, B = align_block(u=c_emb, v=q_emb,
                                      c_mask=self.c_mask, q_mask=self.q_mask,
                                      Lambda=self.Lambda, filters=self.filters,
                                      dropout=self.dropout_rnn)
            R = tf.nn.dropout(R, 1.0 - self.dropout_att)
        with tf.variable_scope("Aligning_Block2"):
            R, Z2, E, B = align_block(u=R, v=q_emb,
                                      c_mask=self.c_mask, q_mask=self.q_mask,
                                      E_0=E, B_0=B,
                                      Lambda=self.Lambda, filters=self.filters,
                                      dropout=self.dropout_rnn)
            R = tf.nn.dropout(R, 1.0 - self.dropout_att)
        with tf.variable_scope("Aligning_Block3"):
            R, Z3, E, B = align_block(u=R, v=q_emb,
                                      c_mask=self.c_mask, q_mask=self.q_mask,
                                      E_0=E, B_0=B, Z_0=[Z1, Z2],
                                      Lambda=self.Lambda, filters=self.filters,
                                      dropout=self.dropout_rnn)
            R = tf.nn.dropout(R, 1.0 - self.dropout_att)

    with tf.variable_scope("Answer_Pointer"):
        # logits
        if self.use_elmo != 0:
            elmo_output_feats = ElmoAttention(
                [elmo_context_output, elmo_question_output],
                self.c_maxlen, self.q_maxlen, self.q_mask, self.dropout)
            R = tf.concat([R, elmo_output_feats], axis=-1)
        s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask)
        s = tf.nn.dropout(s, 1 - self.dropout)
        logits1 = start_logits(R, s, mask=self.c_mask, filters=self.filters,
                               name='Start_Pointer')  # [bs, c_len]
        logits2 = end_logits(R, logits1, s, mask=self.c_mask, filters=self.filters,
                             name='End_Pointer')  # [bs, c_len]
        # learned no-answer bias, prepended to the span logits
        self.unanswer_bias = tf.get_variable("unanswer_bias", [1],
                                             initializer=tf.zeros_initializer())
        self.unanswer_bias = tf.reshape(tf.tile(self.unanswer_bias, [self.un_size]), [-1, 1])
        logits1 = tf.concat((self.unanswer_bias, logits1), axis=-1)
        logits2 = tf.concat((self.unanswer_bias, logits2), axis=-1)
        # auxiliary "plausible answer" pointers
        logits1p = start_logits(R, s, mask=self.c_mask, filters=self.filters,
                                name='Start_Pointer2')  # [bs, c_len]
        logits2p = end_logits(R, logits1p, s, mask=self.c_mask, filters=self.filters,
                              name='End_Pointer2')  # [bs, c_len]

    with tf.variable_scope("Loss_Layer"):
        # maximum-likelihood (ML) loss for dataset V2.0
        # loss a
        start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                                labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                              labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)
        # loss b
        pstart_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1p,
                                                                 labels=self.yp_start)
        pend_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2p,
                                                               labels=self.yp_end)
        self.loss += self.gamma_b * tf.reduce_mean(pstart_loss + pend_loss)
        # loss c
        answer_exist_label = tf.cast(tf.slice(self.y_start, [0, 0], [-1, 1]), tf.float32)
        self.loss += self.gamma_c * tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.unanswer_bias,
                                                    labels=answer_exist_label))
        # l2 loss
        if self.l2_norm is not None:
            decay_costs = []
            for var in tf.trainable_variables():
                decay_costs.append(tf.nn.l2_loss(var))
            self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs))
        # RL loss
        if self.use_rlloss:
            with tf.variable_scope("Reinforcement_Loss"):
                self.rl_loss_a, _, _ = rl_loss(logits1, logits2, self.y_start,
                                               self.y_end, self.c_maxlen + 1)
                self.rl_loss_b, _, _ = rl_loss(logits1p, logits2p, self.yp_start,
                                               self.yp_end, self.c_maxlen)
                self.loss += (self.rlw * (self.rl_loss_a + self.gamma_b * self.rl_loss_b))

    with tf.variable_scope('Output_Layer'):
        softmax_start_scores = tf.nn.softmax(tf.slice(logits1, [0, 1], [-1, -1]))
        softmax_end_scores = tf.nn.softmax(tf.slice(logits2, [0, 1], [-1, -1]))
        unanswer_mask1 = tf.cast(tf.argmax(tf.nn.softmax(logits1), axis=-1), tf.int64)
        unanswer_mask1 = tf.cast(tf.cast(unanswer_mask1, tf.bool), tf.int64)  # [bs,] has answer=1, no answer=0
        unanswer_move1 = unanswer_mask1 - 1  # [bs,] has answer=0, no answer=-1
        unanswer_mask2 = tf.cast(tf.argmax(tf.nn.softmax(logits2), axis=-1), tf.int64)
        unanswer_mask2 = tf.cast(tf.cast(unanswer_mask2, tf.bool), tf.int64)  # [bs,]
        unanswer_move2 = unanswer_mask2 - 1
        # mix in the plausible-answer distributions
        softmax_start_p = tf.nn.softmax(logits1p)
        softmax_end_p = tf.nn.softmax(logits2p)
        softmax_start_scores = (1 - self.gamma_b) * softmax_start_scores \
                               + self.gamma_b * softmax_start_p
        softmax_end_scores = (1 - self.gamma_b) * softmax_end_scores \
                             + self.gamma_b * softmax_end_p
        outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2),
                          tf.expand_dims(softmax_end_scores, axis=1))
        outer = tf.matrix_band_part(outer, 0, self.ans_limit)

        def position_encoding(x):
            import math
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    if j - i > 5:
                        x[i][j] = float(1.0 / math.log(j - i + 1))
            return x

        mask_mat = tf.ones((self.c_maxlen, self.c_maxlen))
        mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32),
                                  axis=0)
        mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1])
        outer_masked = outer * mask_mat
        self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2),
                                      axis=1) * unanswer_mask1 + unanswer_move1
        self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1),
                                      axis=1) * unanswer_mask2 + unanswer_move2
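The position_encoding py_func shared by both build_model variants leaves spans of up to five tokens untouched and decays longer spans as 1 / log(span + 1). A quick standalone NumPy check (illustration only):

import math
import numpy as np

def position_encoding(x):
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            if j - i > 5:
                x[i][j] = 1.0 / math.log(j - i + 1)
    return x

m = position_encoding(np.ones((10, 10), dtype=np.float32))
print(m[0, 5])   # 1.0     -> spans up to 5 tokens keep their score
print(m[0, 9])   # ~0.434  -> 1 / log(10): longer spans are penalized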