def __init__(self, input, embeddings, features, mini_batch_size=32, nhu=300, width=5, activation=hardtanh, seed=1234, n_out=9, name='SennaNER', params=None):
    """Build a window-based NER network.

    Word-embedding and (static) feature-embedding lookups feed a
    two-input hidden layer followed by a logistic-regression output.

    :param input: int index tensor; input[:, :, 0] are word ids and
        input[:, :, 1] are feature ids (assumed (batch, width, 2) --
        TODO confirm against caller)
    :param embeddings: pre-trained word embedding matrix (vocab x dim)
    :param features: feature embedding matrix (n_features x feature_dim)
    :param mini_batch_size: samples per mini-batch
    :param nhu: number of hidden units
    :param width: context-window width in tokens
    :param activation: hidden-layer activation function
    :param seed: RNG seed for weight initialisation
    :param n_out: number of output tag classes
    :param name: model name
    :param params: optional pre-trained parameters to restore
    """
    self.name = name
    self.layers = []
    self.input = input
    self.output = None

    embedding_dim = embeddings.shape[1]
    features_dim = features.shape[1]
    rng = np.random.RandomState(seed)

    # Trainable word-embedding lookup over the word-id channel.
    self.EmbeddingLayer = EmbeddingLayer(input=input[:, :, 0], w_values=embeddings, embedding_dim=embedding_dim, mini_batch_size=mini_batch_size, width=width, params=params)
    # BUG FIX: this second lookup was previously assigned to
    # self.EmbeddingLayer (clobbering the word-embedding layer) while the
    # code below reads self.StaticEmbeddingLayer, which was never set and
    # raised AttributeError.  Build the static (non-trainable) feature
    # lookup under its intended name, as in the reference implementation.
    self.StaticEmbeddingLayer = StaticEmbeddingLayer(input=input[:, :, 1], w_values=features, embedding_dim=features_dim, mini_batch_size=mini_batch_size, width=width)
    self.HiddenLayer = DoubleInputHiddenLayer(input1=self.EmbeddingLayer.output, input2=self.StaticEmbeddingLayer.output, n_in1=embedding_dim * width, n_in2=features_dim * width, n_out=nhu, rng=rng, activation=activation, params=params)
    self.LogisticRegressionLayer = LogisticRegressionLayer(input=self.HiddenLayer.output, n_in=nhu, n_out=n_out, rng=rng, params=params)
    self.layers = [self.EmbeddingLayer, self.StaticEmbeddingLayer, self.HiddenLayer, self.LogisticRegressionLayer]

    # Regularisation terms: only layers that actually define L1/L2 contribute.
    self.L1 = T.sum([layer.L1 for layer in self.layers if "L1" in layer.__dict__])
    self.L2 = T.sum([layer.L2 for layer in self.layers if "L2" in layer.__dict__])

    self.params = list(itertools.chain(*[layer.params for layer in self.layers]))

    self.negative_log_likelihood = self.LogisticRegressionLayer.negative_log_likelihood
    self.errors = self.LogisticRegressionLayer.errors
    self.predictions = self.LogisticRegressionLayer.y_pred
    # One n_in entry per parameter (e.g. for per-layer learning-rate scaling).
    self.n_ins = list(itertools.chain(*[[layer.n_in] * len(layer.params) for layer in self.layers]))
    print(self.n_ins)
    print(self.params)
def __init__(self):
    """Build the rationale model: an embedding lookup feeding a Generator
    (selects rationale tokens) and an Encoder (predicts from the selection),
    wired together in a single TF1 interactive session.
    """
    self.session = tf.InteractiveSession()
    # Seed both TF and NumPy so runs are reproducible.
    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    print('Reading embeddings...', end='', flush=True)
    self.embedding = EmbeddingLayer(FLAGS.embedding)
    print('done')

    # Define network parameters
    nkwargs = {
        'in_size':    self.embedding.shape[1],
        'out_size':   FLAGS.hidden,
        'depth':      FLAGS.depth,
        'batch_size': FLAGS.batch,
    }

    # Define the inputs, and their respective type & size
    inputs = {
        'x':         [tf.int32, [None, FLAGS.batch]],
        'y':         [tf.float32, [FLAGS.batch, 5]],
        'kp':        [tf.float32, []],          # dropout keep-probability
        'lambda':    [tf.float32, []],          # L2 regularisation weight
        'sparsity':  [tf.float32, []],
        'coherency': [tf.float32, []],
    }

    # Create placeholders
    with tf.name_scope('Placeholders'):
        p = {name: tf.placeholder(*args, name=name) for name, args in inputs.items()}

    # Feed-dict factory; kp defaults to the training keep-prob and can be
    # overridden (e.g. 1.0 at test time).
    self.train_fd = lambda x, y, kp=FLAGS.keep_prob: {
        p['x']:         x,
        p['y']:         y,
        p['kp']:        kp,
        p['lambda']:    FLAGS.l2_reg,
        p['sparsity']:  FLAGS.sparsity,
        p['coherency']: FLAGS.coherency,
    }

    dropout = lambda x: tf.nn.dropout(x, p['kp'])
    # Binary cross-entropy with a 1e-8 epsilon to avoid log(0).
    bxentropy = lambda x, y: -(y * tf.log(x + 1e-8) + (1. - y) * tf.log(1. - x + 1e-8))
    sq_err = lambda x, y: (x - y) ** 2

    # Mask of non-padding positions in the input ids.
    pad_mask = tf.to_float(tf.not_equal(p['x'], self.embedding.pad_id))
    embedding = dropout(self.embedding.forward(p['x']))

    print('Creating model...', end='', flush=True)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.generator = Generator(embedding, pad_mask, p, nkwargs, dropout, bxentropy)
    self.encoder = Encoder(embedding, pad_mask, p, nkwargs, dropout, sq_err)
    # Encoder consumes the generator's selection z; generator is trained
    # against the encoder's per-sample loss.
    self.encoder.create_minimization(self.generator.z)
    self.generator.create_minimization(self.encoder.loss_vec, self.global_step)
    print('done')
def _build_graph(self):
    """ Defines the model graph. """
    with tf.variable_scope('{:s}_model'.format(self.name)):
        # Resolve the vocabulary sizes used by the embedding tables.
        if self.config.untie_enc_dec_embeddings:
            enc_vocab_size = self.source_vocab_size
            dec_vocab_size = self.target_vocab_size
        else:
            assert self.source_vocab_size == self.target_vocab_size, \
                'Input and output vocabularies should be identical when tying embedding tables.'
            enc_vocab_size = dec_vocab_size = self.source_vocab_size

        def _make_embedding(vocab_size, layer_name):
            # Build one embedding table with the shared dimensions/dtype.
            return EmbeddingLayer(vocab_size,
                                  self.config.embedding_size,
                                  self.config.hidden_size,
                                  self.float_dtype,
                                  name=layer_name)

        # Encoder table is always created; the decoder table and the softmax
        # projection are shared with it unless the config unties them.
        encoder_embedding_layer = _make_embedding(enc_vocab_size, 'encoder_embedding_layer')
        decoder_embedding_layer = (
            _make_embedding(dec_vocab_size, 'decoder_embedding_layer')
            if self.config.untie_enc_dec_embeddings
            else encoder_embedding_layer)
        softmax_projection_layer = (
            _make_embedding(dec_vocab_size, 'softmax_projection_layer')
            if self.config.untie_decoder_embeddings
            else decoder_embedding_layer)

        # Instantiate the component networks
        self.enc = TransformerEncoder(self.config,
                                      encoder_embedding_layer,
                                      self.training,
                                      self.float_dtype,
                                      self.gate_tracker,
                                      'encoder')
        self.dec = TransformerDecoder(self.config,
                                      decoder_embedding_layer,
                                      softmax_projection_layer,
                                      self.training,
                                      self.int_dtype,
                                      self.float_dtype,
                                      self.gate_tracker,
                                      'decoder')
    return dec_vocab_size
def __init__(self, hidden_size, batch_size, K, W_init, config, max_sen_len):
    """HAQA reader: embedding + K hops of gated/hop attention over BiGRUs.

    :param hidden_size: GRU hidden size (outputs are 2*hidden_size, bidirectional)
    :param batch_size: mini-batch size passed to the BiGRU layers
    :param K: number of attention hops
    :param W_init: pre-trained word embedding matrix; W_init.shape[1] is the
        word-vector dimension
    :param config: dict with at least 'char_filter_size'
    :param max_sen_len: maximum sentence length for sentence-level max attention
    """
    super(HAQA, self).__init__()
    self.embedding = EmbeddingLayer(W_init, config)
    # Word vectors are concatenated with character-CNN features.
    embedding_size = W_init.shape[1] + config['char_filter_size']

    self.ga = GatedAttentionLayer()           # non-parametrized
    self.gaao = GatedAttentionAttOnly()       # non-parametrized
    self.ha = HopAttentionLayer()             # parametrized

    # BUG FIX: this was `Variable(torch.Tensor([0.5]), requires_grad=True).to(device)`.
    # `.to(device)` returns a NON-leaf tensor, and a plain Variable is not
    # registered by nn.Module, so the gating weight never appeared in
    # model.parameters() and was never optimized.  nn.Parameter registers it
    # and it moves with the module on model.to(device).
    self.gating_w = torch.nn.Parameter(torch.Tensor([0.5]))

    self.pred = AnswerPredictionLayer()       # non-parametrized
    self.K = K
    self.hidden_size = hidden_size

    # Hop 0 reads raw embeddings for both context and query; later context
    # hops read the previous hop's 2*hidden_size output, while each query
    # hop re-encodes the query embeddings.
    self.context_gru_0 = BiGRU(embedding_size, hidden_size, batch_size)
    self.query_gru_0 = BiGRU(embedding_size, hidden_size, batch_size)
    self.context_gru_1 = BiGRU(2 * hidden_size, hidden_size, batch_size)
    self.query_gru_1 = BiGRU(embedding_size, hidden_size, batch_size)
    self.context_gru_2 = BiGRU(2 * hidden_size, hidden_size, batch_size)
    self.query_gru_2 = BiGRU(embedding_size, hidden_size, batch_size)
    self.context_gru_3 = BiGRU(2 * hidden_size, hidden_size, batch_size)
    self.query_gru_3 = BiGRU(embedding_size, hidden_size, batch_size)

    self.max_sentence = MaxAttSentence(max_sen_len, 2 * hidden_size)
def build_Emb_layer(self):
    """Construct and return this model's embedding layer from its
    configured dimensions and options."""
    layer_options = {
        'activation': self.activation,
        'side_information': self.side_information,
        'bias': self.bias,
        'dropout': self.dropout,
        'use_cuda': self.use_cuda,
    }
    return EmbeddingLayer(self.in_dim,
                          self.ins_dim,
                          self.hid_dim,
                          self.out_dim,
                          **layer_options)
def main_graph(self, trained_model, scope, emb_dim, cell, rnn_dim, rnn_num, drop_out=0.5, emb=None):
    """Build the multi-GPU bucketed BiRNN tagging graph (TF1, Python 2).

    :param trained_model: path to pickle the hyper-parameter dict to, or None
    :param scope: variable scope whose variables are reused from bucket 1 on
    :param emb_dim: character embedding dimension
    :param cell: 'gru' for GRU cells, anything else for LSTM
    :param rnn_dim: RNN hidden size
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout rate
    :param emb: optional pre-trained embedding weights
    """
    if trained_model is not None:
        # Persist hyper-parameters so the model can be reloaded later.
        param_dic = {
            'nums_chars': self.nums_chars,
            'nums_tags': self.nums_tags,
            'crf': self.crf,
            'emb_dim': emb_dim,
            'cell': cell,
            'rnn_dim': rnn_dim,
            'rnn_num': rnn_num,
            'drop_out': drop_out,
            'buckets_char': self.buckets_char,
            'ngram': self.ngram,
            'is_space': self.is_space,
            'sent_seg': self.sent_seg,
            'emb_path': self.emb_path,
            'tag_scheme': self.tag_scheme
        }
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    # Per-GPU batch size is fed at run time through this placeholder.
    batch_size_h = tf.placeholder(tf.int32, [], name='batch_size_holder')
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.batch_size_h = batch_size_h
    self.drop_out = dr
    self.drop_out_v = drop_out
    # pdb.set_trace()
    # +20 extra rows reserved for unknown/special symbols -- TODO confirm.
    self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb, name='emb_layer')

    if self.ngram is not None:
        # One embedding table per n-gram order; vocab padded by 5000*(n).
        ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim, weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))

    with tf.variable_scope('BiRNN'):
        if cell == 'gru':
            fw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  #forward
            bw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  #backward
        else:
            fw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.contrib.rnn.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.contrib.rnn.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

    # Linear projection from BiRNN output (2*rnn_dim) to tag scores.
    output_wrapper = HiddenLayer(rnn_dim * 2, self.nums_tags, activation='linear', name='hidden')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # All buckets after the first share the same weights.
            scope.reuse_variables()
        t1 = time()
        batch_size = self.real_batches[idx]  # NOTE(review): assigned but unused below
        input_v1 = tf.placeholder(tf.int32, [None, bucket], name='input_1' + str(bucket))
        input_v2 = tf.placeholder(tf.int32, [None, bucket], name='input_2' + str(bucket))
        self.input_v1.append([input_v1])
        self.input_v2.append([input_v2])
        #output = None
        output = []
        # Split the batch across GPUs: each device gets a batch_size_h slice.
        for i in range(self.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                input_1 = input_v1[i * batch_size_h:(i + 1) * batch_size_h]
                input_2 = input_v2[i * batch_size_h:(i + 1) * batch_size_h]
                emb_set1 = []
                emb_set2 = []
                word_out1 = self.emb_layer(input_1)
                word_out2 = self.emb_layer(input_2)
                emb_set1.append(word_out1)
                emb_set2.append(word_out2)

                # if self.ngram is not None:
                #     for i in range(len(self.ngram)):
                #         input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                #         self.input_v[-1].append(input_g)
                #         gram_out = self.gram_layers[i](input_g)
                #         emb_set.append(gram_out)

                if len(emb_set1) > 1:
                    emb_out1 = tf.concat(axis=2, values=emb_set1)
                    emb_out2 = tf.concat(axis=2, values=emb_set2)
                else:
                    emb_out1 = emb_set1[0]
                    emb_out2 = emb_set2[0]

                emb_out1 = DropoutLayer(dr)(emb_out1)
                emb_out2 = DropoutLayer(dr)(emb_out2)

                rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                                 name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out1, emb_out2, input_v1)
                output_g = output_wrapper(rnn_out)
                # if output == None:
                #     output = output_g
                # else:
                #     output = tf.concat([output,output_g],axis = 0)
                #pdb.set_trace()
                output.append(output_g)
        self.output.append([output])
        # Gold tags: bucket - 1 because tags are per transition -- TODO confirm.
        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket - 1], name='tags' + str(bucket))
        ])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)
    assert len(self.input_v1) == len(self.output)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30,
               emb=None, ng_embs=None, pixels=None, con_width=None, filters=None, pooling_size=None):
    """Build the bucketed BiRNN tagging graph with optional radical, n-gram
    and graphic (glyph-image CNN) features (TF1, Python 2).

    :param trained_model: path to pickle the hyper-parameter dict to, or None
    :param scope: variable scope reused from the second bucket onwards
    :param emb_dim: character/n-gram embedding dimension
    :param gru: True for GRU cells, False for LSTM
    :param rnn_dim: RNN hidden size
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout rate
    :param rad_dim: radical embedding dimension
    :param emb: optional pre-trained character embeddings
    :param ng_embs: optional pre-trained n-gram embeddings (one per order)
    :param pixels: flattened square glyph images (only when self.graphic)
    :param con_width: CNN filter width for glyph images
    :param filters: number of CNN filters
    :param pooling_size: max-pooling window size
    """
    if trained_model is not None:
        # Persist hyper-parameters so the model can be reloaded later.
        param_dic = {}
        param_dic['nums_chars'] = self.nums_chars
        param_dic['nums_tags'] = self.nums_tags
        param_dic['tag_scheme'] = self.tag_scheme
        param_dic['graphic'] = self.graphic
        param_dic['pic_size'] = self.pic_size
        param_dic['word_vec'] = self.word_vec
        param_dic['radical'] = self.radical
        param_dic['crf'] = self.crf
        param_dic['emb_dim'] = emb_dim
        param_dic['gru'] = gru
        param_dic['rnn_dim'] = rnn_dim
        param_dic['rnn_num'] = rnn_num
        param_dic['drop_out'] = drop_out
        param_dic['filter_size'] = con_width
        param_dic['filters'] = filters
        param_dic['pooling_size'] = pooling_size
        param_dic['font'] = self.font
        param_dic['buckets_char'] = self.buckets_char
        param_dic['ngram'] = self.ngram
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    if self.word_vec:
        # +500 extra rows reserved for unseen symbols -- TODO confirm.
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

    if self.radical:
        # 216 = 214 Kangxi radicals + 2 special symbols.
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

    if self.ngram is not None:
        if ng_embs is not None:
            assert len(ng_embs) == len(self.ngram)
        else:
            ng_embs = [None for _ in range(len(self.ngram))]
        # One embedding table per n-gram order; vocab padded by 1000*(n).
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ng_embs[i],
                                                   name=str(i + 2) + 'gram_layer'))

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
        None, None, None, None, None, None

    if self.graphic:
        # Two conv+maxpool stages over each glyph image, then a dense layer,
        # all applied per time step via TimeDistributed.
        self.input_p = []
        assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))

        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'), name='wrapper_p1')

        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'), name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'), name='wrapper_p2')

        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

        wrapper_dense = TimeDistributed(
            HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'), name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)

    # Linear projection from BiRNN output (2*rnn_dim) to tag scores.
    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'), name='wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # All buckets after the first share the same weights.
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])
        emb_set = []

        if self.word_vec:
            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)

        if self.radical:
            input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_r)
            radical_out = self.radical_layer(input_r)
            emb_set.append(radical_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim*pixel_dim])
            self.input_p.append(input_p)

            # Reshape flat pixels to (batch, time, H, W, 1) and unpack per step.
            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
            pix_out = tf.unpack(pix_out, axis=1)

            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)

            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)

            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unpack(pooling_out, axis=1)

            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)

            emb_set.append(graphic_out)

        # Concatenate all feature embeddings along the feature axis, then
        # unpack to a per-time-step list for the (old-API) BiLSTM wrapper.
        if len(emb_set) > 1:
            emb_out = tf.concat(2, emb_set)
            emb_out = tf.unpack(emb_out)
        else:
            emb_out = emb_set[0]

        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)

        output = output_wrapper(rnn_out)

        output_c = tf.pack(output, axis=1)

        self.output.append([output_c])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) and len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, emb=None):
    """Build the bucketed BiRNN tagging graph with character and optional
    n-gram embeddings (TF1, Python 2).

    :param trained_model: path to pickle the hyper-parameter dict to, or None
    :param scope: variable scope reused from the second bucket onwards
    :param emb_dim: character/n-gram embedding dimension
    :param gru: True for GRU cells, False for LSTM
    :param rnn_dim: RNN hidden size
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout rate
    :param emb: optional pre-trained character embeddings
    """
    if trained_model is not None:
        # Persist hyper-parameters so the model can be reloaded later.
        param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'crf': self.crf,
                     'emb_dim': emb_dim, 'gru': gru, 'rnn_dim': rnn_dim, 'rnn_num': rnn_num,
                     'drop_out': drop_out, 'buckets_char': self.buckets_char, 'ngram': self.ngram,
                     'is_space': self.is_space, 'sent_seg': self.sent_seg, 'emb_path': self.emb_path,
                     'tag_scheme': self.tag_scheme}
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # +20 extra rows reserved for unknown/special symbols -- TODO confirm.
    self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb, name='emb_layer')

    if self.ngram is not None:
        # One embedding table per n-gram order; vocab padded by 5000*(n).
        ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim, weights=ng_embs[i],
                                                   name=str(i + 2) + 'gram_layer'))

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)

    # Linear projection from BiRNN output (2*rnn_dim) to tag scores.
    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags, activation='linear', name='hidden'), name='wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # All buckets after the first share the same weights.
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])
        emb_set = []

        word_out = self.emb_layer(input_v)
        emb_set.append(word_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        # Concatenate feature embeddings, apply dropout, and unpack to a
        # per-time-step list for the (old-API) BiLSTM wrapper.
        if len(emb_set) > 1:
            emb_out = tf.concat(2, emb_set)
        else:
            emb_out = emb_set[0]

        emb_out = DropoutLayer(dr)(emb_out)
        emb_out = tf.unpack(emb_out)

        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)

        output = output_wrapper(rnn_out)

        output_c = tf.pack(output, axis=1)

        self.output.append([output_c])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) and len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, fnn_dim, window_size,
               drop_out=0.5, rad_dim=30, emb=None, ng_embs=None, pixels=None, con_width=None,
               filters=None, pooling_size=None):
    """Build the bucketed tagging graph in either BiRNN ('RNN' mode) or
    windowed convolutional feed-forward (FNN) form, with optional radical,
    n-gram and graphic (glyph-image CNN) features (TF1, Python 2).

    :param trained_model: path to pickle the hyper-parameter dict to, or None
    :param scope: variable scope reused from the second bucket onwards
    :param emb_dim: character/n-gram embedding dimension
    :param gru: True for GRU cells, False for LSTM (RNN mode only)
    :param rnn_dim: RNN hidden size (RNN mode only)
    :param rnn_num: number of stacked RNN layers (RNN mode only)
    :param fnn_dim: hidden size of the FNN convolution (FNN mode only)
    :param window_size: half-width of the FNN context window
    :param drop_out: dropout rate
    :param rad_dim: radical embedding dimension
    :param emb: optional pre-trained character embeddings
    :param ng_embs: optional pre-trained n-gram embeddings (one per order)
    :param pixels: flattened square glyph images (only when self.graphic)
    :param con_width: CNN filter width for glyph images
    :param filters: number of CNN filters
    :param pooling_size: max-pooling window size
    """
    if trained_model is not None:
        # Persist hyper-parameters so the model can be reloaded later.
        param_dic = {}
        param_dic['nums_chars'] = self.nums_chars
        param_dic['nums_tags'] = self.nums_tags
        param_dic['tag_scheme'] = self.tag_scheme
        param_dic['graphic'] = self.graphic
        param_dic['pic_size'] = self.pic_size
        param_dic['word_vec'] = self.word_vec
        param_dic['radical'] = self.radical
        param_dic['crf'] = self.crf
        param_dic['emb_dim'] = emb_dim
        param_dic['gru'] = gru
        param_dic['rnn_dim'] = rnn_dim
        param_dic['rnn_num'] = rnn_num
        param_dic['fnn_dim'] = fnn_dim
        param_dic['window_size'] = window_size
        param_dic['drop_out'] = drop_out
        param_dic['filter_size'] = con_width
        param_dic['filters'] = filters
        param_dic['pooling_size'] = pooling_size
        param_dic['font'] = self.font
        param_dic['buckets_char'] = self.buckets_char
        param_dic['ngram'] = self.ngram
        param_dic['mode'] = self.mode
        #print param_dic
        if self.metric == 'All':
            # One copy per metric, each with the metric name prefixed onto
            # the model file name.
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out
    #concat_emb_dim = emb_dim * 2
    # Running total of the concatenated per-character feature dimension.
    concat_emb_dim = 0

    if self.word_vec:
        # +500 extra rows reserved for unseen symbols -- TODO confirm.
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')
        concat_emb_dim += emb_dim

    if self.radical:
        # 216 = 214 Kangxi radicals + 2 special symbols.
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')
        concat_emb_dim += rad_dim

    if self.ngram is not None:
        if ng_embs is not None:
            assert len(ng_embs) == len(self.ngram)
        else:
            ng_embs = [None for _ in range(len(self.ngram))]
        # One embedding table per n-gram order; vocab padded by 1000*(n).
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))
            concat_emb_dim += emb_dim

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2 = None, None, None
    wrapper_mp_2, wrapper_dense, wrapper_dr = None, None, None

    if self.graphic:
        # Two conv+maxpool stages over each glyph image, then a dense layer.
        self.input_p = []
        assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))

        wrapper_conv_1 = Convolution(con_width, 1, filters, name='conv_1')
        wrapper_mp_1 = Maxpooling(pooling_size, pooling_size, name='pooling_1')

        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

        wrapper_conv_2 = Convolution(con_width, filters, filters, name='conv_2')
        wrapper_mp_2 = Maxpooling(pooling_size, pooling_size, name='pooling_2')

        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

        wrapper_dense = HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense')
        wrapper_dr = DropoutLayer(self.drop_out)

        concat_emb_dim += 100

    fw_rnn_cell, bw_rnn_cell = None, None

    if self.mode == 'RNN':
        with tf.variable_scope('BiRNN'):
            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            if rnn_num > 1:
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)
        output_wrapper = HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='out_wrapper')
        fnn_weights, fnn_bias = None, None
    else:
        # FNN mode: a single conv over a (2*window_size+1)-character window.
        with tf.variable_scope('FNN'):
            fnn_weights = tf.get_variable('conv_w', [2 * window_size + 1, concat_emb_dim, 1, fnn_dim])
            fnn_bias = tf.get_variable('conv_b', [fnn_dim], initializer=tf.constant_initializer(0.1))
        output_wrapper = HiddenLayer(fnn_dim, self.nums_tags[0], activation='linear', name='out_wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # All buckets after the first share the same weights.
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])
        emb_set = []

        if self.word_vec:
            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)

        if self.radical:
            input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_r)
            radical_out = self.radical_layer(input_r)
            emb_set.append(radical_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)

            pix_out = tf.reshape(input_p, [-1, pixel_dim, pixel_dim, 1])

            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)

            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)

            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])

            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)

            emb_set.append(graphic_out)

        if len(emb_set) > 1:
            emb_out = tf.concat(axis=2, values=emb_set)
        else:
            emb_out = emb_set[0]

        if self.mode == 'RNN':
            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                             name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)
            output = output_wrapper(rnn_out)
        else:
            # Pad so every position has a full window, then run the conv as
            # a VALID conv2d over (time, feature) and squash with tanh.
            emb_out = tf.pad(emb_out, [[0, 0], [window_size, window_size], [0, 0]])
            emb_out = tf.reshape(emb_out, [-1, bucket + 2 * window_size, concat_emb_dim, 1])
            conv_out = tf.nn.conv2d(emb_out, fnn_weights, [1, 1, 1, 1], padding='VALID') + fnn_bias
            fnn_out = tf.nn.tanh(conv_out)
            fnn_out = tf.reshape(fnn_out, [-1, bucket, fnn_dim])
            output = output_wrapper(fnn_out)

        self.output.append([output])
        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))
        ])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) \
        and len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
class Model(object):
    """Generator/Encoder rationale model (TF1): the Generator selects a
    subset of input tokens (the rationale) and the Encoder predicts the
    targets from that selection; both are trained jointly."""

    def __init__(self):
        """Build the graph: embeddings, placeholders, Generator and Encoder."""
        self.session = tf.InteractiveSession()
        # Seed both TF and NumPy so runs are reproducible.
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(FLAGS.seed)

        print('Reading embeddings...', end='', flush=True)
        self.embedding = EmbeddingLayer(FLAGS.embedding)
        print('done')

        # Define network parameters
        nkwargs = {
            'in_size':    self.embedding.shape[1],
            'out_size':   FLAGS.hidden,
            'depth':      FLAGS.depth,
            'batch_size': FLAGS.batch,
        }

        # Define the inputs, and their respective type & size
        inputs = {
            'x':         [tf.int32, [None, FLAGS.batch]],
            'y':         [tf.float32, [FLAGS.batch, 5]],
            'kp':        [tf.float32, []],          # dropout keep-probability
            'lambda':    [tf.float32, []],          # L2 regularisation weight
            'sparsity':  [tf.float32, []],
            'coherency': [tf.float32, []],
        }

        # Create placeholders
        with tf.name_scope('Placeholders'):
            p = {name: tf.placeholder(*args, name=name) for name, args in inputs.items()}

        # Feed-dict factory; kp defaults to the training keep-prob and can be
        # overridden (e.g. 1.0 at test time).
        self.train_fd = lambda x, y, kp=FLAGS.keep_prob: {
            p['x']:         x,
            p['y']:         y,
            p['kp']:        kp,
            p['lambda']:    FLAGS.l2_reg,
            p['sparsity']:  FLAGS.sparsity,
            p['coherency']: FLAGS.coherency,
        }

        dropout = lambda x: tf.nn.dropout(x, p['kp'])
        # Binary cross-entropy with a 1e-8 epsilon to avoid log(0).
        bxentropy = lambda x, y: -(y * tf.log(x + 1e-8) + (1. - y) * tf.log(1. - x + 1e-8))
        sq_err = lambda x, y: (x - y) ** 2

        # Mask of non-padding positions in the input ids.
        pad_mask = tf.to_float(tf.not_equal(p['x'], self.embedding.pad_id))
        embedding = dropout(self.embedding.forward(p['x']))

        print('Creating model...', end='', flush=True)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.generator = Generator(embedding, pad_mask, p, nkwargs, dropout, bxentropy)
        self.encoder = Encoder(embedding, pad_mask, p, nkwargs, dropout, sq_err)
        # Encoder consumes the generator's selection z; generator is trained
        # against the encoder's per-sample loss.
        self.encoder.create_minimization(self.generator.z)
        self.generator.create_minimization(self.encoder.loss_vec, self.global_step)
        print('done')

    def train(self):
        """Run the training loop: restore any checkpoint, then per epoch
        train on batches, evaluate on the test split, dump rationales to
        FLAGS.output and save a checkpoint."""
        print('Initializing variables...', end='', flush=True)
        logdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'log')
        writer = tf.summary.FileWriter(logdir, self.session.graph)
        saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())

        # Resume from the latest checkpoint if one exists.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            print('restoring previous checkpoint...', end='', flush=True)
            name = os.path.basename(checkpoint.model_checkpoint_path)
            saver.restore(self.session, os.path.join(FLAGS.checkpoint, name))
        merger = tf.summary.merge_all()
        print('done')

        print('Fetching data...', end='', flush=True)
        x, y = read_data(FLAGS.training)
        train = ([self.embedding.words_to_ids(s) for s in x], y)
        x, y = read_data(FLAGS.testing)
        test = ([self.embedding.words_to_ids(s) for s in x], y)
        print('done')

        for epoch in range(FLAGS.epochs):
            start_time = time.time()
            train_x, train_y = preprocess(train, FLAGS.batch, self.embedding.pad_id, FLAGS.maxlen)
            scost = ocost = tcost = p_one = 0
            for bx, by in zip(train_x, train_y):
                # One optimisation step for both generator and encoder.
                result = self.session.run(
                    [merger, self.generator.train_g, self.encoder.train_e,
                     self.generator.reg, self.generator.obj, self.encoder.loss,
                     self.generator.z, self.global_step],
                    feed_dict=self.train_fd(bx, by))
                writer.add_summary(result[0], result[7])
                scost += result[3]
                ocost += result[4]
                tcost += result[5]
                # Fraction of tokens the generator selected in this batch.
                p_one += np.sum(result[6]) / FLAGS.batch / len(bx[0])
            print('Regularization: ', scost / float(len(train_x)))
            print('Objective: ', ocost / float(len(train_x)))
            print('Prediction loss: ', tcost / float(len(train_x)))
            print('Generator Selection %: ', p_one / float(len(train_x)))

            # Evaluate every epoch (condition kept as written).
            if not epoch % 1:
                results = []
                ocost = tcost = 0
                test_x, test_y = preprocess(test, FLAGS.batch, self.embedding.pad_id, FLAGS.maxlen)
                for bx, by in zip(test_x, test_y):
                    # kp=1.0 disables dropout at test time.
                    preds, bz, gobj, eloss = self.session.run(
                        [self.encoder.preds, self.generator.z,
                         self.generator.obj, self.encoder.loss],
                        feed_dict=self.train_fd(bx, by, 1.))
                    ocost += gobj
                    tcost += eloss
                    # Render each rationale: keep selected words, blank the rest.
                    for p, x, y, z in zip(preds, bx.T, by, bz.T):
                        w = self.embedding.ids_to_words(x)
                        w = [u.replace('<pad>', '_') for u in w]
                        r = [u if v == 1 else '_' for u, v in zip(w, z)]
                        results.append((p, r, w, y))
                print('Test Objective: ', ocost / float(len(test_x)))
                print('Test Prediction loss: ', tcost / float(len(test_x)))

                with open(FLAGS.output, 'w+') as f:
                    for p, r, w, y in results:
                        f.write(json.dumps({
                            'rationale': ' '.join(r),
                            'original': ' '.join(w),
                            'y': str(list(y)),
                            'p': str(list(p)),
                        }) + '\n')
                saver.save(self.session, os.path.join(FLAGS.checkpoint, 'GEN.model'),
                           global_step=self.global_step)
            print('Finished epoch %s in %.2f seconds\n' % (epoch, time.time() - start_time))
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30, emb=None,
               ngram_embedding=None, pixels=None, con_width=None, filters=None, pooling_size=None):
    """Build the bucketed tagging graph (with optional radical / n-gram / glyph features).

    :param trained_model: path where the model hyper-parameters are stored
    :param scope: outer tf.variable_scope used for variable reuse across buckets
    :param emb_dim: per-character embedding dimension
    :param gru: use GRU cells if true, otherwise LSTM cells
    :param rnn_dim: RNN hidden dimension
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout rate
    :param rad_dim: radical embedding dimension
    :param emb: pretrained character embeddings
    :param ngram_embedding: pretrained n-gram embedding files
    :param pixels: flattened glyph pixel data (square images)
    :param con_width: convolution filter width
    :param filters: number of convolution filters
    :param pooling_size: max-pooling window size
    :return: None (populates self.input_v, self.output, self.output_, self.params, self.saver, ...)
    """
    # trained_model: model storage path
    if trained_model is not None:
        param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'tag_scheme': self.tag_scheme,
                     'graphic': self.graphic, 'pic_size': self.pic_size, 'word_vec': self.word_vec,
                     'radical': self.radical, 'crf': self.crf, 'emb_dim': emb_dim, 'gru': gru,
                     'rnn_dim': rnn_dim, 'rnn_num': rnn_num, 'drop_out': drop_out, 'filter_size': con_width,
                     'filters': filters, 'pooling_size': pooling_size, 'font': self.font,
                     'buckets_char': self.buckets_char, 'ngram': self.ngram}
        print "RNN dimension is %d" % rnn_dim
        print "RNN number is %d" % rnn_num
        print "Character embedding size is %d" % emb_dim
        print "Ngram embedding dimension is %d" % emb_dim
        # persist the model hyper-parameters
        if self.metric == 'All':
            # rindex() returns the last occurrence of the substring,
            # used here to split off the model file name
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # character embedding layer
    # why add 500 to the character count? (vocabulary headroom — TODO confirm)
    # emb_dim is the per-character feature dimension, settable from the command line
    # weights are the pretrained character embeddings, settable from the command line
    if self.word_vec:
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

    # radical embeddings
    # the Kangxi dictionary lists 214 radicals; only radicals of common
    # characters are used, with two extra special symbols for uncommon and
    # non-Chinese characters, so there are 216 radicals in total
    if self.radical:
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

    if self.ngram is not None:
        if ngram_embedding is not None:
            assert len(ngram_embedding) == len(self.ngram)
        else:
            ngram_embedding = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ngram_embedding[i],
                                                   name=str(i + 2) + 'gram_layer'))

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
        None, None, None, None, None, None

    if self.graphic:
        # glyph (image) features require a small CNN
        self.input_p = []
        assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

        self.pixels = pixels
        # glyph images are square: side length from the flattened pixel count
        pixel_dim = int(math.sqrt(len(pixels[0])))

        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'), name='wrapper_p1')

        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'), name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'), name='wrapper_p2')

        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

        wrapper_dense = TimeDistributed(
            HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'),
            name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

    # hidden layer: input is the forward RNN output concatenated with the
    # backward RNN output, hence rnn_dim * 2; output size is the tag count
    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'),
        name='wrapper')

    # define model for each bucket
    # sentences in each bucket share one length, so each bucket gets its own model
    # bucket: sentence length of the bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # scope is tf.variable_scope("tagger", reuse=None, initializer=initializer);
            # reuse only needs to be enabled once — every later bucket reuses too
            scope.reuse_variables()
        t1 = time()

        # input sentences as integer id vectors
        # shape = (batch_size, sentence length)
        input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_sentences])

        emb_set = []

        if self.word_vec:
            # look up the character embedding for each id
            # word_out: shape=(batch_size, sentence length, embedding dim)
            word_out = self.emb_layer(input_sentences)
            emb_set.append(word_out)

        if self.radical:
            # radical ids, shape = (batch_size, sentence length)
            input_radicals = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_radicals)
            radical_out = self.radical_layer(input_radicals)
            emb_set.append(radical_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if self.graphic:
            # flattened glyph pixels per character
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)

            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])

            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)

            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)

            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unstack(pooling_out, axis=1)

            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)

            emb_set.append(graphic_out)

        if self.window_size > 1:
            # multi-width character convolutions over a padded sentence,
            # followed by a top-k selection and a highway combination
            padding_size = int(np.floor(self.window_size / 2))
            word_padded = tf.pad(word_out, [[0, 0], [padding_size, padding_size], [0, 0]], 'CONSTANT')

            Ws = []
            for q in range(1, self.window_size + 1):
                Ws.append(tf.get_variable("W_%d" % q, shape=[q * emb_dim, self.filters_number]))
            b = tf.get_variable("b", shape=[self.filters_number])

            z = [None for _ in range(0, bucket)]

            for q in range(1, self.window_size + 1):
                for i in range(padding_size, bucket + padding_size):
                    # window of width q centred (asymmetrically for even q) on i
                    low = i - int(np.floor((q - 1) / 2))
                    high = i + int(np.ceil((q + 1) / 2))
                    x = word_padded[:, low, :]
                    for j in range(low + 1, high):
                        x = tf.concat(values=[x, word_padded[:, j, :]], axis=1)
                    z_iq = tf.tanh(tf.nn.xw_plus_b(x, Ws[q - 1], b))
                    if z[i - padding_size] is None:
                        z[i - padding_size] = z_iq
                    else:
                        z[i - padding_size] = tf.concat([z[i - padding_size], z_iq], axis=1)

            z = tf.stack(z, axis=1)
            values, indices = tf.nn.top_k(z, sorted=False, k=emb_dim)

            # highway layer: gate between the raw embedding and the conv features
            X = tf.unstack(word_out, axis=1)
            Conv_X = tf.unstack(values, axis=1)
            X_hat = []
            W_t = tf.get_variable("W_t", shape=[emb_dim, emb_dim])
            b_t = tf.get_variable("b_t", shape=[emb_dim])
            for x, conv_x in zip(X, Conv_X):
                T_x = tf.sigmoid(tf.nn.xw_plus_b(x, W_t, b_t))
                X_hat.append(tf.multiply(conv_x, T_x) + tf.multiply(x, 1 - T_x))
            X_hat = tf.stack(X_hat, axis=1)

            emb_set.append(X_hat)

        if len(emb_set) > 1:
            # concatenate all feature embeddings
            # (characters, radicals, n-grams, glyph features, ...)
            emb_out = tf.concat(axis=2, values=emb_set)
        else:
            emb_out = emb_set[0]

        # rnn_out is the forward RNN output concatenated with the backward RNN output
        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(self.highway(emb_out, "tag"),
                                                                     input_sentences)

        # fully connected layer (Wx+b) producing the final tag scores
        output = output_wrapper(rnn_out)
        # why [output] rather than output? (kept for downstream list handling — TODO confirm)
        self.output.append([output])

        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx

        # language model head over the character embeddings only (emb_set[0])
        lm_rnn_dim = rnn_dim
        with tf.variable_scope('LM-BiRNN'):
            if gru:
                lm_fw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                lm_bw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
            else:
                lm_fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
            if rnn_num > 1:
                lm_fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_fw_rnn_cell] * rnn_num, state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_bw_rnn_cell] * rnn_num, state_is_tuple=True)
        lm_rnn_output = BiLSTM(lm_rnn_dim, fw_cell=lm_fw_rnn_cell, bw_cell=lm_bw_rnn_cell, p=dr,
                               name='LM-BiLSTM' + str(bucket), scope='LM-BiRNN')(self.highway(emb_set[0]),
                                                                                 input_sentences)
        lm_output_wrapper = TimeDistributed(
            HiddenLayer(lm_rnn_dim * 2, self.nums_chars + 2, activation='linear', name='lm_hidden'),
            name='lm_wrapper')
        lm_final_output = lm_output_wrapper(lm_rnn_output)
        self.lm_predictions.append([lm_final_output])
        self.lm_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket],
                                                     name='lm_targets' + str(bucket))])

        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    # sanity check: one output / target / LM head per bucket
    assert \
        len(self.input_v) == len(self.output) and \
        len(self.output) == len(self.output_) and \
        len(self.lm_predictions) == len(self.lm_groundtruthes) and \
        len(self.output) == len(self.counts)

    self.params = tf.trainable_variables()

    self.saver = tf.train.Saver()
def build_model(self, load_model=None):
    """Assemble and compile the ESIM-style sentence-pair model.

    :param load_model: optional path to a weights file; when given, the
        pre-trained weights are loaded into the compiled model.
    :return: the compiled Keras ``Model`` taking [premise, hypothesis]
        id sequences and producing class predictions.
    """
    # Integer token-id inputs for the two sentences.
    premise = Input(shape=(self.max_length, ), dtype='int32', name='words_1')
    hypothesis = Input(shape=(self.max_length, ), dtype='int32', name='words_2')

    # ------- Embedding Layer -------
    # One shared embedding layer, initialised from the pre-trained (GloVe) matrix.
    embed = EmbeddingLayer(self.vocab_size, self.embedding_size, self.max_length,
                           self.hidden_unit, init_weights=self.embedding_matrix,
                           dropout=self.dropout_rate, nr_tune=5000)
    premise_emb = embed(premise)
    hypothesis_emb = embed(hypothesis)

    # ------- BiLSTM Layer -------
    # Each sentence is encoded in context by its own BiLSTM.
    premise_enc = BiLSTM_Layer(self.max_length, self.hidden_unit)(premise_emb)
    hypothesis_enc = BiLSTM_Layer(self.max_length, self.hidden_unit)(hypothesis_emb)

    # ------- Attention Layer -------
    # Raw attention scores between every premise/hypothesis token pair.
    raw_attention = Lambda(attention, attention_output,
                           name='attention')([premise_enc, hypothesis_enc])

    # ------- Soft-Alignment Layer -------
    # Softmax-normalised attention softly aligns each word of one sentence
    # with the content of the other.
    aligned_premise = Lambda(attention_softmax3d, attention_softmax3d_output,
                             name='soft_alignment_a')([raw_attention, hypothesis_enc])
    aligned_hypothesis = Lambda(attention_softmax3d, attention_softmax3d_output,
                                name='soft_alignment_b')([raw_attention, premise_enc])

    # ------- Enhancement Layer -------
    # Difference and element-wise product of each encoding with its alignment
    # sharpen local inference information (e.g. contradiction cues).
    diff_premise = Lambda(substract, substract_output,
                          name='substract_a')([premise_enc, aligned_premise])
    prod_premise = Lambda(multiply, multiply_output,
                          name='multiply_a')([premise_enc, aligned_premise])
    diff_hypothesis = Lambda(substract, substract_output,
                             name='substract_b')([hypothesis_enc, aligned_hypothesis])
    prod_hypothesis = Lambda(multiply, multiply_output,
                             name='multiply_b')([hypothesis_enc, aligned_hypothesis])

    # shape=(batch_size, time-steps, 4 * units)
    enhanced_premise = merge([premise_enc, aligned_premise, diff_premise, prod_premise],
                             mode='concat')
    enhanced_hypothesis = merge([hypothesis_enc, aligned_hypothesis, diff_hypothesis, prod_hypothesis],
                                mode='concat')

    # ------- Composition Layer -------
    composed_premise = Composition_Layer(self.hidden_unit, self.max_length)(enhanced_premise)
    composed_hypothesis = Composition_Layer(self.hidden_unit, self.max_length)(enhanced_hypothesis)

    # ------- Pooling Layer -------
    preds = Pooling_Layer(self.hidden_unit, self.n_classes, dropout=self.dropout_rate,
                          l2_weight_decay=self.l2_weight_decay)(composed_premise, composed_hypothesis)

    model = Model(inputs=[premise, hypothesis], outputs=[preds])
    model.compile(optimizer=Adam(lr=self.learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    if load_model is not None:
        print('Loading pre-trained weights from \'{}\'...'.format(load_model))
        model.load_weights(load_model)
    return model
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, emb=None,
               ngram_embedding=None):
    """Build the bucketed tagging graph with optional co-trained LM and
    character-frequency heads.

    :param trained_model: path where the model hyper-parameters are stored
    :param scope: outer tf.variable_scope used for variable reuse across buckets
    :param emb_dim: per-character embedding dimension
    :param gru: use GRU cells if true, otherwise LSTM cells
    :param rnn_dim: RNN hidden dimension
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout rate
    :param emb: pretrained character embeddings
    :param ngram_embedding: pretrained n-gram embedding files
    :return: None (populates self.input_v, self.output, self.output_, self.params, self.saver, ...)
    """
    # trained_model: model storage path
    if trained_model is not None:
        param_dic = {
            'nums_chars': self.nums_chars,
            'nums_tags': self.nums_tags,
            'tag_scheme': self.tag_scheme,
            'crf': self.crf,
            'emb_dim': emb_dim,
            'gru': gru,
            'rnn_dim': rnn_dim,
            'rnn_num': rnn_num,
            'drop_out': drop_out,
            'buckets_char': self.buckets_char,
            'ngram': self.ngram
        }
        print "RNN dimension is %d" % rnn_dim
        print "RNN number is %d" % rnn_num
        print "Character embedding size is %d" % emb_dim

        # persist the model hyper-parameters
        if self.metric == 'All':
            # rindex() returns the last occurrence of the substring,
            # used here to split off the model file name
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(
                    trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # character embedding layer
    # why add 500 to the character count? (vocabulary headroom — TODO confirm)
    # emb_dim is the per-character feature dimension, settable from the command line
    # weights are the pretrained character embeddings, settable from the command line
    self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

    if self.ngram is not None:
        if ngram_embedding is not None:
            assert len(ngram_embedding) == len(self.ngram)
        else:
            ngram_embedding = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ngram_embedding[i],
                               name=str(i + 2) + 'gram_layer'))

    # hidden layer: input is the forward RNN output concatenated with the
    # backward RNN output, hence rnn_dim * 2; output size is the tag count
    tag_output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='tag_hidden'),
        name='tag_output_wrapper')

    if self.char_freq_loss:
        # auxiliary head: one sigmoid unit per position predicting character frequency
        freq_output_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim * 2, 1, activation='sigmoid', name='freq_hidden'),
            name='freq_output_wrapper')

    if self.co_train:
        # co-trained language-model heads: one over the forward RNN stream,
        # one over the backward stream; vocabulary is nums_chars + 2
        lm_fw_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim, self.nums_chars + 2, activation='linear', name='lm_fw_hidden'),
            name='lm_fw_wrapper')
        lm_bw_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim, self.nums_chars + 2, activation='linear', name='lm_bw_hidden'),
            name='lm_bw_wrapper')

    # define model for each bucket
    # sentences in each bucket share one length, so each bucket gets its own model
    # bucket: sentence length of the bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # scope is tf.variable_scope("tagger", reuse=None, initializer=initializer);
            # reuse only needs to be enabled once — every later bucket reuses too
            scope.reuse_variables()
        t1 = time()

        # input sentences as integer id vectors
        # shape = (batch_size, sentence length)
        input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_sentences])

        emb_set = []
        # look up character embeddings from ids
        word_out = self.emb_layer(input_sentences)
        emb_set.append(word_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if len(emb_set) > 1:
            # concatenate all feature embeddings (characters, n-grams, ...)
            word_embeddings = tf.concat(axis=2, values=emb_set)
        else:
            word_embeddings = emb_set[0]

        # forward and backward RNN outputs, kept separate (concat_output=False)
        rnn_out_fw, rnn_out_bw = BiRNN(rnn_dim, p=dr, concat_output=False, gru=gru,
                                       name='BiLSTM' + str(bucket), scope='Tag-BiRNN')(word_embeddings,
                                                                                       input_sentences)

        tag_rnn_out_fw, tag_rnn_out_bw = rnn_out_fw, rnn_out_bw
        if self.co_train:
            # task-specific highway transforms on top of the shared RNN streams
            if self.highway_layers > 0:
                tag_rnn_out_fw = highway_network(rnn_out_fw, self.highway_layers, True, is_train=True,
                                                 scope="tag_fw")
                tag_rnn_out_bw = highway_network(rnn_out_bw, self.highway_layers, True, is_train=True,
                                                 scope="tag_bw")
        tag_rnn_out = tf.concat(values=[tag_rnn_out_fw, tag_rnn_out_bw], axis=2)

        # fully connected layer (Wx+b) producing the final tag scores
        output = tag_output_wrapper(tag_rnn_out)
        # why [output] rather than output? (kept for downstream list handling — TODO confirm)
        self.output.append([output])

        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))
        ])
        self.bucket_dit[bucket] = idx

        if self.co_train:
            # language model heads share the tagger's RNN streams
            lm_rnn_out_fw, lm_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.highway_layers > 0:
                lm_rnn_out_fw = highway_network(rnn_out_fw, self.highway_layers, True, is_train=True,
                                                scope="lm_fw")
                lm_rnn_out_bw = highway_network(rnn_out_bw, self.highway_layers, True, is_train=True,
                                                scope="lm_bw")
            self.lm_fw_predictions.append([lm_fw_wrapper(lm_rnn_out_fw)])
            self.lm_bw_predictions.append([lm_bw_wrapper(lm_rnn_out_bw)])
            self.lm_fw_groundtruthes.append([
                tf.placeholder(tf.int32, [None, bucket], name='lm_fw_targets' + str(bucket))
            ])
            self.lm_bw_groundtruthes.append([
                tf.placeholder(tf.int32, [None, bucket], name='lm_bw_targets' + str(bucket))
            ])

        if self.char_freq_loss:
            # character-frequency head, also on top of the shared RNN streams
            freq_rnn_out_fw, freq_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.highway_layers > 0:
                freq_rnn_out_fw = highway_network(rnn_out_fw, self.highway_layers, True, is_train=True,
                                                  scope="freq_fw")
                freq_rnn_out_bw = highway_network(rnn_out_bw, self.highway_layers, True, is_train=True,
                                                  scope="freq_bw")
            freq_rnn_out = tf.concat(values=[freq_rnn_out_fw, freq_rnn_out_bw], axis=2)
            self.char_freq_groundtruthes.append([
                tf.placeholder(tf.float32, [None, bucket], name='freq_targets_%d' % bucket)
            ])
            self.char_freq_predictions.append(
                [freq_output_wrapper(freq_rnn_out)])

        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    # sanity check: one output / target list per bucket
    assert \
        len(self.input_v) == len(self.output) and \
        len(self.output) == len(self.output_) and \
        len(self.output) == len(self.counts)

    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()