def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # Define decoder inputs: shift the target right and prepend <S> (id 2).
        self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        # Load vocabulary
        en2idx, idx2en = load_en_vocab()
        zh2idx, idx2zh = load_zh_vocab()

        # Initialize transformer (pass `is_training` directly; `self.is_training` was never set).
        transformer = vanilla_transformer(hp, is_training)
        self.enc = transformer.encode(self.x, len(en2idx))

        # Decoder
        self.dec = transformer.decode(self.decoder_inputs, self.enc, len(zh2idx), hp.maxlen)

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(zh2idx))
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)
        tf.summary.scalar('acc', self.acc)

        if is_training:
            # Loss with label smoothing; padding positions are masked out via `istarget`.
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(zh2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    self.vocab_size = len(load_doc_vocab()[0])  # load_doc_vocab returns (de2idx, idx2de)

    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.article_maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.summary_maxlen))

        # Define decoder inputs: shift the target right and prepend <S> (id 2).
        self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        self._add_encoder(is_training=is_training)
        self.ml_loss = self._add_ml_loss(is_training=is_training)
        self.loss = self.ml_loss

        if is_training:
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)

            grads_and_vars_ml = self.optimizer.compute_gradients(loss=self.ml_loss)
            grad_ml, vars_ml = zip(*grads_and_vars_ml)  # split gradients and variables

            # Gradient clipping by global norm
            clipped_grad_ml, globle_norm_ml = tf.clip_by_global_norm(grad_ml, hp.maxgradient)
            self.globle_norm_ml = globle_norm_ml
            self.train_op_ml = self.optimizer.apply_gradients(
                grads_and_vars=zip(clipped_grad_ml, vars_ml), global_step=self.global_step)
            '''
            # training without gradient clipping
            self.train_op_ml = self.optimizer.apply_gradients(grads_and_vars=grads_and_vars_ml,
                                                              global_step=self.global_step)
            '''

            # Summary
            tf.summary.scalar('globle_norm_ml', globle_norm_ml)
            tf.summary.scalar('loss', self.loss)
            self.merged = tf.summary.merge_all()
            self.filewriter = tf.summary.FileWriter(hp.tb_dir + '/train', self.graph)
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Inputs
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, 9, 9)
        else:
            self.x = tf.placeholder(tf.float32, (None, 9, 9))
            self.y = tf.placeholder(tf.int32, (None, 9, 9))

        self.enc = tf.expand_dims(self.x, axis=-1)  # (N, 9, 9, 1)
        self.istarget = tf.to_float(tf.equal(self.x, tf.zeros_like(self.x)))  # 0: blanks

        # Network
        for i in range(hp.num_blocks):
            with tf.variable_scope("conv2d_{}".format(i)):
                self.enc = conv(self.enc, filters=hp.num_filters, size=hp.filter_size,
                                is_training=is_training, norm_type="bn", activation_fn=tf.nn.relu)

        # Outputs
        self.logits = conv(self.enc, 10, 1, scope="logits")  # (N, 9, 9, 10)
        self.probs = tf.reduce_max(tf.nn.softmax(self.logits), axis=-1)  # (N, 9, 9)
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))  # (N, 9, 9)

        # Accuracy, measured only on the blank cells
        self.hits = tf.to_float(tf.equal(self.preds, self.y)) * self.istarget
        self.acc = tf.reduce_sum(self.hits) / (tf.reduce_sum(self.istarget) + 1e-8)
        tf.summary.scalar("acc", self.acc)

        if is_training:
            # Loss
            self.ce = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits)
            self.loss = tf.reduce_sum(self.ce * self.istarget) / tf.reduce_sum(self.istarget)

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
            self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)

            tf.summary.scalar("loss", self.loss)
            self.merged = tf.summary.merge_all()
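# A minimal inference sketch for the Sudoku network above. This is a sketch only: it assumes
# the __init__ belongs to a class named `Graph` and that checkpoints are saved under hp.logdir;
# neither name is confirmed by the snippet itself.
import numpy as np

def solve(puzzle):
    """puzzle: (9, 9) numpy array with 0 for blank cells."""
    g = Graph(is_training=False)
    with g.graph.as_default():
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            preds = sess.run(g.preds, {g.x: puzzle[None, :, :].astype(np.float32)})
    # Keep the given clues; fill only the blanks with the model's predictions.
    return np.where(puzzle == 0, preds[0], puzzle)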
def Graph(self):
    graph = tf.Graph()
    with graph.as_default():
        if self.is_training:
            next_element, iterator, num_batch = get_batch_data(self.is_training)
            self.X, self.Y, self.seq_len = next_element["X"], next_element["Y"], next_element["seq_len"]
        else:
            self.X = tf.placeholder(tf.int32, shape=(None, config.maxlen))
            self.Y = tf.placeholder(tf.int32, shape=(None, config.maxlen))
            self.seq_len = tf.placeholder(tf.int32, shape=(None))

        idx2word, word2idx, idx2labl, labl2idx = load_vocab()

        # Word embedding (+ positional information)
        embed = embedding(self.X, len(word2idx), config.embed_dim, config.use_pretrain)
        if config.embeddig_mode == "concat":
            assert config.embed_dim == config.position_dim
            # TODO: still unclear how best to combine word and position embeddings in concat mode.
        elif config.embeddig_mode == "add":
            embed += position_encoding(self.X, config.position_dim, config.sinusoid)

        # Input embedding dropout
        embed = tf.layers.dropout(embed, rate=config.dropout_rate, training=self.is_training)

        # Multi-layer BiLSTM
        outputs = multibilstm(embed, self.seq_len, config.num_units, config.num_layer,
                              self.is_training, config.cell)

        # Two fully connected layers; residual connections and activations are configurable.
        outputs = feedforward(outputs, outputs.get_shape().as_list()[2], scope="first")  # residual used by default
        outputs = feedforward(outputs, config.num_class, residual=False, scope="second")
        noutput = tf.reshape(outputs, [-1, config.maxlen, config.num_class])

        # CRF layer (or a plain softmax loss layer)
        if config.use_crf:
            loss, acc, predicts, true_labels = crf_layer(self.Y, noutput, config.num_class,
                                                         self.seq_len, self.is_training)
        else:
            loss, acc, predicts, true_labels = loss_layer(self.Y, noutput, config.num_class)
        tf.summary.scalar('acc', acc)

        global_step = tf.Variable(0, name='global_step')
        if self.is_training:
            # Exponential decay can help the model converge faster.
            learning_rate = config.lr
            if config.exponential_decay:
                learning_rate = tf.train.exponential_decay(config.lr, global_step, 200, 0.96, staircase=True)
            # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.99, epsilon=1e-8)
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)  # originally config.lr; the decayed rate was computed but unused
            train_op = optimizer.minimize(loss, global_step=global_step)
            tf.summary.scalar('mean_loss', loss)
        else:
            train_op = None

    return graph, train_op, loss, acc, predicts, true_labels, global_step
def __init__(self, is_training):
    self.de2idx, _idx2de = load_de_vocab()
    self.en2idx, _idx2en = load_en_vocab()
    self.is_training = is_training
    self.graph = tf.Graph()
    with self.graph.as_default():
        if self.is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
        # Note: these sum the token ids themselves, not the count of non-padding tokens.
        self.x_len = tf.reduce_sum(self.x, axis=-1)
        self.y_len = tf.reduce_sum(self.y, axis=-1)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.batch_size = tf.shape(self.x)[0]
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Load data
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, 60))

        # Load vocabulary
        nucl2idx, idx2nucl = load_vocab()

        # Encoder
        ## Embedding
        enc = embedding(self.x, zero_pad=False, vocab_size=len(nucl2idx),
                        num_units=hp.hidden_units, scale=False, scope="enc_embed")

        # Encoder pre-net
        prenet_out = prenet(enc, num_units=[hp.hidden_units, hp.hidden_units // 2],
                            dropout_rate=hp.dropout_rate, is_training=is_training)  # (N, T, E/2)

        # Encoder CBHG
        ## Conv1D bank
        enc = conv1d_banks(prenet_out, K=hp.encoder_num_banks, num_units=hp.hidden_units // 2,
                           norm_type=hp.norm_type, is_training=is_training)  # (N, T, K * E / 2)

        # ### Max pooling
        # enc = tf.layers.max_pooling1d(enc, 2, 2, padding="same")  # (N, T, K * E / 2)

        ### Conv1D projections
        enc = conv1d(enc, hp.hidden_units // 2, 3, scope="conv1d_1")  # (N, T, E/2)
        enc = normalize(enc, type=hp.norm_type, is_training=is_training,
                        activation_fn=tf.nn.relu, scope="norm1")
        enc = conv1d(enc, hp.hidden_units // 2, 3, scope="conv1d_2")  # (N, T, E/2)
        enc = normalize(enc, type=hp.norm_type, is_training=is_training,
                        activation_fn=tf.nn.relu, scope="norm2")
        enc += prenet_out  # (N, T, E/2) # residual connection

        ### Highway nets
        for i in range(hp.num_highwaynet_blocks):
            enc = highwaynet(enc, num_units=hp.hidden_units // 2,
                             scope='highwaynet_{}'.format(i))  # (N, T, E/2)

        # Final linear projection
        _, T, E = enc.get_shape().as_list()
        enc = tf.reshape(enc, (-1, T * E))
        self.logits = tf.squeeze(tf.layers.dense(enc, 1))

        if is_training:
            # Loss
            if hp.loss_type == "l1":
                self.loss = tf.reduce_mean(tf.abs(self.logits - self.y))
            else:  # l2
                self.loss = tf.reduce_mean(tf.squared_difference(self.logits, self.y))

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
            self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('loss', self.loss)
            tf.summary.merge_all()
# (Tail of a train_step function: its header and the tf.GradientTape context are not shown in this excerpt.)
    pred, _ = transformer(inputs, tar_inp, True, encoder_padding_mask,
                          look_ahead_mask, decoder_padding_mask)
    loss = loss_fun(tar_real, pred)
    # Compute gradients
    gradients = tape.gradient(loss, transformer.trainable_variables)
    # Backpropagate: apply the gradients
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    # Record loss and accuracy
    train_loss(loss)
    train_acc(tar_real, pred)


for epoch in range(hp.EPOCHS):
    start_time = time.time()
    # Reset the metrics at the start of each epoch
    train_loss.reset_states()
    train_acc.reset_states()

    for step, (inputs, targets) in enumerate(get_batch_data()):
        print(inputs)
        train_step(inputs, targets)
        if step % 10 == 0:
            print(' epoch{},step:{}, loss:{:.4f}, acc:{:.4f}'.format(
                epoch, step, train_loss.result(), train_acc.result()))

    if epoch % 2 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('epoch{}, save model at {}'.format(epoch, ckpt_save_path))

    print('epoch:{}, loss:{:.4f}, acc:{:.4f}'.format(epoch, train_loss.result(), train_acc.result()))
    print('time in one epoch:{}'.format(time.time() - start_time))
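# For context: the step body above is usually wrapped like the sketch below. The @tf.function
# decorator, the GradientTape placement, the teacher-forcing split of `targets`, and the
# `create_masks` helper are assumptions, not taken from this file.
@tf.function
def train_step(inputs, targets):
    tar_inp = targets[:, :-1]   # decoder input: target shifted right
    tar_real = targets[:, 1:]   # labels: target shifted left
    encoder_padding_mask, look_ahead_mask, decoder_padding_mask = create_masks(inputs, tar_inp)
    with tf.GradientTape() as tape:
        pred, _ = transformer(inputs, tar_inp, True, encoder_padding_mask,
                              look_ahead_mask, decoder_padding_mask)
        loss = loss_fun(tar_real, pred)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_acc(tar_real, pred)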
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # Define decoder inputs: shift the target right and prepend <S> (id 2).
        self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        # Load vocabulary
        en2idx, idx2en = load_en_vocab()
        ch2idx, idx2ch = load_ch_vocab()

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x, vocab_size=len(en2idx), num_units=hp.hidden_units,
                                 scale=True, scope="enc_embed")

            ## Positional encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x, num_units=hp.hidden_units,
                                                zero_pad=False, scale=False, scope="enc_pe")
            else:
                self.enc += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen, num_units=hp.hidden_units,
                    zero_pad=False, scale=False, scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(self.enc, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead attention
                    self.enc = multihead_attention(queries=self.enc, keys=self.enc,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate,
                                                   is_training=is_training, causality=False)
                    ### Feed forward
                    self.enc = feedforward(self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs, vocab_size=len(ch2idx), num_units=hp.hidden_units,
                                 scale=True, scope="dec_embed")

            ## Positional encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs, num_units=hp.hidden_units,
                                                zero_pad=False, scale=False, scope="dec_pe")
            else:
                self.dec += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                            [tf.shape(self.decoder_inputs)[0], 1]),
                    vocab_size=hp.maxlen, num_units=hp.hidden_units,
                    zero_pad=False, scale=False, scope="dec_pe")

            ## Dropout
            self.dec = tf.layers.dropout(self.dec, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead attention (self-attention)
                    self.dec = multihead_attention(queries=self.dec, keys=self.dec,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate, is_training=is_training,
                                                   causality=True, scope="self_attention")
                    ## Multihead attention (vanilla attention)
                    self.dec = multihead_attention(queries=self.dec, keys=self.enc,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate, is_training=is_training,
                                                   causality=False, scope="vanilla_attention")
                    ## Feed forward
                    self.dec = feedforward(self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection:
        # project the last dimension to the vocabulary size to get per-token logits,
        # then take the argmax index as the prediction.
        self.logits = tf.layers.dense(self.dec, len(ch2idx))  # (N, T, vocab_len)
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))  # (N, T)
        # Mark every position of y that is not <PAD> with 1.0.
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        # acc = (correctly predicted non-<PAD> tokens in the batch) / (non-<PAD> tokens in the batch).
        # Note: tf.reduce_sum() without an axis sums over all dimensions.
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)
        # Log acc to the summary to monitor training.
        tf.summary.scalar('acc', self.acc)

        if is_training:
            # Loss
            # tf.one_hot(tensor, depth) turns each id into a one-hot row (1. at the id, 0. elsewhere);
            # ids >= depth give an all-zero row. y_smoothed is therefore (N, T, vocab_len).
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(ch2idx)))
            # tf.nn.softmax_cross_entropy_with_logits applies a softmax to the logits, then takes the
            # cross entropy against the label distribution over vocab_len: one loss per token, shape (N, T).
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
            # Drop the loss at <PAD> positions, then average: a scalar.
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data() # (N, T) else: # inference self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) # define decoder inputs self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2:<S> # Load vocabulary de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # Encoder with tf.variable_scope("encoder"): ## Embedding self.enc = embedding(self.x, vocab_size=len(de2idx), num_units=hp.hidden_units, scale=True, scope="enc_embed") ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") else: self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") ## Dropout self.enc = tf.layers.dropout(self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### Multihead Attention self.enc = multihead_attention(queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) ### Feed Forward self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units]) # Decoder with tf.variable_scope("decoder"): ## Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") ## Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") ## Dropout self.dec = tf.layers.dropout(self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec = multihead_attention(queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") ## Multihead Attention ( vanilla attention) self.dec = multihead_attention(queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units]) # Final linear projection self.logits = tf.layers.dense(self.dec, len(en2idx)) self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) self.istarget = tf.to_float(tf.not_equal(self.y, 0)) self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget)) tf.summary.scalar('acc', self.acc) if is_training: # Loss self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx))) self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed) self.mean_loss = 
tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget)) # Training Scheme self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) # Summary tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
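# A minimal sketch of how a graph like the one above is typically driven during training,
# modeled on the tf.train.Supervisor loop that appears further down in this file
# (hp.logdir and hp.num_epochs are taken from that loop; the class name `Graph` and the
# checkpoint naming are assumptions).
if __name__ == '__main__':
    g = Graph(is_training=True)
    print("Graph loaded")
    sv = tf.train.Supervisor(graph=g.graph, logdir=hp.logdir, save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
            for _ in range(g.num_batch):
                sess.run(g.train_op)  # one optimization step on the next queued batch
            gs = sess.run(g.global_step)
            sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))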
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x1, self.x2, self.y, self.num_batch = get_batch_data()
            # self.x, self.label, self.num_batch = get_batch_data()  # (N, T)
            # self.y = tf.one_hot(self.label, depth=hp.n_class)
        else:  # inference
            self.x1 = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.x2 = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            # self.label = tf.placeholder(tf.int32, shape=(None, hp.n_class))
            # self.y = tf.placeholder(tf.int32, shape=(None, hp.n_class))
            # self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        self.l2_loss = tf.constant(0.0)

        # Define decoder inputs.
        # For the sentence-relationship task we encode sent1 to e1, then decode (e1 + sent2)
        # to capture a richer semantic relationship across the corpus.
        self.decoder_inputs = tf.concat((tf.ones_like(self.x2[:, :1]) * 2, self.x2[:, :-1]), -1)  # 2:<S>

        # Load vocabulary
        word2idx, idx2word = load_vocabs()

        # Initialize transformer (pass `is_training` directly; `self.is_training` was never set).
        transformer = vanilla_transformer(hp, is_training)

        # Encode both sentences
        self.encode1 = transformer.encode(self.x1, len(word2idx))
        self.encode2 = transformer.encode(self.x2, len(word2idx))

        # Combine the two encodings (average), then normalize
        self.enc = tf.divide(tf.add(self.encode1, self.encode2), 2)
        self.enc = normalize(self.enc)

        # Decoder
        self.dec = transformer.decode(self.decoder_inputs, self.enc, len(word2idx), hp.p_maxlen)

        self.logits = tf.add(self.enc, tf.multiply(self.enc, self.dec))
        # self.logits = self.enc
        # self.logits = tf.layers.dense(self.logits, 64, activation='tanh')
        self.logits = tf.layers.flatten(self.logits)
        # self.logits = tf.reshape(self.logits, [64, -1])
        self.h_drop = tf.nn.dropout(self.logits, hp.dropout_keep_prob)

        with tf.name_scope("output_logit"):
            W = tf.get_variable("W", shape=[hp.maxlen * hp.hidden_units, len(hp.relations)],
                                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[len(hp.relations)]), name="b")
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logit")
            # self.preds = tf.argmax(self.scores, 1, name="predictions")
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))

        if is_training:
            self.y_hotting = tf.one_hot(self.y, depth=len(hp.relations))

            # Accuracy
            self.cpl = tf.equal(tf.convert_to_tensor(self.y, tf.int32), self.preds)
            self.cpl = tf.to_int32(self.cpl)
            self.acc = tf.reduce_sum(self.cpl) / tf.to_int32(tf.reduce_sum(self.y_hotting))
            tf.summary.scalar('acc', self.acc)

            # Loss
            # self.y_smoothed = label_smoothing(self.y_hotting)
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_hotting)
            self.mean_loss = (tf.reduce_sum(self.loss) + self.l2_loss * hp.reg_lambda) / tf.reduce_sum(self.y_hotting)

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.q, self.p, self.q_length, self.p_length, \
                self.start_label, self.end_label, self.num_batch = get_batch_data()
            self.dropout_keep_prob = hp.dropout_keep_prob
        else:  # inference
            self.q = tf.placeholder(tf.int32, [None, hp.q_maxlen])
            self.p = tf.placeholder(tf.int32, [None, hp.p_maxlen])
            self.q_length = tf.placeholder(tf.int32, [None])
            self.p_length = tf.placeholder(tf.int32, [None])
            self.start_label = tf.placeholder(tf.int32, [None])
            self.end_label = tf.placeholder(tf.int32, [None])
            self.dropout_keep_prob = hp.dropout_keep_prob

        self.l2_loss = tf.constant(0.0)

        # Define decoder input: shift the passage right and prepend <S> (id 2).
        self.decoder_inputs = tf.concat((tf.ones_like(self.p[:, :1]) * 2, self.p[:, :-1]), -1)

        # Load vocabulary
        word2idx, idx2word = load_vocabs()

        # Initialize transformer (pass `is_training` directly; `self.is_training` was never set).
        transformer = vanilla_transformer(hp, is_training)

        ### Encode. Note: as written, both calls encode the question; p_encodes is overwritten
        ### with the decoder output below, so the passage never goes through the encoder here.
        self.q_encodes = transformer.encode(self.q, len(word2idx))
        self.p_encodes = transformer.encode(self.q, len(word2idx))

        # Pad q_encodes up to the passage length so it can be attended with p.
        pad_dim = hp.p_maxlen - hp.q_maxlen
        pad_ = tf.zeros([tf.shape(self.q_encodes)[0], pad_dim, hp.hidden_units], dtype=self.q_encodes.dtype)
        self.padded_q_encodes = tf.concat([self.q_encodes, pad_], 1)
        # Normalization
        self.padded_q_encodes = normalize(self.padded_q_encodes)

        # Decoder
        self.dec = transformer.decode(self.decoder_inputs, self.padded_q_encodes, len(word2idx), hp.p_maxlen)

        # Use the decoder output as the passage representation.
        self.p_encodes = self.dec

        """
        The core of the RC model: get the question-aware passage encoding.
        """
        match_layer = AttentionFlowMatchLayer(hp.hidden_units)
        self.match_p_encodes, _ = match_layer.match(self.p_encodes, self.q_encodes,
                                                    self.p_length, self.q_length)

        # Pooling or bi-RNN to fuse the passage encodings
        if hp.Passage_fuse == 'Pooling':
            # Pooling layer
            self.match_p_encodes = tf.keras.layers.MaxPool1D(
                pool_size=4, strides=None, padding='valid')(self.match_p_encodes)
            self.match_p_encodes = tf.reshape(self.match_p_encodes, [-1, hp.p_maxlen, hp.hidden_units])
            # Normalization
            self.match_p_encodes = tf.layers.batch_normalization(self.match_p_encodes)
            if hp.use_dropout:
                self.match_p_encodes = tf.nn.dropout(self.match_p_encodes, self.dropout_keep_prob)
        elif hp.Passage_fuse == 'bi-rnn':
            self.fuse_p_encodes, _ = rnn('bi-lstm', self.match_p_encodes, self.p_length,
                                         hp.hidden_units, layer_num=1, concat=False)
            if hp.use_dropout:
                self.fuse_p_encodes = tf.nn.dropout(self.fuse_p_encodes, self.dropout_keep_prob)

        decoder = PointerNetDecoder(hp.hidden_units)
        self.start_probs, self.end_probs = decoder.decode(self.match_p_encodes, self.q_encodes)

        if is_training:
            self.start_loss = self.sparse_nll_loss(probs=self.start_probs, labels=self.start_label)
            self.end_loss = self.sparse_nll_loss(probs=self.end_probs, labels=self.end_label)
            self.all_params = tf.trainable_variables()
            self.loss = tf.reduce_mean(tf.add(self.start_loss, self.end_loss))
            if hp.weight_decay > 0:
                with tf.variable_scope('l2_loss'):
                    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.all_params])
                self.loss += hp.weight_decay * l2_loss

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('mean_loss', self.loss)
            self.merged = tf.summary.merge_all()
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data() else: self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) # define decoder inputs # id = 2代表<S>,是decoder的初始输入,这一步把正常的y向量做转换,比如y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]]修改为 # [["<s>", "i", "love", "china"], ["<s>, "can", "you", "speak"]], 这部分将在decoder阶段,最先输入self-attention部分 # 在训练阶段,decoder_inputs如上,在inference阶段,由于无法获知真正的y,所以y输入的是shape=[batch_size, max_length]的全0向量。 # 处理之后旧变成[["<s>", 0, 0, 0]]这样子,每次值取第一个预测结果,循环输入再取前两个结果 self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1) de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() with tf.variable_scope("encoder"): # Embedding self.enc = embedding( self.x, vocab_size=len(de2idx), num_units=hp.hidden_units, zero_pad= True, # id为0的行表示padding的embedding, true表示将这一行置0(随机初始化出来的可能不是0) scale=True, scope="enc_embed") ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope='enc_pe') else: self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") ## Dropout self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks, 叠加block,6个 for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### MultiHead Attention self.enc = multihead_attention( queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) with tf.variable_scope("decoder"): # Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") # Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile( tf.expand_dims( tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") # Dropout self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) # Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec = multihead_attention( queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") ## Multihead Attention ( vanilla attention) self.dec = multihead_attention( queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward self.dec = feedforward( self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Final linear projection, 分类任务,分类数量是词表长度 self.logits = tf.layers.dense(self.dec, len(en2idx)) self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1)) self.istarget = tf.to_float(tf.not_equal(self.y, 0)) self.acc = 
tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)

        if is_training:
            # Loss
            # Label smoothing: the 0s of the one-hot targets become a small value, the 1s a value close to 1.
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def __init__(self, is_training):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.is_training = is_training
        if self.is_training:
            self.next_element, num_batch = get_batch_data(self.is_training)
            self.X, self.Y, self.seq_len = (self.next_element["X"], self.next_element["Y"],
                                            self.next_element["seq_len"])
            self.X.set_shape([None, config.maxlen, config.bert_dim])
            self.Y.set_shape([None, config.maxlen])
            self.seq_len.set_shape([None])
        else:
            self.X = tf.placeholder(tf.float32, shape=(None, config.maxlen, config.bert_dim))
            self.Y = tf.placeholder(tf.int32, shape=(None, config.maxlen))
            self.seq_len = tf.placeholder(tf.int32, shape=(None))

        idx2word, word2idx, idx2labl, labl2idx = load_vocab()

        embed = tf.convert_to_tensor(self.X)
        # Input embedding dropout
        embed = tf.layers.dropout(embed, rate=config.dropout_rate, training=self.is_training)

        # Multi-layer BiLSTM
        outputs = multibilstm(embed, self.seq_len, config.num_units, config.num_layer,
                              self.is_training, config.cell)

        # Two fully connected layers; residual connections and activations are configurable.
        outputs = feedforward(outputs, outputs.get_shape().as_list()[2], scope="first")  # residual used by default
        outputs = feedforward(outputs, config.num_class, residual=False, scope="second")
        noutput = tf.reshape(outputs, [-1, config.maxlen, config.num_class])

        # CRF layer (or a plain softmax loss layer)
        if config.use_crf:
            self.loss, self.acc, self.predicts, self.true_labels = crf_layer(
                self.Y, noutput, config.num_class, self.seq_len, self.is_training)
        else:
            self.loss, self.acc, self.predicts, self.true_labels = loss_layer(
                self.Y, noutput, config.num_class)
        tf.summary.scalar('acc', self.acc)

        self.global_step = tf.Variable(0, name='global_step')
        if self.is_training:
            # Exponential decay can help the model converge faster.
            learning_rate = config.lr
            if config.exponential_decay:
                learning_rate = tf.train.exponential_decay(config.lr, self.global_step, 200, 0.96,
                                                           staircase=True)
            # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.99, epsilon=1e-8)
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)  # originally config.lr; the decayed rate was computed but unused
            self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)
            tf.summary.scalar('mean_loss', self.loss)
        else:
            self.train_op = None  # originally assigned to a local `train_op` that was never stored
def __init__(self, hp, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.x_image, self.y_image, self.x_length, self.y, self.num_batch, self.source, self.target, self.x_turn_number, self.x_emotion, self.y_emotion, self.speaker, self.A = get_batch_data( hp) # (N, T) else: # inference self.x = tf.placeholder( tf.int32, shape=(None, hp.max_turn, hp.maxlen)) # shape=(16, 15, 50) self.x_image = tf.placeholder(tf.float32, shape=(None, hp.max_turn, 17)) self.y_image = tf.placeholder(tf.float32, shape=(None, 17)) self.x_length = tf.placeholder(tf.int32, shape=(None, hp.max_turn)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.x_emotion = tf.placeholder(tf.int32, shape=(None, hp.max_turn)) self.y_emotion = tf.placeholder(tf.int32, shape=(None, )) self.speaker = tf.placeholder(tf.int32, shape=(None, )) self.A = tf.placeholder(tf.float32, shape=(None, 7, 90, 90)) self.x_turn_number = tf.placeholder(tf.int32, shape=(None, )) # define decoder inputs self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1) # 2:<S> # Load vocabulary # de2idx, idx2de = load_de_vocab(hp) en2idx, idx2en = load_en_vocab(hp) speaker_memory = tf.get_variable( 'speaker_memory', dtype=tf.float32, shape=[13, hp.hidden_units], initializer=tf.contrib.layers.xavier_initializer()) emotion_memory = tf.get_variable( 'emotion_memory', dtype=tf.float32, shape=[7, hp.hidden_units], initializer=tf.contrib.layers.xavier_initializer()) outputs_speaker = tf.nn.embedding_lookup(speaker_memory, self.speaker) outputs_speaker_ = tf.tile(tf.expand_dims(outputs_speaker, 1), [1, 50, 1]) # Encoder with tf.variable_scope("encoder"): ## Embedding embeddingsize = hp.hidden_units / 2 self.enc_embed = embedding( tf.reshape( self.x, [-1, hp.maxlen ]), #batch_size*max_turn=240 shape=(240, 50, 256) vocab_size=len(de2idx), num_units=embeddingsize, scale=True, scope="enc_embed") single_cell = tf.nn.rnn_cell.GRUCell(hp.hidden_units) self.rnn_cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * hp.num_layers) print(self.enc_embed.get_shape()) self.sequence_length = tf.reshape(self.x_length, [-1]) print(self.sequence_length.get_shape()) self.uttn_outputs, self.uttn_states = tf.nn.dynamic_rnn( cell=self.rnn_cell, inputs=self.enc_embed, sequence_length=self.sequence_length, dtype=tf.float32, swap_memory=True) print(hp.batch_size, hp.max_turn, hp.hidden_units) self.enc = tf.reshape( self.uttn_states, [hp.batch_size, hp.max_turn, hp.hidden_units ]) #shape=(16, 15, 512) ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") else: self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") ## Dropout self.enc = tf.layers.dropout( self.enc, #shape=(32, 15, 512), rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) print('self.enc=', self.enc) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### Multihead Attention self.enc, _ = multihead_attention( queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) ### Feed Forward self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) #shape=(32, 15, 512), #code.interact(local=locals()) matrix = 
tf.get_variable("transform", [ self.x_image.shape.as_list()[-1], self.enc.shape.as_list()[-1] ], dtype=tf.float32) self.x_ima = tf.map_fn(lambda x: tf.matmul(x, matrix), self.x_image, dtype=tf.float32) #code.interact(local=locals()) self.enc = tf.concat((self.enc, self.x_ima), -2) s_m = tf.tile(tf.expand_dims(speaker_memory, 0), [hp.batch_size, 1, 1]) e_m = tf.tile(tf.expand_dims(emotion_memory, 0), [hp.batch_size, 1, 1]) self.enc = tf.concat((self.enc, e_m), -2) self.enc = tf.concat((self.enc, s_m), -2) self.H1 = HGraph(256, activation='relu')([self.enc, self.A]) self.H1 = Dropout(hp.dropout_rate)(self.H1) self.H2 = HGraph(256, activation='relu')([self.H1, self.A]) self.enc = Dropout(hp.dropout_rate)(self.H2) self.enc = tf.map_fn(lambda x: x, self.enc, dtype=tf.float32) self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) with tf.variable_scope("emotion"): x3 = tf.reduce_max(self.enc, axis=1) self.emotion_logits = linear(x3, 7, True, False, scope="softmax") outputs_emotion = tf.matmul(self.emotion_logits, emotion_memory) outputs_emotion_ = tf.tile(tf.expand_dims(outputs_emotion, 1), [1, 50, 1]) #shape=(50, 50, 128) # Decoder with tf.variable_scope("decoder"): ## Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") ## Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile( tf.expand_dims( tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") ## Dropout self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) print('self.dec', self.dec) #shape=(50, 50, 512) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec, _ = multihead_attention( queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") print('self.dec', self.dec) #shape=(50, 50, 512) ## Multihead Attention ( vanilla attention) self.dec, self.attn = multihead_attention( queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward print('self.dec', self.dec) #shape=(50, 50, 512) self.dec = feedforward( self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units]) #code.interact(local=locals()) self.dec_emo = tf.concat((outputs_emotion_, outputs_speaker_), -1) self.dec_emo_spe = tf.concat((self.dec, self.dec_emo), -1) g = tf.nn.sigmoid( layer_norm(linear(self.dec_emo_spe, 256, False, False, scope="context_gate"), name="context_gate_ln")) self.dec_emo_spe = self.dec + g * outputs_emotion_ + ( 1 - g) * outputs_speaker_ self.dec_emo_spe = tf.layers.dropout( self.dec_emo_spe, #shape=(32, 50, 512), rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) # Final linear projection self.logits = tf.layers.dense(self.dec_emo_spe, len(en2idx)) #shape=(128, 50, 5124) self.preds = tf.to_int32(tf.arg_max( self.logits, dimension=-1)) #shape=(128, 50) self.istarget = tf.to_float(tf.not_equal(self.y, 0)) self.acc = tf.reduce_sum( 
tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / (tf.reduce_sum(self.istarget)) tf.summary.scalar('acc', self.acc) # if is_training: # Loss self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len(en2idx))) self.loss = tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) #shape=(256, 50) self.mean_loss = tf.reduce_sum( self.loss * self.istarget) / (tf.reduce_sum(self.istarget)) if is_training: tgt_emotion = label_smoothing( tf.one_hot(self.y_emotion, depth=7)) emotion_loss = tf.nn.softmax_cross_entropy_with_logits( logits=self.emotion_logits, labels=tgt_emotion) emotion_loss = tf.reduce_mean(emotion_loss) # Training Scheme self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( (1 - hp.alpha) * self.mean_loss + hp.alpha * emotion_loss, global_step=self.global_step) # Summary tf.summary.scalar('mean_loss', self.mean_loss + emotion_loss) self.merged = tf.summary.merge_all()
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    # de2idx, idx2de = load_doc_vocab()
    # self.vocab_size = len(de2idx)
    self.vocab_size = len(load_doc_vocab()[0])

    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.article_maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.summary_maxlen))

        # Define decoder inputs: shift the target right and prepend <S> (id 2).
        self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        self._add_encoder(is_training=is_training)
        self.ml_loss = self._add_ml_loss(is_training=is_training)

        if is_training:
            # Mixing weight between the RL and ML losses; trainable=False so the loss cannot update eta.
            self.eta = tf.Variable(initial_value=hp.eta_init, dtype=tf.float32, trainable=False, name='eta')
            self.update_eta = tf.assign(self.eta, self.eta + 0.1)

            self.rl_loss = self._add_rl_loss()
            self.loss = self.eta * self.rl_loss + (1 - self.eta) * self.ml_loss

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)

            grads_and_vars_mix = self.optimizer.compute_gradients(loss=self.loss)
            grads_and_vars_ml = self.optimizer.compute_gradients(loss=self.ml_loss)
            grad_mix, vars_mix = zip(*grads_and_vars_mix)  # split gradients and variables
            grad_ml, vars_ml = zip(*grads_and_vars_ml)     # split gradients and variables

            # Gradient clipping by global norm
            clipped_grad_mix, globle_norm_mix = tf.clip_by_global_norm(grad_mix, hp.maxgradient)
            clipped_grad_ml, globle_norm_ml = tf.clip_by_global_norm(grad_ml, hp.maxgradient)
            self.globle_norm_ml = globle_norm_ml

            self.train_op_mix = self.optimizer.apply_gradients(
                grads_and_vars=zip(clipped_grad_mix, vars_mix), global_step=self.global_step)
            self.train_op_ml = self.optimizer.apply_gradients(
                grads_and_vars=zip(clipped_grad_ml, vars_ml), global_step=self.global_step)
            '''
            # below: training without gradient clipping
            self.train_op_mix = self.optimizer.apply_gradients(grads_and_vars=grads_and_vars_mix,
                                                               global_step=self.global_step)
            self.train_op_ml = self.optimizer.apply_gradients(grads_and_vars=grads_and_vars_ml,
                                                              global_step=self.global_step)
            '''

            # Summary
            tf.summary.scalar('globle_norm_ml', globle_norm_ml)
            tf.summary.histogram(name='reward_diff', values=self.reward_diff)
            tf.summary.histogram(name='clipped_reward_diff', values=self.clipped_reward_diff)
            tf.summary.scalar('rl_loss', self.rl_loss)
            tf.summary.scalar('ml_loss', self.ml_loss)
            tf.summary.scalar('loss', self.loss)
            self.merged = tf.summary.merge_all()

            # Prepare a Saver that restores all variables other than eta.
            all_var = tf.get_collection(key=tf.GraphKeys.GLOBAL_VARIABLES)
            all_var.remove(self.eta)
            self.subset_saver = tf.train.Saver(var_list=all_var)

            self.filewriter = tf.summary.FileWriter(hp.tb_dir + '/train', self.graph)
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Load data
        self.x, self.y, self.num_batch = get_batch_data()  # (N, T)

        # Load vocabulary
        char2idx, idx2char = load_vocab()

        # Encoder
        ## Embedding
        enc = embedding(self.x, vocab_size=len(char2idx), num_units=hp.hidden_units,
                        scale=False, scope="enc_embed")

        # Encoder pre-net
        prenet_out = prenet(enc, num_units=[hp.hidden_units, hp.hidden_units // 2],
                            dropout_rate=hp.dropout_rate, is_training=is_training)  # (N, T, E/2)

        # Encoder CBHG
        ## Conv1D bank
        enc = conv1d_banks(prenet_out, K=hp.encoder_num_banks, num_units=hp.hidden_units // 2,
                           norm_type="ins", is_training=is_training)  # (N, T, K * E / 2)

        ### Max pooling
        enc = tf.layers.max_pooling1d(enc, 2, 1, padding="same")  # (N, T, K * E / 2)

        ### Conv1D projections
        enc = conv1d(enc, hp.hidden_units // 2, 3, scope="conv1d_1")  # (N, T, E/2)
        enc = normalize(enc, type="ins", is_training=is_training, activation_fn=tf.nn.relu)
        enc = conv1d(enc, hp.hidden_units // 2, 3, scope="conv1d_2")  # (N, T, E/2)
        enc += prenet_out  # (N, T, E/2) # residual connection

        ### Highway nets
        for i in range(hp.num_highwaynet_blocks):
            enc = highwaynet(enc, num_units=hp.hidden_units // 2,
                             scope='highwaynet_{}'.format(i))  # (N, T, E/2)

        ### Bidirectional GRU
        enc = gru(enc, hp.hidden_units // 2, True)  # (N, T, E)

        # Final linear projection
        self.logits = tf.layers.dense(enc, 2)  # 0 for non-space, 1 for space
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.x, 0))  # masking
        self.num_hits = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y)) * self.istarget)
        self.num_targets = tf.reduce_sum(self.istarget)
        self.acc = self.num_hits / self.num_targets

        if is_training:
            # Loss
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            # Training scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data( ) # shape=[batch_size, max_seq_len] else: self.x = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len)) self.y = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len)) # decoder_inputs '''decoder_inputs和self.y相比,去掉了最后一个句子结束符,而在每句话最前面加了一个初始化为2的id,即<S> ,代表开始。''' self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), axis=-1) # load_vocab de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # encoder with tf.variable_scope('encoder'): # input - word embedding self.enc = embedding(self.x, vocab_size=len(de2idx), d_model=hp.d_model, scale=True, scope='enc_embed') # input - positional encoding self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.max_seq_len, d_model=hp.d_model, zero_pad=False, scale=False, scope='enc_pe') # Dropout self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) # 3. num_layers multi-head attention for i in range(hp.num_layers): with tf.variable_scope('num_layers_{}'.format(i)): # multi head attention + Add and Norm self.enc = multihead_attention( queries=self.enc, keys=self.enc, d_model=hp.d_model, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) # feed forward + Add and Norm self.enc = feedforward( self.enc, dff=[4 * hp.d_model, hp.d_model]) # decoder with tf.variable_scope('decoder'): self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), d_model=hp.d_model, scale=True, scope='dec_embed') self.dec += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.max_seq_len, d_model=hp.d_model, zero_pad=False, scale=False, scope='dec_pe') self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) for i in range(hp.num_layers): with tf.variable_scope('num_layers_{}'.format(i)): # masked multi-head attention self.dec = multihead_attention( queries=self.dec, keys=self.dec, d_model=hp.d_model, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope='self-attention') # multi-head attention self.dec = multihead_attention( queries=self.dec, keys=self.enc, d_model=hp.d_model, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope='vanilla-attention') self.dec = feedforward( self.dec, dff=[4 * hp.d_model, hp.d_model ]) # shape=[batch_size, seq_len, d_model] # final linear projection self.logits = tf.layers.dense( self.dec, len(en2idx)) # shape=[batch_size, seq_len, target_vocab_size] self.preds = tf.to_int32(tf.arg_max( self.logits, dimension=-1)) # 预测值 shape=[batch_size, seq_len] self.istarget = tf.to_float(tf.not_equal( self.y, 0)) # 真实值 shape=[batch_size, seq_len] # pad 部分不参与准确率计算 self.acc = tf.reduce_sum( tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget) tf.summary.scalar('acc', self.acc) if is_training: # loss self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len(en2idx))) self.loss = tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) # pad 部分不参与损失计算 self.mean_loss = tf.reduce_sum( self.loss * self.istarget) / (tf.reduce_sum(self.istarget)) # training scheme self.global_step = tf.Variable(0, name='global_step', 
trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( self.mean_loss, global_step=self.global_step) # summary tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
        self.prob_c = tf.nn.softmax(self.logits_c)  # (N, T_q, vocab_size)
        self.prob_t = tf.nn.softmax(self.logits_t)  # (N, T_q, tw_vocab_size)
        self.prob_t = tf.einsum('nlt,tv->nlv', self.prob_t, self.tw_vocab_overlap)  # (N, T_q, vocab_size)
        self.prob = self.prob_c + self.prob_t * hp.penalty  # (N, T_q, vocab_size)
        self.preds = tf.to_int32(tf.argmax(self.prob, axis=-1))  # (N, T_q)


if __name__ == '__main__':
    # Load vocabulary
    token2idx, idx2token = load_de_en_vocab()
    tw2idx, idx2tw = load_tw_vocab()
    token2idx_len = len(token2idx)
    tw2idx_len = len(tw2idx)

    X, X_length, Y, YTWD, Y_DI, TW, num_batch = get_batch_data()

    # Construct graph
    g = Graph(True, token2idx_len, tw2idx_len, None)
    print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph, logdir=hp.logdir, save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
            loss = []
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data() else: # x: (32,10) y:(32,10) 一个batch32个句子,每个句子长度为10 self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) """ 定义decoder部分的input 假设真实翻译后的输出为 i am a student </S> decoder部分的input应为: <S> i am a student """ self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1) # 2代表<S>,是decoder的初始输入 # 词典 de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() with tf.variable_scope("encoder"): # Embedding self.enc = embedding( self.x, vocab_size=len(de2idx), num_units=hp.hidden_units, zero_pad=True, # 让padding一直是0 scale=True, scope="enc_embed") ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope='enc_pe') else: self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") ##Drop out self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### MultiHead Attention self.enc = multihead_attention( queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) with tf.variable_scope("decoder"): # Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") ## Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile( tf.expand_dims( tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") # Dropout self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec = multihead_attention( queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") ## Multihead Attention ( vanilla attention) self.dec = multihead_attention( queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward self.dec = feedforward( self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Final linear projection self.logits = tf.layers.dense(self.dec, len(en2idx)) self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1)) self.istarget = tf.to_float(tf.not_equal(self.y, 0)) self.acc = tf.reduce_sum( tf.to_float(tf.equal(self.preds, self.y)) * self.istarget / (tf.reduce_sum(self.istarget))) if is_training: # Loss # 将one_hot中的0改成了一个很小的数,1改成了一个比较接近于1的数。 self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len(en2idx))) self.loss = 
tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) self.mean_loss = tf.reduce_sum( self.loss * self.istarget) / (tf.reduce_sum(self.istarget)) self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( self.mean_loss, global_step=self.global_step) tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data() # (N, T) else: # inference self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) # define decoder inputs self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1) # 2:<S> # Load vocabulary de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # Encoder with tf.variable_scope("encoder"): ## Embedding self.enc = embedding(self.x, vocab_size=len(de2idx), num_units=hp.hidden_units, scale=True, scope="enc_embed") key_masks = tf.expand_dims( tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1) ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") else: self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") self.enc *= key_masks ## Dropout self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### Multihead Attention self.enc = multihead_attention( queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) ### Feed Forward self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Decoder with tf.variable_scope("decoder"): ## Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") key_masks = tf.expand_dims( tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1) ## Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile( tf.expand_dims( tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") self.dec *= key_masks ## Dropout self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec = multihead_attention( queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") ## Multihead Attention ( vanilla attention) self.dec = multihead_attention( queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward self.dec = feedforward( self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Final linear projection self.logits = tf.layers.dense(self.dec, len(en2idx)) self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) self.istarget = tf.to_float(tf.not_equal(self.y, 0)) self.acc = tf.reduce_sum( tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / (tf.reduce_sum(self.istarget)) tf.summary.scalar('acc', self.acc) if is_training: 
# Loss self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len(en2idx))) self.loss = tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) self.mean_loss = tf.reduce_sum( self.loss * self.istarget) / (tf.reduce_sum(self.istarget)) # Training Scheme self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( self.mean_loss, global_step=self.global_step) # Summary tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
print("Create preprocessed data.....") make_vocab(FLAGS.source_train, "input.vocab") make_vocab(FLAGS.target_train, "output.vocab") print("....Done\n") # Load vocabulary input2idx, idx2input = load_input_vocab() output2idx, idx2output = load_output_vocab() # Construct graph g = Graph("train") print("Graph loaded\n") print("Loading batch data.....") x, y, _ = get_batch_data() print(len(x)) print(len(y)) print("........Done") x = np.array(x) y = np.array(y) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(tf.global_variables()) batches = batch_iter_seq2seq(x, y, FLAGS.batch_size, FLAGS.num_epochs) print("num batches")