def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    if self.hp.io_tie and self.hp.ortho_embedding:
        lmb = self.hp.ortho_lambda
        normlevel = self.hp.ortho_reg_norm
        if not self.hp.fac_embed:
            real_embedding = self.embeddings[1:, :]
            if not (self.hp.norm_embedding or self.embedding_normalization):
                loss = loss + (tf.norm(
                    tf.subtract(
                        tf.matmul(tf.transpose(real_embedding), real_embedding),
                        tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                      tf.eye(self.hp.d_model))),
                    ord=normlevel) ** 2) * lmb
            else:
                wtw = tf.matmul(tf.transpose(real_embedding), real_embedding)
                wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag)) ** 2) * lmb
        else:
            loss = loss + (tf.norm(
                tf.subtract(
                    tf.matmul(self.embeddings2, tf.transpose(self.embeddings2)),
                    tf.eye(self.hp.d_embed)),
                ord=normlevel) ** 2) * lmb
            # loss = loss + tf.norm(tf.subtract(tf.matmul(tf.transpose(self.embeddings1), self.embeddings1),
            #                                   tf.eye(self.hp.d_embed)), ord=normlevel) * lmb  # if not good, delete this

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
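# The snippets in this file all rely on two helpers that are not shown here, noam_scheme and
# label_smoothing. The following is a minimal sketch of what such helpers typically look like
# in Transformer implementations of this style; the exact signatures and constants used in a
# given codebase may differ.

import tensorflow as tf

def noam_scheme(init_lr, global_step, warmup_steps=4000.):
    '''Noam learning-rate schedule: lr rises linearly for warmup_steps, then decays like step**-0.5.'''
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)

def label_smoothing(inputs, epsilon=0.1):
    '''Smooth one-hot targets: each 1 becomes (1 - epsilon) + epsilon/V, each 0 becomes epsilon/V.'''
    V = inputs.get_shape().as_list()[-1]  # vocabulary size (last dimension)
    return ((1 - epsilon) * inputs) + (epsilon / V)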
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1 = self.encode(xs)
    logits, y, sents2 = self.decode(xs, ys, memory)
    loss = self._calc_loss(y, logits)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):
    # forward
    loss_weight = ys[-1]
    ys = ys[:-1]
    memory, sents1 = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>

    a = ce * nonpadding
    print('loss_weight1.shape', loss_weight.shape)
    print('a.shape', a.shape)
    # re-weight the per-token loss by the per-example loss weight
    a = ce * nonpadding * (1 + loss_weight)
    b = nonpadding
    loss = tf.reduce_sum(a) / (tf.reduce_sum(b) + 1e-7)

    # token-level accuracy over non-padding positions (y and preds are integer id tensors)
    correct_prediction = tf.to_float(tf.equal(preds, y)) * nonpadding
    accuracy = tf.reduce_sum(correct_prediction) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.997, epsilon=1e-9)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("accuracy", accuracy)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):
    # Forward
    memory, sents1 = self.encode(xs)
    logits, preds, y, sent2 = self.decode(ys, memory)

    # Train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx['<PAD>']))
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('global_step', global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train_multi_gpu(self, xs, ys):
    tower_grads = []
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    loss, summaries = None, None

    # build one tower per GPU and share variables between towers
    with tf.variable_scope(tf.get_variable_scope()):
        for i, no in enumerate(self.hp.gpu_list):
            with tf.device("/gpu:%d" % no):
                with tf.name_scope("tower_%d" % no):
                    memory, sents1 = self.encode(xs)
                    logits, y, sents2 = self.decode(xs, ys, memory)
                    tf.get_variable_scope().reuse_variables()
                    loss = self._calc_loss(y, logits)
                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)

    # average the per-tower gradients on the CPU and apply them once
    with tf.device("/cpu:0"):
        grads = self.average_gradients(tower_grads)
        train_op = optimizer.apply_gradients(grads, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("train_loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
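# self.average_gradients is not defined in the snippet above. A minimal sketch of the usual
# tower-gradient averaging helper (in the style of the classic TensorFlow multi-GPU examples)
# is given below; the method actually used here may differ in detail, e.g. in how it handles
# None gradients.

def average_gradients(tower_grads):
    '''tower_grads: list (one entry per tower) of lists of (gradient, variable) pairs.
    Returns a single list of (averaged gradient, variable) pairs.'''
    average_grads = []
    for grad_and_vars in zip(*tower_grads):  # group the same variable across towers
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        v = grad_and_vars[0][1]  # the variable itself is shared across towers
        average_grads.append((grad, v))
    return average_grads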
def train(self, xs, ys, x_paraphrased_dict, synonym_label=None):
    # forward
    memory, sents1 = self.encode(xs)
    _, _, synonym_label_loss = self.labeling(synonym_label, memory)
    logits, preds, y, sents2 = self.decode(ys, x_paraphrased_dict, memory)

    # train scheme
    # generation loss
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # multi task loss
    tloss = self.hp.l_alpha * loss + (1.0 - self.hp.l_alpha) * synonym_label_loss

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(tloss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("tloss", tloss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs1, xs2, scores):
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.context.lr, global_step, self.context.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)

    gpus = get_available_gpus()
    if gpus:
        num_gpu = len(gpus)
        assert self.context.hparams.batch_size % num_gpu == 0
        xs1s, xs2s = tf.split(xs1, num_gpu, axis=0), tf.split(xs2, num_gpu, axis=0)
        scoress = tf.split(scores, num_gpu, axis=0)
        tower_grads = []
        losses = []
        with tf.variable_scope(tf.get_variable_scope()) as scope:
            list_predictions = []
            for i in range(num_gpu):
                with tf.device("/gpu:%d" % i):
                    with tf.name_scope("tower_%d" % i):
                        predictions = self._get_prediction(xs1s[i], xs2s[i])
                        list_predictions.append(predictions)
                        # square loss
                        partial_loss = tf.reduce_sum(
                            tf.squared_difference(predictions, scoress[i]), name="loss")
                        losses.append(partial_loss)
                        tf.get_variable_scope().reuse_variables()
                        grad = get_gradients_by_loss_and_optimizer(partial_loss, optimizer)
                        tower_grads.append(grad)
        predictions = tf.concat(list_predictions, axis=0)
        loss = tf.reduce_mean(losses)
        grads_and_vars = average_gradients(tower_grads)
    else:
        predictions = self._get_prediction(xs1, xs2)
        loss = tf.reduce_sum(tf.squared_difference(predictions, scores), name="loss")
        grads_and_vars = get_gradients_by_loss_and_optimizer(loss, optimizer)

    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    for g, v in grads_and_vars:
        tf.summary.histogram(v.name, v)
        tf.summary.histogram(v.name + '_grad', g)

    tf.summary.scalar("pred_avg", tf.reduce_mean(predictions))
    tf.summary.scalar("label_avg", tf.reduce_mean(scores))
    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, inputs, targets):
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self._context.lr, global_step, self._context.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)

    gpus = get_available_gpus()
    loss_func = self._loss_func_dict.get(self._context.loss_func, self._get_loss)
    if gpus:
        num_gpu = len(gpus)
        assert self._context.hparams.batch_size % num_gpu == 0
        # split every input tensor (and the targets) into one shard per GPU
        partial_inputs = [[] for _ in range(num_gpu)]
        for input_tmp in inputs:
            input_tmps = tf.split(input_tmp, num_gpu, axis=0)
            for i in range(num_gpu):
                partial_inputs[i].append(input_tmps[i])
        targetses = tf.split(targets, num_gpu, axis=0)
        tower_grads = []
        losses = []
        with tf.variable_scope(tf.get_variable_scope()) as scope:
            for i in range(num_gpu):
                with tf.device("/gpu:%d" % i):
                    with tf.name_scope("tower_%d" % i):
                        partial_loss = loss_func(partial_inputs[i], targetses[i])
                        losses.append(partial_loss)
                        tf.get_variable_scope().reuse_variables()
                        grad = get_gradients_by_loss_and_optimizer(partial_loss, optimizer)
                        tower_grads.append(grad)
        loss = tf.reduce_mean(losses)
        grads_and_vars = average_gradients(tower_grads)
    else:
        loss = tf.reduce_mean(loss_func(inputs, targets))
        grads_and_vars = get_gradients_by_loss_and_optimizer(loss, optimizer)

    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    for g, v in grads_and_vars:
        if g is None:  # variable has no gradient
            continue
        tf.summary.histogram(v.name, v)
        tf.summary.histogram(v.name + '_grad', g)

    tf.summary.scalar("pred_avg", tf.reduce_mean(self._outputs))
    tf.summary.scalar("infr_avg", tf.reduce_mean(self._inferences))
    tf.summary.scalar("label_avg", tf.reduce_mean(targets))
    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
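# get_available_gpus and get_gradients_by_loss_and_optimizer are external helpers used by the
# two snippets above. A minimal sketch of how such helpers are commonly written is given below
# (an assumption for illustration, not taken from this codebase).

from tensorflow.python.client import device_lib

def get_available_gpus():
    '''Return the names of the GPU devices visible to TensorFlow, e.g. ['/device:GPU:0'].'''
    local_devices = device_lib.list_local_devices()
    return [d.name for d in local_devices if d.device_type == 'GPU']

def get_gradients_by_loss_and_optimizer(loss, optimizer):
    '''Thin wrapper returning the (gradient, variable) pairs for one tower.'''
    return optimizer.compute_gradients(loss)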
def add_optimizer(self, global_step):
    # global_step must be passed to the optimizer so that it is incremented automatically.
    lr = noam_scheme(self.hp.learning_rate, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    self.train_op = optimizer.minimize(self.loss, global_step=global_step)
    return self.train_op
def __init__(self, hp):
    self.hp = hp
    self.token2idx, self.idx2token = load_vocab(hp.vocab)
    self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model, zero_pad=True)

    self.input_x = tf.placeholder(dtype=tf.int32, shape=(None, None), name="input_x")
    self.decoder_input = tf.placeholder(dtype=tf.int32, shape=(None, None), name="decoder_input")
    self.target = tf.placeholder(dtype=tf.int32, shape=(None, None), name="target")
    self.is_training = tf.placeholder(dtype=tf.bool, name="is_training")

    # encoder
    self.encoder_hidden = self.encode(self.input_x, training=self.is_training)
    # decoder
    self.logits = self.decode(self.decoder_input, self.encoder_hidden, training=self.is_training)
    self.y_hat = tf.to_int32(tf.argmax(self.logits, axis=-1), name="y_predict_v2")

    # loss
    self.smoothing_y = label_smoothing(tf.one_hot(self.target, depth=self.hp.vocab_size))
    self.ce_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.smoothing_y)
    nonpadding = tf.to_float(tf.not_equal(self.target, self.token2idx["<pad>"]))
    self.loss = tf.reduce_sum(self.ce_loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # optimize
    self.global_step = tf.train.get_or_create_global_step()
    self.lr = noam_scheme(self.hp.lr, self.global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)

    # tensorboard
    tf.summary.scalar('lr', self.lr)
    tf.summary.scalar("loss", self.loss)
    tf.summary.scalar("global_step", self.global_step)
    self.summaries = tf.summary.merge_all()

    # predict part
    self.y_predict = tf.identity(self.greedy_search(), name="y_predict")
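# get_token_embeddings is another helper assumed by the constructor above. A typical definition
# (a single weight matrix whose <pad> row is forced to zero) might look like the sketch below;
# the actual helper in this repository may differ, e.g. in scope name or initializer.

def get_token_embeddings(vocab_size, num_units, zero_pad=True):
    '''Create an embedding matrix of shape (vocab_size, num_units).
    If zero_pad is True, the first row (the <pad> token) is all zeros.'''
    with tf.variable_scope("shared_weight_matrix"):
        embeddings = tf.get_variable('weight_mat',
                                     dtype=tf.float32,
                                     shape=(vocab_size, num_units),
                                     initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            embeddings = tf.concat((tf.zeros(shape=[1, num_units]), embeddings[1:, :]), 0)
    return embeddings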
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1 = self.encode(xs)
    # memory = tf.Print(memory, [memory], message='memory =', summarize=10)
    logits, preds, y, sents2 = self.decode(ys, memory)
    # logits = tf.Print(logits, [logits], message='logits =', summarize=10)
    print('train logits.shape, y.shape =', logits.shape, y.shape)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    # logits = tf.Print(logits, [logits], message='logits =', summarize=10)
    # y_ = tf.Print(y_, [y_], message='y_ =', summarize=10)
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    # nonpadding = tf.Print(nonpadding, [nonpadding], message='nonpadding =', summarize=100)
    # nonpadding_print = tf.print('nonpadding =', tf.shape(nonpadding), summarize=20)
    # with tf.control_dependencies([nonpadding_print]):
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
    # loss = tf.Print(loss, [loss], message='loss =', summarize=10)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    # gradients = optimizer.compute_gradients(loss)
    # print_grad = tf.print('gradients =', gradients, summarize=10)
    # with tf.control_dependencies([print_grad]):
    #     clip_grads = [(tf.clip_by_value(grad, -100., 100.), var) for grad, var in gradients]
    #     train_op = optimizer.apply_gradients(clip_grads, global_step=global_step)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train_labeling(self, xs, synonym_label=None):
    # forward
    memory, sents1 = self.encode(xs)
    _, _, loss = self.labeling(synonym_label, memory)

    # train scheme
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):
    logits, y_, dec = self.encode_decode(xs, ys)
    ce = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y_)
    loss = tf.reduce_sum(ce)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    # y: (N, T2) token ids, e.g. [[5768 7128 7492 7128 7492 4501 7128 7128 14651], ...]
    # y_: (N, T2, vocab_size); each row is a smoothed one-hot vector like [0, 0.999, 0, ..., 0]
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    # cross entropy between the predictions and the labels
    # logits: unnormalized scores over the vocabulary, (N, T2, vocab_size)
    # ce: (N, T2), e.g. (4, 42); the per-token loss,
    #     e.g. [[6.83, 6.60, 6.55, ..., 9.60, 10.00], [6.85, 6.65, ...], ...]
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    # nonpadding: (N, T2), e.g. (4, 42); 1.0 for real tokens and 0.0 for <pad>,
    #     e.g. [[1., 1., 1., 0., 0., 0.], [1., 1., 1., 1., 1., 1., ...], ...]
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    # tf.reduce_sum sums over the given axis; with no axis it sums over all dimensions.
    # ce * nonpadding keeps only the loss of non-padding tokens; tf.reduce_sum(nonpadding)
    # counts those tokens, so the result is a per-token average.
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    # the learning rate changes dynamically with the training step
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    # define the optimizer
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):  # trains the model
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    # call encode() and decode() to obtain the outputs of each part
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    # one_hot marks each target token's index over the whole vocabulary, which builds the
    # training label: the goal is that, in the vocab_size-dimensional logits vector, the value
    # at the target token's index becomes as large as possible while all other positions stay
    # small. With outputs and labels constructed, tf.nn.softmax_cross_entropy_with_logits_v2()
    # is used for training.
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))  # label_smoothing smooths the one-hot targets
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    # Some handling is still needed before computing the loss: sentences shorter than maxlen
    # were padded at the start, so the error at those padded positions is zeroed out here.
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    # The loss is cross entropy, with the contribution of padding removed.
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # Adjust the learning rate with warmup: lr rises gradually at the start of training
    # and decays in later iterations.
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)  # the loss is minimized with the Adam optimizer

    # tf.summary.scalar() records values as key-value pairs for visualization in TensorBoard.
    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys): """ train model :param xs: dataset xs :param ys: dataset ys :return: loss train op global step tensorflow summary """ tower_grads = [] global_step = tf.train.get_or_create_global_step() global_step_ = global_step * self.hp.gpu_nums lr = noam_scheme(self.hp.d_model, global_step_, self.hp.warmup_steps) optimizer = tf.train.AdamOptimizer(lr) losses = [] xs, ys = split_input(xs, ys, self.hp.gpu_nums) with tf.variable_scope(tf.get_variable_scope()): for no in range(self.hp.gpu_nums): with tf.device("/gpu:%d" % no): with tf.name_scope("tower_%d" % no): memory_h, memory_u, sents1 = self.encode(xs[no]) logits, y, sents2 = self.decode( xs[no], ys[no], memory_h, memory_u) tf.get_variable_scope().reuse_variables() loss = self._calc_loss(y, logits) losses.append(loss) grads = optimizer.compute_gradients(loss) # print(grads) tower_grads.append(grads) with tf.device("/cpu:0"): grads = self.average_gradients(tower_grads) train_op = optimizer.apply_gradients(grads, global_step=global_step) loss = sum(losses) / len(losses) tf.summary.scalar('lr', lr) tf.summary.scalar("train_loss", loss) summaries = tf.summary.merge_all() return loss, train_op, global_step_, summaries
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # build the encoder and decoder
    memory, sents1 = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))  # batch_size * T * vocab_size
    # logits are not softmaxed here, because the function applies softmax internally
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    # 0: <pad>; acts as a mask so that padding positions of variable-length sequences
    # are excluded when the loss is computed
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))
    # average loss over the whole batch; 1e-7 guards against a zero denominator
    # ce * nonpadding computes the loss only over non-padding tokens
    # the denominator tf.reduce_sum(nonpadding) makes this a per-token average over the batch
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys, mode):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    mu, sigma = self.encoder_vae(xs, training=True, mode=mode)
    if mode == "TPAGE" or mode == "PPAGE":
        # both modes train the VAE
        # note to self: set all embedding training flags to True
        z = mu + sigma * tf.random_normal(tf.shape(mu), 0, 1, dtype=tf.float32)
    else:
        raise ValueError("invalid mode: %s" % mode)

    logits, preds, y, sents2 = self.decoder_vae(ys, z, training=True, mode=mode)

    # train scheme
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["[PAD]"]))  # 0: <pad>
    loss_decoder = tf.reduce_sum(ce * nonpadding) / tf.to_float(
        get_shape_list(xs[0], expected_rank=2)[0])

    # add the KL loss
    if mode == "TPAGE":
        KL_loss = tf.reduce_mean(0.5 * tf.reduce_sum(
            tf.square(mu) + tf.square(sigma) - tf.log(1e-8 + tf.square(sigma)) - 1, [1]))
    else:
        KL_loss = 0.0
    loss = loss_decoder + KL_loss

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    # monitor a random sample
    n = tf.random_uniform((), 0, tf.shape(preds)[0] - 1, tf.int32)
    print_demo = (xs[0][n], y[n], preds[n])

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("KL_loss", KL_loss)
    tf.summary.scalar("loss_decoder", loss_decoder)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries, print_demo
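# The KL_loss term above is the closed-form KL divergence between the approximate posterior
# N(mu, sigma^2) (sigma being the standard deviation) and the standard normal prior N(0, I),
# summed over latent dimensions and averaged over the batch:
#
#   KL(N(mu, sigma^2) || N(0, 1)) = 0.5 * sum_j (mu_j^2 + sigma_j^2 - log(sigma_j^2) - 1)
#
# which is exactly 0.5 * reduce_sum(square(mu) + square(sigma) - log(square(sigma)) - 1, axis=1),
# with the 1e-8 added only to keep the log numerically stable.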
def build(self):
    # truth vector
    self.T = tf.get_variable('truth_vector',
                             shape=[1, 1, self.item_embedding_dim],
                             dtype=tf.float32,
                             trainable=False)

    # embedding matrix
    self.user_embedding_layer = tf.get_variable(
        name='user_embedding_layer',
        shape=[self.num_users, self.user_embedding_dim],
        dtype=tf.float32)
    self.item_embedding_layer = tf.get_variable(
        name='item_embedding_layer',
        shape=[self.num_items, self.item_embedding_dim],
        dtype=tf.float32)

    # embedding
    self.user_emb_vec = tf.nn.embedding_lookup(self.user_embedding_layer, self.input_user)
    self.item_emb_vec = tf.nn.embedding_lookup(self.item_embedding_layer, self.input_items)
    self.target_emb_vec = tf.nn.embedding_lookup(self.item_embedding_layer, self.input_target)
    self.negative_sample_emb_vec = tf.nn.embedding_lookup(
        self.item_embedding_layer, self.input_negative_sample)

    # interaction
    self.encoder = interact_encoder(self.user_emb_vec, self.item_emb_vec,
                                    self.hidden1_dim, self.hidden2_dim,
                                    activation=self.activation,
                                    interact_type=self.interact_type)
    self.encoder_pos = interact_encoder(self.user_emb_vec, self.target_emb_vec,
                                        self.hidden1_dim, self.hidden2_dim,
                                        activation=self.activation,
                                        interact_type=self.interact_type)
    self.encoder_neg = interact_encoder(self.user_emb_vec, self.negative_sample_emb_vec,
                                        self.hidden1_dim, self.hidden2_dim,
                                        activation=self.activation,
                                        interact_type=self.interact_type)

    # NOT(*) operation
    feedback_to_oper = self.input_feedback_score[:, :, tf.newaxis] * tf.ones_like(self.encoder)
    applicable = tf.equal(feedback_to_oper, 1)
    encoder_to_oper = tf.where(applicable, self.encoder, tf.zeros_like(self.encoder))
    not_encoder = not_modules(encoder_to_oper, self.hidden1_dim, self.hidden2_dim,
                              activation=self.activation)
    self.not_encoder = tf.where(applicable, not_encoder, self.encoder)

    # OR(*) operation
    self.or_cell = OrMoudleCell(self.hidden1_dim, self.hidden2_dim)
    self.or_encoder, _ = tf.nn.dynamic_rnn(self.or_cell,
                                           self.not_encoder[:, 1:, :],
                                           initial_state=self.not_encoder[:, 0, :],
                                           dtype=tf.float32)
    self.or_encoder_last = self.or_encoder[:, -1, :]
    self.or_encoder_pos, _ = tf.nn.dynamic_rnn(self.or_cell,
                                               self.encoder_pos,
                                               initial_state=self.or_encoder_last,
                                               dtype=tf.float32)
    self.or_encoder_neg, _ = tf.nn.dynamic_rnn(self.or_cell,
                                               self.encoder_neg,
                                               initial_state=self.or_encoder_last,
                                               dtype=tf.float32)

    # cosine similarity
    self.probability_pos = cosine_probability(self.or_encoder_pos, self.T)
    self.probability_neg = cosine_probability(self.or_encoder_neg, self.T)

    # pair-wise loss
    self.traget_loss = -tf.reduce_sum(
        tf.log_sigmoid(self.probability_pos - self.probability_neg))

    # L2 loss
    trainable_variables = tf.trainable_variables()
    self.l2_loss = tf.reduce_sum([tf.nn.l2_loss(var) for var in trainable_variables])

    # model loss
    self.lnn_loss = self.traget_loss + self.l2_weight * self.l2_loss

    # logical regularizer loss
    event_space_vectors = [
        self.encoder, self.encoder_pos, self.encoder_neg, self.not_encoder,
        self.or_encoder, self.or_encoder_pos, self.or_encoder_neg
    ]
    event_space_vectors = tf.concat(event_space_vectors, axis=1)
    self.logical_loss = self.logical_regularizer(event_space_vectors)

    # sum
    self.loss = self.lnn_loss + self.logical_weight * self.logical_loss

    # Adam
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.learning_rate, global_step, self.warmup_steps)
    self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    # train
    self.train_op = self.optimizer.minimize(self.loss, global_step=global_step)

    # tensorboard scalar
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('traget_loss', self.traget_loss)
    tf.summary.scalar('l2_loss', self.l2_loss)
    tf.summary.scalar('logical_loss', self.logical_loss)
    tf.summary.scalar('lr', lr)
    tf.summary.scalar('global_step', global_step)
    self.summaries = tf.summary.merge_all()
def batch_split_train(self, xs, ys, split_num=4):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    # xs_split = tf.split(xs, gpu_num)
    # ys_split = tf.split(ys, gpu_num)
    # print(xs)
    # print(ys)
    # xs_split = []
    # ys_split = []
    # batchsize = self.hp.batch_size
    '''
    divided_batch_size = batchsize // split_num
    for i in range(split_num):
        start = divided_batch_size * i
        end = start + divided_batch_size
        xs_split.append((xs[0][start:end], xs[1][start:end], xs[2][start:end]))
        ys_split.append((ys[0][start:end], ys[1][start:end], ys[2][start:end], ys[3][start:end]))
    '''
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step // split_num, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    # models = []
    # for i in range(split_num):
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    if self.hp.io_tie and self.hp.ortho_embedding:
        lmb = self.hp.ortho_lambda
        normlevel = self.hp.ortho_reg_norm
        if not self.hp.fac_embed:
            real_embedding = self.embeddings[1:, :]
            if not (self.hp.norm_embedding or self.embedding_normalization):
                loss = loss + (tf.norm(
                    tf.subtract(
                        tf.matmul(tf.transpose(real_embedding), real_embedding),
                        tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                      tf.eye(self.hp.d_model))),
                    ord=normlevel) ** 2) * lmb
            else:
                wtw = tf.matmul(tf.transpose(real_embedding), real_embedding)
                wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag)) ** 2) * lmb
        else:
            loss = loss + (tf.norm(
                tf.subtract(
                    tf.matmul(self.embeddings2, tf.transpose(self.embeddings2)),
                    tf.eye(self.hp.d_embed)),
                ord=normlevel) ** 2) * lmb
            # loss = loss + tf.norm(tf.subtract(tf.matmul(tf.transpose(self.embeddings1), self.embeddings1),
            #                                   tf.eye(self.hp.d_embed)), ord=normlevel) * lmb  # if not good, delete this

    # accumulate gradients over split_num sub-batches, then apply the averaged gradients once
    grads = optimizer.compute_gradients(loss)
    self.steps.append((loss, grads))
    if len(self.steps) == split_num:
        tower_losses, tower_grads = zip(*self.steps)
        train_op = optimizer.apply_gradients(average_gradients(tower_grads),
                                             global_step=global_step)
        self.steps = []
    else:
        train_op = optimizer.apply_gradients([], global_step=global_step)
    # aver_loss = tf.reduce_mean(tower_losses)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)

    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def build(self):
    position_encoding_outputs = modules.position_encoding(self.x_input, args.position_size)
    if args.position_encoding_type == 'add':
        outputs = position_encoding_outputs + self.x_input
    if args.position_encoding_type == "concat":
        outputs = tf.concat([self.x_input, position_encoding_outputs], axis=2)

    # stack of 6 encoder blocks: multi-head self-attention + feed forward,
    # each wrapped with a residual connection
    for i in range(6):
        sublayer1 = modules.multi_head_attention(outputs, outputs, outputs,
                                                 args.head_num, args.head_size,
                                                 self.dropout, self.training,
                                                 type=args.attention_unit_type)
        self.mhas.append(sublayer1)
        outputs = modules.residual_connection(outputs, sublayer1, self.training)
        sublayer2 = modules.feed_forward(outputs, args.feed_forward_size,
                                         self.dropout, self.training)
        outputs = modules.residual_connection(outputs, sublayer2, self.training)

    outputs = tf.layers.dense(outputs, 1, use_bias=True, name='last_output')
    outputs = tf.squeeze(outputs, -1)  # (batch_size, seqlen)
    outputs = tf.layers.dense(outputs, args.nlabel, name='output_logit')
    self.logits = outputs
    self.logits_softmax = tf.nn.softmax(outputs, name='output_logit_softmax')

    if self.training is not None:
        util.params_usage(tf.trainable_variables())

    y = tf.one_hot(self.y_true, args.nlabel)
    self.y_smooth = modules.label_smoothing(y)
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y_smooth, logits=self.logits)
    self.loss = tf.reduce_mean(loss)

    self.global_step = tf.train.get_or_create_global_step()
    self.lr = modules.noam_scheme(args.eta, global_step=self.global_step, warmup_steps=args.warmup)
    train_op = tf.compat.v1.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=self.global_step)
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    self.train_op = tf.group([train_op, update_ops])

    self.y_pred = tf.argmax(self.logits_softmax, axis=1, name="y_pred")
    pred_prob = tf.equal(tf.cast(self.y_pred, tf.int32), self.y_true)
    self.accuracy = tf.reduce_mean(tf.cast(pred_prob, tf.float32), name="accuracy")

    tf.compat.v1.summary.scalar('accuracy', self.accuracy)
    tf.compat.v1.summary.scalar('loss', self.loss)
    tf.compat.v1.summary.scalar('learning rate', self.lr)
    self.merged_summary_op = tf.compat.v1.summary.merge_all()
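# modules.position_encoding is not shown above. A minimal sketch of the standard sinusoidal
# position encoding is given below as one plausible implementation; the module actually used
# may differ (for example, it might use learned position embeddings), and this sketch assumes
# the sequence length of the inputs is statically known.

import numpy as np

def position_encoding(inputs, num_units):
    '''Return a (batch, seq_len, num_units) tensor of sinusoidal position encodings
    matching the batch size and sequence length of `inputs`.'''
    seq_len = inputs.get_shape().as_list()[1]
    pos = np.arange(seq_len)[:, np.newaxis]                    # (seq_len, 1)
    i = np.arange(num_units)[np.newaxis, :]                    # (1, num_units)
    # PE[pos, 2i] = sin(pos / 10000^(2i/num_units)), PE[pos, 2i+1] = cos(...)
    angle = pos / np.power(10000.0, (i - i % 2) / num_units)   # (seq_len, num_units)
    pe = np.zeros((seq_len, num_units), dtype=np.float32)
    pe[:, 0::2] = np.sin(angle[:, 0::2])
    pe[:, 1::2] = np.cos(angle[:, 1::2])
    pe = tf.convert_to_tensor(pe)                              # (seq_len, num_units)
    batch_size = tf.shape(inputs)[0]
    return tf.tile(tf.expand_dims(pe, 0), [batch_size, 1, 1])  # (batch, seq_len, num_units)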