def getMLELoss(self, target, decoder_outputs, max_decoder_len):
    '''MLE loss.

    Builds both a reward-weighted sequence loss and the plain MLE
    sequence loss over the decoder outputs.

    Args:
        target: int target ids, [batch, max_decoder_len].
        decoder_outputs: decoder logits, [batch, max_decoder_len, vocab] —
            assumed shape; TODO confirm against the decoder.
        max_decoder_len: maximum decoder time dimension.

    Returns:
        (weighted_loss, loss): two scalar tensors.
    '''
    # Mask out padding positions past each sequence's true length.
    masks = tf.sequence_mask(lengths=self.dec_len, maxlen=max_decoder_len, dtype=tf.float32, name='masks')
    # Per-step rewards truncated to the decoder length, then exponentiated.
    weights = tf.slice(self.rewards, begin=[0, 0], size=[-1, max_decoder_len])
    weights = tf.exp(weights)
    # NOTE(review): the upper clip bound is reduce_min(weights) * 3 — if a
    # bound of 3x the *largest* weight was intended this should be
    # reduce_max; confirm against the training recipe.
    weights = tf.clip_by_value(weights, clip_value_min=1, clip_value_max=tf.reduce_min(weights) * 3)
    # weights=weights/tf.reduce_max(weights)
    weights = weights * masks
    # Per-token reward-weighted loss.
    weighted_loss = seq2seq.sequence_loss(logits=decoder_outputs, targets=target, weights=weights, average_across_timesteps=True, average_across_batch=True)
    # Plain MLE loss (mask-only weighting).
    loss = seq2seq.sequence_loss(logits=decoder_outputs, targets=target, weights=masks, average_across_timesteps=True, average_across_batch=True)
    return weighted_loss, loss
def training(self):
    """Build the per-output-head softmax losses (plus an optional ACT
    ponder penalty) and return the optimizer's train op."""
    softmax_loss_per_output = []
    # One sequence loss per output head: self.logits is a list of logit
    # tensors and self.target carries one target channel per head.
    for i in range(len(self.logits)):
        if self.seq_length is not None:
            # Weight timesteps with the precomputed numerical mask.
            softmax_loss = s2s.sequence_loss(self.logits[i], self.target[:, :, i], self._numerical_mask)
        else:
            # No lengths available: weight every timestep equally.
            softmax_loss = s2s.sequence_loss(
                self.logits[i], self.target[:, :, i],
                tf.ones_like(self.target[:, :, i], self.logits[i].dtype))
        softmax_loss_per_output.append(softmax_loss)
    if len(softmax_loss_per_output) == 1:
        self._softmax_loss = softmax_loss_per_output[0]
    else:
        self._softmax_loss = tf.add_n(softmax_loss_per_output)
    if isinstance(self.cell, ACTWrapper):
        # Adaptive Computation Time: penalize extra ponder steps.
        self._ponder_loss = self.time_penalty * self.cell.get_ponder_cost(
            self.seq_length)
        self._ponder_steps = self.cell.get_ponder_steps(self.seq_length)
        total_loss = self._softmax_loss + self._ponder_loss
    else:
        total_loss = self._softmax_loss
    return self.optimizer.minimize(total_loss)
def compute_loss(self, logits):
    """Masked sequence cross-entropy between `logits` and the iterator's
    target output, with a 'seq2seq-loss' scalar summary attached."""
    labels = self.iterator.target_output
    if self.time_major:
        labels = tf.transpose(labels)
    max_time = self.get_max_time(labels)
    # Zero out timesteps beyond each sequence's true length.
    mask = tf.sequence_mask(lengths=self.iterator.target_length, maxlen=max_time)
    if self.time_major:
        mask = tf.transpose(mask)
    mask = tf.cast(mask, tf.float32)
    _loss = seq2seq.sequence_loss(logits=logits, targets=labels, weights=mask)
    tf.summary.scalar(name='seq2seq-loss', tensor=_loss)
    return _loss
def add_loss_op(self, output): """Adds loss ops to the computational graph. Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss. Args: output: A tensor of shape (None, self.vocab) Actually according to the docs this should be of the shape below ([batch_size x sequence_length x logits] tensor) https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/sequence_loss But based on our result of the projection operation we should also try sequence_length x batch_size x logits Returns: loss: A 0-d tensor (scalar) """ ### YOUR CODE HERE print("calculating loss") with tf.variable_scope("loss_op") as scope: labels = tf.reshape(self.labels_placeholder, [self.config.num_steps, self.config.batch_size]) weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="weights") loss = sequence_loss(logits=output, targets=labels, weights=weights, name="sequence_loss") tf.summary.scalar("loss", loss) scope.reuse_variables() ### END YOUR CODE return loss
def __init__(self, args, training=True):
    """Bidirectional RNN language model.

    Builds embedding -> (optional dropout) -> stacked bidirectional RNN ->
    softmax projection, the sequence loss, and a clipped-Adam train op.

    Args:
        args: hyperparameter namespace (batch_size, seq_length, rnn_size,
            vocab_size, model, input/output keep probs, grad_clip, ...).
        training: when False, batch_size is forced to 1 (sampling mode)
            and dropout is disabled.
    """
    self.args = args
    if not training:
        args.batch_size = 1
    # Dropout only during training and only if some keep prob < 1.
    use_dropout = training and (args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0)
    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    embedding = self.create_var('input', 'embedding', [args.vocab_size, args.rnn_size])
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)
    if use_dropout:
        # NOTE(review): the embedded *inputs* are dropped using
        # output_keep_prob — confirm input_keep_prob wasn't intended here.
        inputs = tf.nn.dropout(inputs, args.output_keep_prob)
    cell_fn = self.select_cell_fn(args.model)
    cells_fw = self.create_cell_stack('hidden_fw', cell_fn, args, use_dropout=use_dropout)
    cells_bw = self.create_cell_stack('hidden_bw', cell_fn, args, use_dropout=use_dropout)
    self.cell_fw = rnn.MultiRNNCell(cells_fw, state_is_tuple=True)
    self.cell_bw = rnn.MultiRNNCell(cells_bw, state_is_tuple=True)
    self.initial_state = (self.cell_fw.zero_state(args.batch_size, tf.float32),
                          self.cell_bw.zero_state(args.batch_size, tf.float32))
    # Fixed-size batches: every sequence is full length.
    sequence_length = [args.seq_length]*args.batch_size
    outputs, self.final_state = tf.nn.bidirectional_dynamic_rnn(
        self.cell_fw, self.cell_bw, inputs, sequence_length,
        initial_state_fw=self.initial_state[0],
        initial_state_bw=self.initial_state[1])
    # bidi dynamic rnn does not concatenate fw and bw cells by default
    output = tf.concat(outputs, 2, name="concat_outputs")
    softmax_w = self.create_var('rnlm', 'softmax_w', [2*args.rnn_size, args.vocab_size])
    softmax_b = self.create_var('rnlm', 'softmax_b', [args.vocab_size])
    # Reshape/matmul/reshape sequence (einsum applies the projection per step)
    self.logits = tf.einsum("ijk,kl->ijl", output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    # Per-example loss (batch axis kept); uniform time weights.
    loss = seq2seq.sequence_loss(self.logits, self.targets,
                                 tf.ones([args.batch_size, args.seq_length]),
                                 average_across_batch=False)
    with tf.name_scope('cost'):
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.lr = tf.Variable(0.0, trainable=False)
    # apply clipping
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), args.grad_clip)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    # instrument tensorboard
    tf.summary.histogram('logits', self.logits)
    tf.summary.histogram('loss', loss)
    tf.summary.scalar('train_loss', self.cost)
def train(self):
    """Build training/inference logits, the masked sequence loss, and a
    value-clipped Adam train op; merges all summaries."""
    self.global_step = tf.train.get_or_create_global_step()
    training_logit, inference_logit = self.seq2seq_model \
        (self.text_length, self.targets, self.summary_length, self.max_summary_length, self.vocab_size)
    # create tensor for train_logit and inference_logit
    self.training_logits = tf.identity(training_logit.rnn_output, 'logits')
    self.inference_logits = tf.identity(inference_logit.sample_id, 'predictions')
    # create weights for sequence_loss: mask padding past each summary length
    masks = tf.sequence_mask(self.summary_length, self.max_summary_length, dtype=tf.float32, name='masks')
    with tf.variable_scope('optimization'):
        self.cost = seq2seq.sequence_loss(self.training_logits, self.targets, masks)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # Elementwise gradient clipping to [-5, 5].
        gradients = optimizer.compute_gradients(self.cost)
        cliped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        self.train_op = optimizer.apply_gradients(
            cliped_gradients, global_step=self.global_step)
    tf.summary.scalar('loss', self.cost)
    self.summary = tf.summary.merge_all()
def build_model(self):
    """Build the generator/discriminator graph.

    In 'GAN' mode wires the standard minimax GAN losses; otherwise builds a
    supervised sequence loss for the generator and a binary cross-entropy
    loss for the discriminator.

    NOTE(review): this method reads a module-level `args` in places instead
    of self.args — confirm both refer to the same namespace.
    """
    get_sample = tf.make_template('gen', self.Generate)
    get_disc = tf.make_template('disc', self.Discriminate)
    self.weight = tf.get_variable('weight_gen', initializer=tf.truncated_normal([args.rnn_size, args.vocab_size], stddev=0.1), dtype=tf.float32)
    self.bias = tf.get_variable('bias_gen', initializer=tf.constant(0.1, shape=[args.vocab_size]), dtype=tf.float32, trainable=False)
    self.gen_lr = self.args.gen_learning_rate
    #self.target = tf.placeholder(tf.float32,[args.batch_size,1])
    self.W1 = tf.get_variable(initializer=tf.random_normal([args.dis_seq_length, args.fc_hidden], stddev=0.35, dtype=tf.float64), name='disc_W1')
    # Bug fix: tf.Variable takes its initial value positionally; it has no
    # 'initializer' keyword (that belongs to tf.get_variable).
    self.W2 = tf.Variable(tf.random_normal([args.fc_hidden, 1], stddev=0.2, dtype=tf.float64), name='disc_W2')
    self.b1 = tf.Variable(tf.random_normal([args.batch_size, 1], stddev=0.2, dtype=tf.float64), name='disc_b1', trainable=False)
    self.b2 = tf.Variable(tf.random_normal([args.batch_size, 1], stddev=0.2, dtype=tf.float64), name='disc_b2', trainable=False)
    self.dis_lr = self.args.disc_learning_rate
    if self.flag == 'GAN':
        self.fake_data = get_sample()
        #self.loop = tf.make_template('gen', self.loop)
        self.D_fake, self.D_logit_fake = get_disc('fake')
        self.D_real, self.D_logit_real = get_disc('real')
        # Discriminator maximizes log D(real) + log(1 - D(fake)).
        self.D_loss = -tf.reduce_mean(tf.log(self.D_real) + tf.log(1 - self.D_fake))
        # Bug fix: the generator should maximize log D(fake) (non-saturating
        # GAN loss); the previous -log(1 - D_fake) optimized the wrong
        # direction.
        self.G_loss = -tf.reduce_mean(tf.log(self.D_fake))
        # Bug fix: the variable lists were swapped — the discriminator loss
        # trains the 'disc' scope and the generator loss trains 'gen'.
        self.Dtvars = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='disc')]
        self.Gtvars = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='gen')]
    else:
        logits = get_sample(self.flag)
        # Bug fix: tf.concat requires an axis; concatenate along time so the
        # first 16 steps get weight 1 and the remaining 34 are ignored.
        weights = tf.concat([tf.ones([self.args.batch_size, 16]), tf.zeros([self.args.batch_size, 34])], axis=1)
        self.G_Loss = seq2seq.sequence_loss(logits, self.response, weights, average_across_timesteps=True, average_across_batch=True, name='Sequence_loss')
        # Bug fix: 'flag' was an undefined name; the mode string lives on self.
        pred, _ = get_disc(self.flag)
        # Binary cross-entropy for the discriminator.
        self.D_Loss = -tf.reduce_mean(self.target * tf.log(pred) +
                                      (1. - self.target) * tf.log(1. - pred))
def add_loss_op(self, output): """Adds loss ops to the computational graph. Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss. Args: output: A tensor of shape (None, self.vocab) Returns: loss: A 0-d tensor (scalar) """ ### YOUR CODE HERE b_size = self.config.batch_size n_steps = self.config.num_steps targets = [tf.reshape(self.labels_placeholder, [-1])] weights = [tf.ones([b_size*n_steps])] print "\n\nLoss Op: " print "logits ", len(output), " - ", output[0].shape t = tf.reshape(self.labels_placeholder, [b_size, n_steps]) print "labels ", t #print "weights ", w = tf.ones([b_size, n_steps]) print "weights ", w f = tf.reshape(output, [b_size, n_steps, len(self.vocab)]) print "reshaped ", f s2s_loss = sequence_loss(logits=f, targets=t, weights=w) self.sMax = tf.nn.softmax(f) print "smax ", self.sMax tf.add_to_collection('total_loss', s2s_loss) loss = s2s_loss print loss #raise NotImplementedError ### END YOUR CODE return loss
def _init_optimizer(self):
    """Build the sequence loss and two alternative training paths."""
    # Time-major decoder tensors -> batch-major for sequence_loss.
    logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
    targets = tf.transpose(self.decoder_train_targets, [1, 0])
    # NOTE(review): duplicates of the locals above, exposed as attributes.
    self.logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
    self.targets = tf.transpose(self.decoder_train_targets, [1, 0])
    self.loss = seq2seq.sequence_loss(logits=logits, targets=targets, weights=self.loss_weights)
    opt = tf.train.AdamOptimizer()
    self.train_op = opt.minimize(self.loss)
    # add
    # NOTE(review): a second, gradient-clipped update path is built below;
    # running both train_op and updates in one step would apply gradients
    # twice — confirm which path the training loop actually uses.
    params = tf.trainable_variables()
    self.gradient_norms = []
    self.updates = []
    gradients = tf.gradients(self.loss, params)
    clipped_gradients, norm = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
    self.gradient_norms.append(norm)
    self.updates.append(opt.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step))
    self.saver = tf.train.Saver(tf.global_variables())
def add_loss_op(self, output):
    """Attach the tag-sequence cross-entropy to the 'total loss' collection
    and return the summed collection loss."""
    seq_xent = sequence_loss(logits=output,
                             targets=self.inputs_placeholder_dict['tags'],
                             weights=self.weight)
    tf.add_to_collection('total loss', seq_xent)
    return tf.add_n(tf.get_collection('total loss'))
def build_model(self):
    """Encoder-decoder over questions with CTC greedy decoding and a
    sampled-softmax sequence loss against the subject targets."""
    questions = tf.nn.embedding_lookup(self.embedding, self.question_in)
    encoder_outputs, encoder_state = self.encoder(questions)
    print("encoder_outputs", encoder_outputs.get_shape())
    # Zero decoder inputs (the decoder is driven by the encoder state).
    decoder_inputs = tf.zeros(shape=[tf.shape(encoder_outputs)[0], self.subject_len, self.hidden_dim])
    decoder_outputs, decoder_state, decoder_context = self.decoder(encoder_state=encoder_state, inputs=decoder_inputs)
    # Bug fix: this attribute previously stored decoder_inputs (the all-zeros
    # tensor) instead of the actual decoder outputs. The misspelled
    # attribute name is kept for compatibility with existing callers.
    self.decoder_outptus = decoder_outputs
    print("decoder_outputs:", decoder_outputs.get_shape())
    self.logits, _, _ = self.decoder(encoder_state, inputs=encoder_outputs, is_train=False)
    # Greedy (best-path) CTC decoding over time-major logits.
    self.output = tf.nn.ctc_greedy_decoder(tf.transpose(self.logits, perm=[1, 0, 2]), sequence_length=self.seq_len, merge_repeated=False)
    # Weight only non-padding (index > 0) target positions.
    weights = tf.cast(tf.greater(self.subject_in, 0), tf.float32)

    def softmax_loss_function(inputs, labels):
        '''Sampled-softmax loss, invoked per timestep by seq2seq.sequence_loss.'''
        print("inputs:", inputs.get_shape())
        print("labels:", labels.get_shape())
        labels = tf.expand_dims(labels, 1)
        loss = tf.nn.sampled_softmax_loss(self.dec_embedding, self.dec_bias, labels=labels, inputs=inputs, num_sampled=100, num_classes=self.vocab_size)
        return loss
    # sequence_loss arguments:
    #   logits:  [batch, time, dim] decoder states, projected implicitly by
    #            the sampled softmax above
    #   targets: [batch, time] int target indices
    #   weights: [batch, time] float mask
    self.loss = seq2seq.sequence_loss(logits=decoder_outputs, targets=self.subject_in, weights=weights, softmax_loss_function=softmax_loss_function)
    self.opt = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
def _build_loss(self, logits, target, target_length):
    """Masked sequence cross-entropy: positions past each target length
    (up to self.max_len) get zero weight."""
    with tf.variable_scope('loss'):
        padding_mask = tf.sequence_mask(lengths=target_length,
                                        maxlen=self.max_len,
                                        dtype=tf.float32)
        return seq2seq.sequence_loss(logits, target, padding_mask)
def recon_loss(self):
    """Reconstruction loss: weighted sequence cross-entropy between the
    model logits and the training targets."""
    xent = seq2seq.sequence_loss(
        self.logits,
        self.train_targets,  # * max sequence length?
        self.loss_weights,
        name="reconstruction_loss",
    )
    return xent
def _init_optimizer(self):
    """Sequence loss over batch-major decoder outputs, minimized with Adam."""
    # sequence_loss wants batch-major tensors; the decoder emits time-major.
    batch_major_logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
    batch_major_targets = tf.transpose(self.decoder_train_targets, [1, 0])
    self.loss = seq2seq.sequence_loss(logits=batch_major_logits,
                                      targets=batch_major_targets,
                                      weights=self.loss_weights)
    self.train_op = tf.train.AdamOptimizer().minimize(self.loss)
def build_training_model(self):
    """Encode every input stream, decode the target against the cached
    encoder states, and return the masked sequence loss."""
    self.def_placeholder_and_components()
    emb_out = []
    enc_h_out = []
    past_for_decoder = []
    for i in range(0, self.input_num):
        past_length = 0
        # Token + positional embeddings for input stream i.
        h = tf.gather(self.wte, self.inputs[i]) + tf.gather(
            self.wpe, positions_for(self.inputs[i], past_length))
        emb_out.append(h)
        presents, h_enc = self.encoder.encode(h, self.input_lens[i])
        enc_h_out.append(h_enc)
        past_for_decoder.append(presents)
    all_logits = self.decoder.decode_all(tokens=self.target_in,
                                         past_list=past_for_decoder,
                                         enc_h_list=enc_h_out)['logits']
    with tf.name_scope('loss'):
        batch_max_seq_len = tf.shape(self.target_in)[1]
        # Mask padding beyond each target's true length.
        target_mask = tf.sequence_mask(self.target_len, maxlen=batch_max_seq_len, dtype=tf.float32)
        cost = sequence_loss(logits=all_logits, targets=self.target_out, weights=target_mask)
    return cost
def _init_optimizer(self):
    """Sequence loss and Adam train op (with an unused clipping branch)."""
    # Time-major -> batch-major for sequence_loss.
    logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
    targets = tf.transpose(self.decoder_train_targets, [1, 0])
    # Loss function
    self.loss = seq2seq.sequence_loss(logits=logits, targets=targets, weights=self.loss_weights)
    # Gradient clipping
    self.lr = tf.Variable(0.0, trainable=False, name='lr')
    # All variables marked as trainable
    tvars = tf.trainable_variables()
    # Clip the gradients of all those variables by global norm
    # (contrib.keras.backend.gradients / gradients of variables)
    # NOTE(review): 'grads' is computed but never applied — the train_op
    # below uses a fresh AdamOptimizer().minimize(), so clipping (and
    # self.lr) currently has no effect.
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.max_grad_norm)
    #optimizer = tf.train.AdamOptimizer(self.lr)
    #self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.train_op = tf.train.AdamOptimizer().minimize(self.loss)
def _build_loss(self, config):
    """Flat softmax cross-entropy for '*flat' models, otherwise a masked
    per-token sequence loss; also builds the Adam train op."""
    # cost/evaluate/train
    if config.model_name.endswith("flat"):
        self.prob = tf.nn.softmax(self.logits)
        self.losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.y_flat)
        # TODO self.y at multi-labels input
        self.loss = tf.reduce_mean(self.losses)
    else:
        # Mask padding past each sequence's length.
        self.weights = tf.sequence_mask(self.y_seq_length, dtype=tf.float32)
        # epsilon = tf.constant(value=0.00001)
        self.check = self.logits
        # self.logits = tf.clip_by_value(self.logits, -1.0, 1.0)
        #softmax = tf.nn.softmax(logits)
        #cross_entropy = -tf.reduce_sum(labels * tf.log(softmax), reduction_indices=[1])
        # Keep the per-token loss tensor un-averaged, then take one mean.
        self.losses = sequence_loss(logits=self.logits, targets=self.y_seq,
                                    weights=self.weights,
                                    average_across_timesteps=False,
                                    average_across_batch=False)
        self.loss = tf.reduce_mean(self.losses)
    tf.summary.scalar(self.loss.op.name, self.loss)
    # TODO process compute_gradients() and apply_gradients() separetely
    self.train_op = tf.train.AdamOptimizer(
        learning_rate=config.learning_rate).minimize(
            self.loss, global_step=self.global_step)
def _build_loss(self):
    """Sequence loss over batch-major training logits/targets; the same
    tensor is exposed as both `loss` and `unreg_loss`."""
    # Transpose from time-major to batch-major.
    self.train_logits_seq = tf.transpose(self.train_logits, perm=[1, 0, 2])
    self.train_targets_seq = tf.transpose(self.train_targets, perm=[1, 0])
    self.loss = seq2seq.sequence_loss(
        logits=self.train_logits_seq,
        targets=self.train_targets_seq,
        weights=self.loss_weights)
    self.unreg_loss = self.loss
def build_model(self, int_to_vocab, rnn_size, rnn_layer_count, summary_output_dir):
    """Build the word-RNN training graph: inputs, RNN, loss, clipped Adam.

    Args:
        int_to_vocab: id -> token mapping; its length is the vocab size.
        rnn_size: hidden units per RNN layer.
        rnn_layer_count: number of stacked RNN layers.
        summary_output_dir: directory for TensorBoard summaries.
    """
    self.train_graph = tf.Graph()
    with self.train_graph.as_default():
        vocab_size = len(int_to_vocab)
        self.input_text, self.targets, self.lr = self.get_inputs()
        input_data_shape = tf.shape(self.input_text)
        cell, self.initial_state = self.get_init_cell(
            input_data_shape[0], rnn_size, layer_count=rnn_layer_count)
        logits, self.final_state = self.build_nn(cell, rnn_size, self.input_text, vocab_size)
        # Probabilities for generating words
        probs = tf.nn.softmax(logits, name='probs')
        # Loss function: uniform weights — every position counts equally.
        self.cost = seq2seq.sequence_loss(
            logits, self.targets,
            tf.ones([input_data_shape[0], input_data_shape[1]]))
        tf.summary.scalar('train_loss', self.cost)
        # Optimizer
        optimizer = tf.train.AdamOptimizer(self.lr)
        # Gradient Clipping (elementwise to [-1, 1])
        gradients = optimizer.compute_gradients(self.cost)
        capped_gradients = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gradients]
        self.train_op = optimizer.apply_gradients(capped_gradients)
        self.merged_summaries = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(summary_output_dir, graph=self.train_graph)
def build_decoder(self):
    """Gumbel-softmax decoder that reconstructs the encoder inputs; builds
    the reconstruction loss, accuracy, summaries, and the optimizer."""
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()
        # NOTE(review): unused — BasicDecoder below is built without it
        # (output_layer argument commented out).
        output_layer = Dense(self.num_symbols, name='output_projection')
        start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token
        end_token = data_utils.end_token
        # Differentiable sampling with temperature tau.
        helper = GumbelSoftmaxEmbeddingHelper(embedding=self.embeddings, start_tokens=start_tokens, end_token=end_token, tau=self.tau)
        max_decoder_length = tf.reduce_max(self.encoder_inputs_length)
        decoder = tf.contrib.seq2seq.BasicDecoder(cell=self.decoder_cell, helper=helper, initial_state=self.decoder_initial_state)#, output_layer=output_layer)
        (self.decoder_outputs_train, self.decoder_last_state_train,
         self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_decoder_length, impute_finished=True))
        self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output)#IMPORTANT
        self.decoder_pred_decode = tf.argmax(self.decoder_outputs_train.sample_id, axis=-1, output_type=tf.int32)#IMPORTANT
        #newintput = data_utils.insertSequence(self.decoder_pred_decode.eval(), self.encoder_inputs.eval(),1, self.total_num)
        '''
        _loss = 0
        for i in range(self.detector.batch_size):
            source, source_len = data_utils.prepare_batch(newintput[i:i*self.detector.batch_size], self.detector.stride, self.detector.maxlen, self.detector.batch_size)
            _, logits = self.detector.predict(self.sess, source, source_len)
            _loss += logits[0] - logits[1]
        '''
        # Token-level agreement between the reconstruction and the input.
        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.encoder_inputs, self.decoder_pred_decode), self.dtype))
        masks = tf.sequence_mask(lengths=self.encoder_inputs_length, maxlen=max_decoder_length, dtype=self.dtype, name='masks')
        # Autoencoding objective: targets are the encoder inputs themselves.
        self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, targets=self.encoder_inputs, weights=masks)
        #self.loss = _loss + np.sum(self.decoder_pred_decode**masks**2)/np.sum(masks)/2
        tf.summary.scalar('loss', self.loss)
        self.init_optimizer()
def _define_loss(self, sampled_softmax):
    """VAE objective: sampled-softmax reconstruction loss plus the annealed
    KL divergence of the latent Gaussian; also builds the Adam op.

    Args:
        sampled_softmax: softmax_loss_function passed through to
            seq2seq.sequence_loss.
    """
    self.loss_reconstruct = seq2seq.sequence_loss(
        logits=self.decoder_outputs_train,
        targets=self.decoder_targets,
        weights=self.decoder_weights,
        softmax_loss_function=sampled_softmax,
        average_across_timesteps=True,
        average_across_batch=True)
    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior.
    self.KL = tf.reduce_mean(-0.5 * tf.reduce_sum(1 + self.encoder_state_logsigma
                                                  - tf.pow(self.encoder_state_mu, 2)
                                                  - tf.exp(self.encoder_state_logsigma), axis=1))
    # The annealing term ramps the KL in gradually (mitigates posterior collapse).
    self.loss = tf.add(self.annealing_term * self.KL, self.loss_reconstruct)
    # Keep track of the cost
    tf.summary.scalar('loss_reconstruct', self.loss_reconstruct)
    tf.summary.scalar('KL', self.KL)
    tf.summary.scalar('loss', self.loss)
    opt = tf.train.AdamOptimizer(
        learning_rate=self.args.learning_rate,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08
    )
    self.opt_op = opt.minimize(self.loss)
def build_training_graph(self, input, input_len, target, target_mask=None):
    """One forward pass of the LM over `input` and the masked sequence loss
    against `target`.

    Args:
        input: token ids, [batch, time].
        input_len: true lengths; used to build the mask if none is given.
        target: target token ids, [batch, time].
        target_mask: optional float mask; defaults to sequence_mask(input_len).

    Returns:
        Scalar loss tensor.
    """
    batch_max_seq_len = tf.shape(input)[1]

    def step(hparams, tokens, past=None):
        # Single transformer step: returns logits plus the present K/V cache.
        lm_output = model.model(hparams=hparams, X=tokens, past=past, reuse=tf.AUTO_REUSE)
        logits = lm_output['logits']
        presents = lm_output['present']
        presents.set_shape(
            model.past_shape(hparams=hparams, batch_size=None))
        return {
            'logits': logits,
            'presents': presents,
        }

    with tf.name_scope('sample_sequence'):
        all_logits = step(hparams=self.hparams, tokens=input)['logits']
    with tf.name_scope('loss'):
        if target_mask is None:
            target_mask = tf.sequence_mask(input_len, maxlen=batch_max_seq_len, dtype=tf.float32)
        cost = sequence_loss(logits=all_logits, targets=target, weights=target_mask)
    return cost
def add_loss_op(self, output): """Adds loss ops to the computational graph. Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss. Args: output: A tensor of shape (None, self.vocab) Returns: loss: A 0-d tensor (scalar) """ ### YOUR CODE HERE logits = output e = tf.expand_dims(input=output, axis=0) print(e) targets = self.labels_placeholder f = tf.expand_dims(input=tf.reshape(targets, [-1]), axis=0) print(targets) weights = tf.ones((self.config.batch_size * self.config.num_steps)) print(weights) g = tf.expand_dims(input=weights, axis=0) loss = sequence_loss(e, f, g) # all_ones = [tf.ones([self.config.batch_size * self.config.num_steps])] # a = tf.reshape(self.labels_placeholder, [-1]) # b = tf.ones((self.config.batch_size * self.config.num_steps)) # d = [output] # c = sequence_loss(logits=output, targets=[a], weights=b) # cross_entropy = sequence_loss( # [output], [tf.reshape(self.labels_placeholder, [-1])], all_ones, len(self.vocab)) # tf.add_to_collection('total_loss', cross_entropy) # loss = tf.add_n(tf.get_collection('total_loss')) ### END YOUR CODE return loss
def seq_net(name, inputs, targets, sl, n_items, n_cates, cate_list, u_emb, rank, is_training, reuse):
    """Teacher-forced decoder over `inputs` with a per-example sequence loss
    against `targets`.

    Args:
        name: variable-scope prefix (suffix '-rnn' is appended).
        inputs: decoder input embeddings, [batch, time, dim] — assumed; TODO confirm.
        targets: target item ids, [batch, time].
        sl: per-example sequence lengths, [batch].
        n_items, n_cates, cate_list: sizes/lookup for the output projection.
        u_emb: user embedding fed into the decoder cell's initial state.
        rank: decoder cell size parameter (passed to build_decoder_cell).
        is_training, reuse: currently unused here — TODO confirm intent.

    Returns:
        (per-example loss [batch], shape of logits, shape of targets).
    """
    with tf.variable_scope(name+'-rnn'):
        # NOTE(review): this Dense takes extra positional arguments, so it is
        # a project-local projection layer, not tf.layers.Dense.
        output_layer = Dense(n_items+2, n_cates+1, cate_list, activation=None, name='output_projection')
        training_helper = seq2seq.TrainingHelper(
            inputs=inputs, sequence_length=sl, time_major=False)
        cell, initial_state = build_decoder_cell(rank, u_emb, tf.shape(inputs)[0])
        training_decoder = seq2seq.BasicDecoder(
            cell=cell, helper=training_helper,
            initial_state=initial_state, output_layer=output_layer)
        max_decoder_length = tf.reduce_max(sl)
        output, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder, output_time_major=False,
            impute_finished=True, maximum_iterations=max_decoder_length)
        output = tf.identity(output.rnn_output)
        # Mask padding past each sequence length.
        mask = tf.sequence_mask(
            lengths=sl, maxlen=max_decoder_length, dtype=tf.float32)
        # Average over time but keep the per-example loss vector.
        loss = seq2seq.sequence_loss(
            logits=output, targets=targets, weights=mask,
            average_across_timesteps=True, average_across_batch=False)
    return loss, tf.shape(output), tf.shape(targets)
def _build_loss(self):
    """Masked sequence loss using sampled softmax over a large vocabulary."""
    # NOTE(review): 'config' is assigned but not used in this method.
    config = self.config

    def sampled_loss(labels, inputs):
        # Per-timestep sampled-softmax loss, invoked by sequence_loss.
        labels = tf.reshape(labels, [-1, 1])
        # We need to compute the sampled_softmax_loss using 32bit floats to
        # avoid numerical instabilities.
        local_w_t = tf.cast(tf.transpose(self.output_projection[0]), tf.float32)
        local_b = tf.cast(self.output_projection[1], tf.float32)
        local_inputs = tf.cast(inputs, tf.float32)
        return tf.cast(
            tf.nn.sampled_softmax_loss(weights=local_w_t,
                                       biases=local_b,
                                       labels=labels,
                                       inputs=local_inputs,
                                       num_sampled=self.vocab_size // 10,
                                       num_classes=self.vocab_size),
            tf.float32)

    self.loss = seq2seq.sequence_loss(logits=self.decoder_logits,
                                      targets=self.decoder_train_targets,
                                      weights=tf.sequence_mask(
                                          self.x_length, tf.shape(self.x)[1],
                                          dtype=tf.float32, name='masks'),
                                      softmax_loss_function=sampled_loss,
                                      name='loss')
    tf.summary.scalar(self.loss.op.name, self.loss)
    tf.add_to_collection('ema/scalar', self.loss)
def _compute_loss(logits, target_output, target_weights, batch_size):
    """Compute optimization loss.

    logits has shape [batch_size, num_steps, num_classes]; the leading
    token of target_output is dropped so logits align with the *next*
    token at each step. (batch_size is accepted for signature
    compatibility but unused; an earlier idea —
    tf.logical_and(target_weights, new_weight) — was never implemented.)
    """
    shifted_targets = target_output[:, 1:]
    return sequence_loss(logits=logits,
                         targets=shifted_targets,
                         weights=target_weights)
def build_train_decoder(self):
    """Build the training decoder (optionally with scheduled sampling),
    the masked sequence loss, summaries, and the optimizer."""
    print('Building train decoder...')
    # Shift the targets right and prepend <GO> to form decoder inputs.
    ending = tf.strided_slice(self.decoder_targets, [0, 0], [self.batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([self.batch_size, 1], self.word_to_id['<GO>']), ending], 1)
    decoder_inputs_embedded = tf.nn.embedding_lookup(self.embedding, decoder_input)
    if self.teacher_forcing:
        # Scheduled sampling: feed back model samples with the configured probability.
        training_helper = ScheduledEmbeddingTrainingHelper(
            inputs=decoder_inputs_embedded,
            sequence_length=self.decoder_targets_length,
            embedding=self.embedding,
            sampling_probability=self.teacher_forcing_probability,
            time_major=False,
            name='teacher_forcing_training_helper'
        )
    else:
        training_helper = TrainingHelper(
            inputs=decoder_inputs_embedded,
            sequence_length=self.decoder_targets_length,
            time_major=False,
            name='training_helper'
        )
    training_decoder = BasicDecoder(
        cell=self.decoder_cell,
        helper=training_helper,
        initial_state=self.decoder_initial_state,
        output_layer=self.output_layer
    )
    decoder_outputs, _, _ = dynamic_decode(
        decoder=training_decoder,
        impute_finished=True,
        maximum_iterations=self.max_target_sequence_length
    )
    self.decoder_logits_train = tf.identity(decoder_outputs.rnn_output)
    # loss
    #This is the weighted cross-entropy loss for a sequence of logits.
    #Param:
    #logits: [batch_size, sequence_length, num_decoder_symbols].
    #   The logits is the prediction across all classes at each timestep.
    #targets: [batch_size, sequence_length], representing true class at each time step
    #weights: [batch_size, sequence_length], This is the weighting of each prediction in the sequence.
    self.loss = sequence_loss(
        logits=self.decoder_logits_train,
        targets=self.decoder_targets,
        weights=self.mask
    )
    # summary
    tf.summary.scalar('loss', self.loss)  #Outputs a Summary protocol buffer containing a single scalar value.
    self.summary_op = tf.summary.merge_all()  #Merges all summaries collected in the default graph.
    self.build_optimizer()
def losses(self):
    """Masked sequence loss over the decoder outputs, with a 'loss'
    scalar summary attached."""
    with tf.variable_scope("loss"):
        # Mask padding past each output sequence's length.
        seq_len = tf.to_int32(self.output_seq_len)
        max_len = tf.to_int32(tf.shape(self.decoder_input_data)[1])
        mask = tf.sequence_mask(seq_len, max_len)
        loss = seq2seq.sequence_loss(self.logits,
                                     self.decoder_input_label,
                                     tf.to_float(mask))
        tf.summary.scalar("loss", loss)
    return loss
def createModel(int_to_vocab):
    """Build the RNN language-model graph, run the training loop, and save
    the trained model.

    NOTE(review): relies on module-level globals (rnn_size, embed_dim,
    int_text, batch_size, seq_length, num_epochs, learning_rate,
    show_every_n_batches, save_dir) — confirm they are defined at import.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():
        # Vocabulary size
        vocab_size = len(int_to_vocab)
        # Model input, target, and learning-rate nodes (tf placeholders)
        input_text, targets, lr = get_inputs()
        # Shape of the input data
        input_data_shape = tf.shape(input_text)
        # RNN cell and initial-state nodes (the cell already includes LSTM
        # and dropout); rnn_size is the number of units per LSTM cell.
        cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
        # Nodes computing the logits and the final state
        logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)
        # Softmax over the logits for prediction probabilities
        probs = tf.nn.softmax(logits, name='probs')
        # Loss (uniform weights — no padding mask)
        cost = seq2seq.sequence_loss(
            logits, targets,
            tf.ones([input_data_shape[0], input_data_shape[1]]))
        # Adam gradient descent
        optimizer = tf.train.AdamOptimizer(lr)
        # Clip gradients elementwise so they all lie in [-1, 1]
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
    #return train_op, train_graph, initial_state, input_text, targets
    # All batches used for training
    batches = get_batches(int_text, batch_size, seq_length)
    # Open a session over the graph built above and train
    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch_i in range(num_epochs):
            state = sess.run(initial_state, {input_text: batches[0][0]})
            for batch_i, (x, y) in enumerate(batches):
                feed = {
                    input_text: x,
                    targets: y,
                    initial_state: state,
                    lr: learning_rate
                }
                train_loss, state, _ = sess.run(
                    [cost, final_state, train_op], feed)
                # Periodic progress logging
                if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                    print(
                        'Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f}'.
                        format(epoch_i, batch_i, len(batches), train_loss))
        # Save the model
        saver = tf.train.Saver()
        saver.save(sess, save_dir)
        print('Model Trained and Saved')
        helper.save_params((seq_length, save_dir))
def _init_optimiser(self):
    """Masked sequence loss (+ optional L2 on recurrent weights), optimiser
    selection, and the (optionally clipped) train op."""
    self.current_lr = self._hparams.learning_rate
    # Mask padding beyond each label sequence's length.
    self._loss_weights = tf.sequence_mask(lengths=self._labels_length, dtype=self._hparams.dtype)
    self.batch_loss = seq2seq.sequence_loss(
        logits=self.decoder_train_outputs,
        targets=self._labels,
        weights=self._loss_weights,
        softmax_loss_function=None,
        average_across_batch=True,
        average_across_timesteps=True)
    self.reg_loss = 0
    if self._hparams.recurrent_l2_regularisation is not None:
        # L2-regularize only the recurrent-cell variables.
        regularisable_vars = _get_trainable_vars(self._hparams.cell_type)
        reg = tf.contrib.layers.l2_regularizer(
            scale=self._hparams.recurrent_l2_regularisation)
        self.reg_loss = tf.contrib.layers.apply_regularization(
            reg, regularisable_vars)
    self.batch_loss = self.batch_loss + self.reg_loss
    if self._hparams.optimiser == 'Adam':
        # Larger epsilon under non-fp32 dtypes for numerical stability.
        optimiser = tf.train.AdamOptimizer(
            learning_rate=self.current_lr,
            epsilon=1e-8 if self._hparams.dtype == tf.float32 else 1e-4,
        )
    elif self._hparams.optimiser == 'AdamW':
        from tensorflow.contrib.opt import AdamWOptimizer
        optimiser = AdamWOptimizer(
            learning_rate=self.current_lr,
            weight_decay=self._hparams.weight_decay,
            epsilon=1e-8 if self._hparams.dtype == tf.float32 else 1e-4,
        )
    elif self._hparams.optimiser == 'Momentum':
        optimiser = tf.train.MomentumOptimizer(
            learning_rate=self.current_lr,
            momentum=0.9,
            use_nesterov=False)
    else:
        raise Exception('Unsupported optimiser, try Adam')
    variables = tf.trainable_variables()
    gradients = tf.gradients(self.batch_loss, variables)
    if self._hparams.clip_gradients is True:
        gradients, _ = tf.clip_by_global_norm(
            gradients, self._hparams.max_gradient_norm)
    self.train_op = optimiser.apply_gradients(
        grads_and_vars=zip(gradients, variables),
        global_step=tf.train.get_global_step())
def _init_optimizer(self): self.targets = tf.transpose(self.decoder_train_targets,[1,0]) self.logits = tf.nn.softmax(tf.transpose(self.logits, [1,0,2])) print 'targets:',self.targets print 'logits:',self.logits self.loss = seq2seq.sequence_loss(logits=self.logits, targets=self.targets, weights=self.loss_weights) print 'self.loss:',self.loss # define train op tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), 10) optimizer = tf.train.AdamOptimizer(1e-3) self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, int_to_vocab, config):
    """
    Construct Recurrent Neural Network.

    Args:
        int_to_vocab: id -> token mapping; its length gives the vocab size.
        config: Dictionary of configuration parameters (lstm_layers,
            rnn_size, embed_dim, dropout_keep_prob).
    """
    lstm_layers = config['lstm_layers']
    rnn_size = config['rnn_size']
    embed_dim = config['embed_dim']
    dropout_keep_prob = config['dropout_keep_prob']
    self.train_graph = tf.Graph()
    with self.train_graph.as_default():
        tf.set_random_seed(1234)
        vocab_size = len(int_to_vocab)
        self.input_text, self.targets, self.lr = get_inputs()
        input_data_shape = tf.shape(self.input_text)
        # Unpack the shape of what the RNN outputs (array) to the shape that
        # the RNN expects in its next training step (tuples)
        self.init_state = tf.placeholder(tf.float32, [lstm_layers, 2, None, rnn_size], name='initial_state')
        state_per_layer_list = tf.unstack(self.init_state, axis=0)
        rnn_tuple_state = tuple(
            [tf.nn.rnn_cell.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
             for idx in range(lstm_layers)]
        )
        cell = get_init_cell(rnn_size, dropout_keep_prob, lstm_layers)
        logits, self.final_state = build_nn(cell, rnn_tuple_state, self.input_text, vocab_size, embed_dim)
        # Probabilities for generating words
        # Not used locally but referred to by tensor name during text
        # generation.
        probs = tf.nn.softmax(logits, name='probs')
        # Loss function (uniform weights — no padding mask)
        self.cost = seq2seq.sequence_loss(
            logits, self.targets,
            tf.ones([input_data_shape[0], input_data_shape[1]]))
        # Optimizer
        optimizer = tf.train.AdamOptimizer(self.lr)
        # Gradient Clipping (elementwise to [-1, 1])
        gradients = optimizer.compute_gradients(self.cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        self.train_op = optimizer.apply_gradients(capped_gradients)
def __init__(self, vocab_size, hidden_size, dropout, num_layers,
             max_gradient_norm, batch_size, learning_rate, lr_decay_factor,
             max_target_length, max_source_length, decoder_mode=False):
    '''Bidirectional-LSTM encoder / LSTM decoder seq2seq model.

    vocab_size: number of vocab tokens
    hidden_size: dimension of hidden layers (also the embedding width)
    dropout: input keep-probability for DropoutWrapper
    num_layers: number of stacked LSTM layers per direction
    max_gradient_norm: gradients are clipped element-wise to this magnitude
    batch_size: number of training examples fed to network at once
    learning_rate: starting learning rate of network (stored; the Adam
        optimizer below is currently built with its default rate)
    lr_decay_factor: amount by which to decay learning rate (unused here)
    max_target_length / max_source_length: padded sequence lengths
    decoder_mode: if True, build a beam-search inference decoder and no
        backward pass; otherwise build the training decoder and train op.
    '''
    GO_ID = config.GO_ID
    EOS_ID = config.EOS_ID
    self.max_source_length = max_source_length
    self.max_target_length = max_target_length
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.global_step = tf.Variable(0, trainable=False)
    self.learning_rate = learning_rate

    self.encoder_inputs = tf.placeholder(shape=(None, None),
                                         dtype=tf.int32,
                                         name='encoder_inputs')
    self.source_lengths = tf.placeholder(shape=(None,),
                                         dtype=tf.int32,
                                         name='source_lengths')
    self.decoder_targets = tf.placeholder(shape=(None, None),
                                          dtype=tf.int32,
                                          name='decoder_targets')
    self.target_lengths = tf.placeholder(shape=(None,),
                                         dtype=tf.int32,
                                         name="target_lengths")

    def make_cell_stack():
        # BUGFIX: the original did MultiRNNCell([cell] * num_layers) with a
        # single LSTMCell instance and then reused that same stack for both
        # directions of the bidirectional RNN; TF >= 1.1 either shares
        # weights or raises a variable-reuse error for that pattern. Build
        # a fresh cell object per layer and per use instead.
        cells = []
        for _ in range(num_layers):
            cell = rnn.LSTMCell(hidden_size)
            cells.append(rnn.DropoutWrapper(cell, input_keep_prob=dropout))
        return rnn.MultiRNNCell(cells)

    with tf.variable_scope('embeddings') as scope:
        embeddings = tf.Variable(tf.random_uniform([vocab_size, hidden_size],
                                                   -1.0, 1.0),
                                 dtype=tf.float32)
        encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                         self.encoder_inputs)
        targets_embedding = tf.nn.embedding_lookup(embeddings,
                                                   self.decoder_targets)

    with tf.variable_scope('encoder') as scope:
        _, encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=make_cell_stack(),
            cell_bw=make_cell_stack(),
            sequence_length=self.source_lengths,
            inputs=encoder_inputs_embedded,
            dtype=tf.float32,
            time_major=False)

    with tf.variable_scope('decoder') as scope:
        decoder_cell = make_cell_stack()
        output_layer = Dense(vocab_size)
        #TODO add attention
        #seq2seq.BahdanauAttention(num_units=,memory=encoder_output)
        #decoder_cell = seq2seq.AttentionWrapper(cell=decoder_cell,
        #                                        attention_mechanism=)
        # encoder_state is (fw_state, bw_state); both modes now seed the
        # decoder with the backward state (the original inconsistently used
        # [0] for beam search and [-1] for training).
        if decoder_mode:
            beam_width = 2
            # BUGFIX: the original referenced the undefined name GOD_ID
            # (NameError) and omitted the required `cell` argument; beam
            # search also needs the initial state tiled beam_width times
            # per batch entry, and an output layer to project to vocab.
            decoder = seq2seq.BeamSearchDecoder(
                cell=decoder_cell,
                embedding=embeddings,
                start_tokens=tf.tile([GO_ID], [batch_size]),
                end_token=EOS_ID,
                initial_state=seq2seq.tile_batch(encoder_state[-1],
                                                 multiplier=beam_width),
                beam_width=beam_width,
                output_layer=output_layer)
        else:
            helper = seq2seq.TrainingHelper(
                inputs=targets_embedding,
                sequence_length=self.target_lengths)
            decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                           helper=helper,
                                           initial_state=encoder_state[-1],
                                           output_layer=output_layer)

        # Cap decoding so beam search cannot run unbounded.
        final_outputs, final_state, final_sequence_lengths = \
            seq2seq.dynamic_decode(decoder=decoder,
                                   maximum_iterations=self.max_target_length)

    if decoder_mode:
        # Beam-search outputs expose predicted ids, not rnn_output logits
        # (the original read .rnn_output unconditionally, which fails on
        # BeamSearchDecoder's output type).
        self.predicted_ids = final_outputs.predicted_ids
    else:
        self.logits = final_outputs.rnn_output
        with tf.variable_scope("loss") as scope:
            # dynamic_decode stops at the batch's longest sequence, so pad
            # logits out to the static max_target_length to line up with
            # the [batch, max_target_length] targets placeholder.
            pad_size = self.max_target_length - \
                tf.reduce_max(final_sequence_lengths)
            self.logits = tf.pad(self.logits,
                                 [[0, 0], [0, pad_size], [0, 0]])
            weights = tf.sequence_mask(lengths=final_sequence_lengths,
                                       maxlen=self.max_target_length,
                                       dtype=tf.float32,
                                       name='weights')
            x_entropy_loss = seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.decoder_targets,
                weights=weights)
            # sequence_loss already averages across batch and time, so this
            # reduce_mean is a scalar no-op kept for interface stability.
            self.loss = tf.reduce_mean(x_entropy_loss)
            optimizer = tf.train.AdamOptimizer()
            gradients = optimizer.compute_gradients(x_entropy_loss)
            # BUGFIX: skip variables without a gradient —
            # tf.clip_by_value(None, ...) raises.
            capped_grads = [(tf.clip_by_value(grad, -max_gradient_norm,
                                              max_gradient_norm), var)
                            for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(
                capped_grads, global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables())
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    """Build a bidirectional-LSTM encoder / attention-LSTM decoder network.

    Args:
        embedding_dim: width of the source/target word embeddings.
            NOTE(review): the projections below reshape encoder outputs
            using `embedding_dim * 2` and `embedding_dim`, but those
            tensors are `encoder_size`-wide — this graph only builds
            correctly when embedding_dim == encoder_size; confirm with the
            caller's configuration.
        encoder_size: hidden units per encoder direction.
        decoder_size: hidden units of the decoder LSTM.
        source_dict_dim: source vocabulary size.
        target_dict_dim: target vocabulary size.
        is_generating: False builds the training branch (returns loss),
            True builds the beam-search inference branch (returns ids).
        beam_size: beam width used when is_generating.
        max_generation_length: decoding-step cap when is_generating.

    Returns:
        A (placeholders_dict, op) pair: op is the scalar sequence loss in
        training mode, or the predicted-id tensor in generating mode.
    """
    # Source token ids [batch, time] and per-example true lengths.
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])

    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights,
                                           src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to size of decoder lstm
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            encoded_vec, shape=[-1, embedding_dim * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get init state for decoder lstm's H
    # NOTE(review): slices the backward RNN's output at t=0 — presumably
    # its summary of the whole sequence — to seed the decoder's hidden
    # state; verify this is the intended summary vector.
    backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            backword_first, shape=[-1, embedding_dim]),
        num_outputs=decoder_size,
        activation_fn=tf.nn.tanh,
        biases_initializer=None)

    # prepare the initial state for decoder lstm: zero cell state, the
    # projected encoder summary as the hidden state
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)

    # create decoder lstm cell; when generating, every attention input must
    # be tiled beam_size times per batch entry
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec
        if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
        encoded_proj_reshape
        if not is_generating else seq2seq.tile_batch(encoded_proj_reshape,
                                                     beam_size),
        src_sequence_length
        if not is_generating else seq2seq.tile_batch(src_sequence_length,
                                                     beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        # Target-side placeholders: decoder input ids and their lengths.
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        # Teacher forcing: feed the ground-truth target embedding at each
        # decoding step.
        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')
        training_decoder = seq2seq.BasicDecoder(
            cell=decoder_cell,
            helper=training_helper,
            initial_state=initial_state,
            output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(
            decoder_logits_train, axis=-1, name='decoder_pred_train')
        # Zero-weight the padding positions so they contribute no loss.
        masks = tf.sequence_mask(
            lengths=trg_sequence_length,
            maxlen=max_decoder_length,
            dtype=tf.float32,
            name='masks')

        # place holder of label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss, averaged over both time steps and batch
        loss = seq2seq.sequence_loss(
            logits=decoder_logits_train,
            targets=lbl_word_idx,
            weights=masks,
            average_across_timesteps=True,
            average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        # Every beam starts from the module-level START_TOKEN_IDX.
        start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
                               tf.int32) * START_TOKEN_IDX
        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        # Beam-search decoder; the initial LSTM state is tiled beam_size
        # times to match the tiled attention inputs above.
        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(
                trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            #impute_finished=True,# error occurs
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids

        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids