def teacherforce_loss():
    # Nested helper: relies on output, gt_target and decode_word_weight from the
    # enclosing scope.
    if self.model_config.number_samples > 0:
        # Use sampled softmax instead of the full-vocabulary softmax.
        loss_fn = tf.nn.sampled_softmax_loss
    else:
        loss_fn = None
    # Note: the sampled-softmax arguments below are commented out, so loss_fn is
    # currently unused and the loss falls back to the full softmax.
    loss = sequence_loss(
        logits=tf.stack(output.decoder_logit_list, axis=1),
        targets=gt_target,
        weights=decode_word_weight,
        # softmax_loss_function=loss_fn,
        # w=w,
        # b=b,
        # decoder_outputs=decoder_outputs,
        # number_samples=self.model_config.number_samples
    )
    return loss
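# A minimal sketch of the custom sequence_loss helper these models are assumed to
# call. It is not the stock tf.contrib.seq2seq.sequence_loss, which has no
# w / b / decoder_outputs / number_samples arguments. Everything here is an
# illustrative assumption: it is written against TF 1.x with tensorflow imported as
# tf, and it assumes w has shape [dimension, vocab_size] as used by output_to_logit.
def sequence_loss(logits, targets, weights=None, softmax_loss_function=None,
                  w=None, b=None, decoder_outputs=None, number_samples=0):
    if softmax_loss_function is not None and number_samples > 0:
        # Sampled softmax operates on the pre-projection decoder outputs, not logits.
        dim = decoder_outputs.get_shape()[-1].value
        vocab_size = w.get_shape()[-1].value
        flat_outputs = tf.reshape(decoder_outputs, [-1, dim])
        flat_targets = tf.reshape(tf.cast(targets, tf.int64), [-1, 1])
        crossent = softmax_loss_function(
            weights=tf.transpose(w), biases=b,
            labels=flat_targets, inputs=flat_outputs,
            num_sampled=number_samples, num_classes=vocab_size)
    else:
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=targets, logits=logits)
        crossent = tf.reshape(crossent, [-1])
    # Mask out padding positions and average over the non-padded tokens.
    if weights is None:
        flat_weights = tf.ones_like(crossent)
    else:
        flat_weights = tf.reshape(weights, [-1])
    return tf.reduce_sum(crossent * flat_weights) / (tf.reduce_sum(flat_weights) + 1e-12)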
def create_model(self):
    with tf.variable_scope('variables'):
        abstr_ph = []
        for _ in range(self.model_config.max_abstr_len):
            abstr_ph.append(
                tf.zeros(self.model_config.batch_size, tf.int32, name='abstract_input'))

        kwords_ph = []
        for _ in range(self.model_config.max_cnt_kword):
            kword = []
            for _ in range(self.model_config.max_kword_len):
                kword.append(
                    tf.zeros(self.model_config.batch_size, tf.int32, name='kword_input'))
            kwords_ph.append(kword)

        emb_abstr, emb_kword, proj_w, proj_b = self.get_embedding()
        abstr = tf.stack(self.embedding_fn(abstr_ph, emb_abstr), axis=1)
        kwords = []
        for kword_idx in range(self.model_config.max_cnt_kword):
            kwords.append(self.embedding_fn(kwords_ph[kword_idx], emb_kword))

    with tf.variable_scope('model_encoder'):
        if self.hparams.pos == 'timing':
            abstr = common_attention.add_timing_signal_1d(abstr)
        encoder_embed_inputs = tf.nn.dropout(
            abstr, 1.0 - self.hparams.layer_prepostprocess_dropout)
        abstr_bias = common_attention.attention_bias_ignore_padding(
            tf.to_float(tf.equal(tf.stack(abstr_ph, axis=1),
                                 self.voc_kword.encode(constant.SYMBOL_PAD))))
        abstr_outputs = transformer.transformer_encoder(
            encoder_embed_inputs, abstr_bias, self.hparams)

        if 'tuzhaopeng' in self.model_config.cov_mode:
            # Per-head attention memory shared across keyphrase slots.
            attn_stick = tf.ones(
                [self.model_config.batch_size,
                 self.model_config.num_heads,
                 1,
                 # Integer division so the shape stays an int under Python 3.
                 self.model_config.dimension // self.model_config.num_heads],
                tf.float32, 'attn_memory')

    losses = []
    targets = []
    obj = {}
    with tf.variable_scope('model_decoder'):
        # One decode pass per keyphrase slot; decoder variables are reused across slots.
        for kword_idx in range(self.model_config.max_cnt_kword):
            if self.is_train:
                # Teacher forcing: feed the ground-truth keyphrase (minus its last
                # token) as decoder input.
                kword = kwords[kword_idx][:-1]
                kword_ph = kwords_ph[kword_idx]
                kword_output_list, new_attn_stick = self.decode_step(
                    kword, abstr_outputs, abstr_bias, attn_stick)
                kword_logit_list = [self.output_to_logit(o, proj_w, proj_b)
                                    for o in kword_output_list]
                kword_target_list = [tf.argmax(o, output_type=tf.int32, axis=-1)
                                     for o in kword_logit_list]
                attn_stick = new_attn_stick

                if self.model_config.number_samples > 0:
                    loss_fn = tf.nn.sampled_softmax_loss
                else:
                    loss_fn = None
                # Mask padding positions out of the loss.
                kword_lossbias = [
                    tf.to_float(tf.not_equal(
                        d, self.voc_kword.encode(constant.SYMBOL_PAD)))
                    for d in kword_ph]
                kword_lossbias = tf.stack(kword_lossbias, axis=1)
                loss = sequence_loss(
                    logits=tf.stack(kword_logit_list, axis=1),
                    targets=tf.stack(kword_ph, axis=1),
                    weights=kword_lossbias,
                    softmax_loss_function=loss_fn,
                    w=proj_w,
                    b=proj_b,
                    decoder_outputs=tf.stack(kword_output_list, axis=1),
                    number_samples=self.model_config.number_samples)
                targets.append(tf.stack(kword_target_list, axis=1))

                if ('tuzhaopeng' in self.model_config.cov_mode
                        and 'kp_attn' in self.model_config.cov_mode):
                    # Update the attention memory with the predicted keyphrase embedding.
                    target_emb = tf.stack(
                        self.embedding_fn(kword_target_list, emb_kword), axis=1)
                    target_emb = common_attention.split_heads(
                        target_emb, self.model_config.num_heads)
                    target_emb = tf.reduce_mean(target_emb, axis=2)
                    target_emb_trans = tf.get_variable(
                        'dim_weight_trans',
                        shape=[1,
                               target_emb.get_shape()[-1].value,
                               target_emb.get_shape()[-1].value],
                        dtype=tf.float32,
                        initializer=tf.contrib.layers.xavier_initializer())
                    target_emb = tf.nn.conv1d(target_emb, target_emb_trans, 1, 'SAME')
                    target_emb = tf.expand_dims(target_emb, axis=2)
                    attn_stick += target_emb
                losses.append(loss)
            else:
                if self.model_config.beam_search_size > 0:
                    loss, target, new_attn_stick = self.transformer_beam_search(
                        abstr_outputs, abstr_bias, emb_kword, proj_w, proj_b,
                        attn_stick=attn_stick)
                else:
                    loss, target, new_attn_stick = self.greed_search(
                        kword_idx, abstr_outputs, abstr_bias, emb_kword,
                        proj_w, proj_b, attn_stick=attn_stick)
                targets.append(target)
                losses = loss
                attn_stick = new_attn_stick

                if ('tuzhaopeng' in self.model_config.cov_mode
                        and 'kp_attn' in self.model_config.cov_mode):
                    target.set_shape([self.model_config.batch_size,
                                      self.model_config.max_kword_len])
                    target_list = tf.unstack(target, axis=1)
                    target_emb = tf.stack(
                        self.embedding_fn(target_list, emb_kword), axis=1)
                    target_emb = common_attention.split_heads(
                        target_emb, self.model_config.num_heads)
                    target_emb = tf.reduce_mean(target_emb, axis=2)
                    target_emb_trans = tf.get_variable(
                        'dim_weight_trans',
                        shape=[1,
                               target_emb.get_shape()[-1].value,
                               target_emb.get_shape()[-1].value],
                        dtype=tf.float32,
                        initializer=tf.contrib.layers.xavier_initializer())
                    target_emb = tf.nn.conv1d(target_emb, target_emb_trans, 1, 'SAME')
                    target_emb = tf.expand_dims(target_emb, axis=2)
                    attn_stick += target_emb
            # Share decoder variables across keyphrase slots.
            tf.get_variable_scope().reuse_variables()

    if targets:
        obj['targets'] = tf.stack(targets, axis=1)
    obj['abstr_ph'] = abstr_ph
    obj['kwords_ph'] = kwords_ph
    obj['attn_stick'] = attn_stick
    if isinstance(losses, list):
        losses = tf.add_n(losses)
    return losses, obj
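# Usage sketch (assumption, not from the repository): the *_ph tensors above are
# zero-valued defaults, and TF 1.x allows feed_dict to override any feedable tensor,
# so a training step presumably feeds one [batch_size] id slice per time step.
# `model`, `batch_abstract_ids` ([batch, max_abstr_len]) and `batch_kword_ids`
# ([batch, max_cnt_kword, max_kword_len]) are hypothetical names.
loss_t, obj = model.create_model()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {}
    for step, ph in enumerate(obj['abstr_ph']):
        feed[ph] = batch_abstract_ids[:, step]
    for kword_idx, kword_ph in enumerate(obj['kwords_ph']):
        for step, ph in enumerate(kword_ph):
            feed[ph] = batch_kword_ids[:, kword_idx, step]
    loss_value = sess.run(loss_t, feed_dict=feed)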
def create_model(self):
    with tf.variable_scope('variables'):
        abstr_ph = []
        for _ in range(self.model_config.max_abstr_len):
            abstr_ph.append(
                tf.zeros(self.model_config.batch_size, tf.int32, name='abstract_input'))

        kwords_ph = []
        for _ in range(self.model_config.max_cnt_kword):
            kword = []
            for _ in range(self.model_config.max_kword_len):
                kword.append(
                    tf.zeros(self.model_config.batch_size, tf.int32, name='kword_input'))
            kwords_ph.append(kword)

        # Train for length control
        if self.is_train:
            kword_occupies_ph = []
            for _ in range(self.model_config.max_cnt_kword):
                kword_occupies_ph.append(
                    tf.zeros(self.model_config.batch_size, tf.float32,
                             name='kword_occupy_input'))

        emb_abstr, emb_kword, proj_w, proj_b = self.get_embedding()
        abstr = tf.stack(self.embedding_fn(abstr_ph, emb_abstr), axis=1)
        kwords = []
        for kword_idx in range(self.model_config.max_cnt_kword):
            kwords.append(self.embedding_fn(kwords_ph[kword_idx], emb_kword))

    with tf.variable_scope('model_encoder'):
        if self.hparams.pos == 'timing':
            abstr = common_attention.add_timing_signal_1d(abstr)
        encoder_embed_inputs = tf.nn.dropout(
            abstr, 1.0 - self.hparams.layer_prepostprocess_dropout)
        abstr_bias = common_attention.attention_bias_ignore_padding(
            tf.to_float(tf.equal(tf.stack(abstr_ph, axis=1),
                                 self.voc_kword.encode(constant.SYMBOL_PAD))))
        abstr_outputs = transformer.transformer_encoder(
            encoder_embed_inputs, abstr_bias, self.hparams)

    losses = []
    targets = []
    pred_occupies = []
    obj = {}
    hist_vector = None
    if 'kp_attn' in self.model_config.cov_mode:
        hist_vector = tf.zeros(
            [self.model_config.batch_size, 1, self.model_config.dimension])

    with tf.variable_scope('model_decoder'):
        if self.model_config.subword_vocab_size:
            go_id = self.voc_kword.encode(constant.SYMBOL_GO)[0]
        else:
            go_id = self.voc_kword.encode(constant.SYMBOL_GO)
        batch_go = tf.tile(
            tf.expand_dims(self.embedding_fn(go_id, emb_kword), axis=0),
            [self.model_config.batch_size, 1])

        for kword_idx in range(self.model_config.max_cnt_kword):
            if self.is_train:
                kword = kwords[kword_idx][:-1]
                kword_ph = kwords_ph[kword_idx]
                kword_output, kword_output_list = self.decode_step(
                    kword, abstr_outputs, abstr_bias, batch_go,
                    hist_vector=hist_vector)
                kword_logit_list = [self.output_to_logit(o, proj_w, proj_b)
                                    for o in kword_output_list]
                kword_target_list = [tf.argmax(o, output_type=tf.int32, axis=-1)
                                     for o in kword_logit_list]

                kword_lossbias = [
                    tf.to_float(tf.not_equal(
                        d, self.voc_kword.encode(constant.SYMBOL_PAD)))
                    for d in kword_ph]
                kword_lossbias = tf.stack(kword_lossbias, axis=1)
                if self.model_config.number_samples > 0:
                    loss_fn = tf.nn.sampled_softmax_loss
                else:
                    loss_fn = None
                loss = sequence_loss(
                    logits=tf.stack(kword_logit_list, axis=1),
                    targets=tf.stack(kword_ph, axis=1),
                    weights=kword_lossbias,
                    softmax_loss_function=loss_fn,
                    w=proj_w,
                    b=proj_b,
                    decoder_outputs=tf.stack(kword_output_list, axis=1),
                    number_samples=self.model_config.number_samples)
                kword_target = tf.stack(kword_target_list, axis=1)
                targets.append(kword_target)

                if 'kp_attn' in self.model_config.cov_mode:
                    kword_embed = self.embedding_fn(kword_ph, emb_kword)
                    hist_vector += tf.expand_dims(
                        tf.reduce_mean(tf.stack(kword_embed, axis=1), axis=1), axis=1)

                # Train for length control
                pred_occupy = self.get_pred_occupy_logit(hist_vector, abstr_outputs)
                occupy_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=pred_occupy, labels=kword_occupies_ph[kword_idx])
                loss += tf.reduce_mean(occupy_loss)
                pred_occupies.append(pred_occupy)

                losses.append(loss)
            else:
                loss, kword_target = self.transformer_beam_search(
                    abstr_outputs, abstr_bias, emb_kword, proj_w, proj_b,
                    hist_vector=hist_vector)
                targets.append(kword_target)
                losses = loss

                if 'kp_attn' in self.model_config.cov_mode:
                    kword_embed = self.embedding_fn(kword_target, emb_kword)
                    hist_vector += tf.expand_dims(
                        tf.reduce_mean(kword_embed, axis=1), axis=1)

                pred_occupy = tf.round(tf.sigmoid(
                    self.get_pred_occupy_logit(hist_vector, abstr_outputs)))
                pred_occupies.append(pred_occupy)

            tf.get_variable_scope().reuse_variables()

    if targets:
        obj['targets'] = tf.stack(targets, axis=1)
    obj['abstr_ph'] = abstr_ph
    obj['kwords_ph'] = kwords_ph
    if self.is_train:
        obj['kword_occupies_ph'] = kword_occupies_ph
    pred_occupies = tf.stack(pred_occupies, axis=1)
    obj['pred_occupies'] = pred_occupies
    if isinstance(losses, list):
        losses = tf.add_n(losses)
    return losses, obj
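# Hypothetical sketch of get_pred_occupy_logit, which is called above but not shown
# here: a single linear layer over the keyphrase-history vector and the mean-pooled
# encoder states that scores whether the current keyphrase slot should be occupied.
# Variable names and the pooling choice are assumptions, not the project's code.
def get_pred_occupy_logit(self, hist_vector, abstr_outputs):
    pooled = tf.reduce_mean(abstr_outputs, axis=1)      # [batch, dimension]
    hist = tf.squeeze(hist_vector, axis=1)              # [batch, dimension]
    feat = tf.concat([hist, pooled], axis=-1)           # [batch, 2 * dimension]
    occupy_w = tf.get_variable(
        'occupy_w', [feat.get_shape()[-1].value, 1], tf.float32,
        initializer=tf.contrib.layers.xavier_initializer())
    occupy_b = tf.get_variable(
        'occupy_b', [1], tf.float32, initializer=tf.zeros_initializer())
    # Shape [batch]; fed to sigmoid_cross_entropy_with_logits during training and
    # rounded after a sigmoid at inference time.
    return tf.squeeze(tf.matmul(feat, occupy_w) + occupy_b, axis=-1)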
def create_model(self):
    # Input
    with tf.variable_scope('embedding'):
        self.emb = tf.get_variable(
            'embedding',
            [args.event_size + constant.NUM_SPEC_MARK, args.dimension],
            tf.float32, initializer=xavier_initializer())

    with tf.variable_scope('inputs'):
        self.inputs_ph = []
        for step in range(args.max_len):
            self.inputs_ph.append(
                tf.zeros(args.batch_size, tf.int32, name='event'))
        # Decoder inputs are the events shifted right by one step, starting from <go>.
        self.inpt_events = [
            tf.zeros(args.batch_size, tf.int32, name='go')] + self.inputs_ph[:-1]
        self.inpt_events_emb = tf.stack(
            self.get_embedding(self.inpt_events), axis=1)
        self.pred_events = self.inputs_ph
        # Lower-triangular bias enforces causal (left-to-right) self-attention.
        self_attention_bias = (
            common_attention.attention_bias_lower_triangle(args.max_len))

    with tf.variable_scope('model'):
        outputs = self.attention_lm_decoder(
            self.inpt_events_emb, self_attention_bias, self.hparams, 'trans')
        self.w = tf.get_variable(
            'output_w',
            [args.dimension, args.event_size + constant.NUM_SPEC_MARK],
            tf.float32, initializer=xavier_initializer())
        self.b = tf.get_variable(
            'output_b',
            [args.event_size + constant.NUM_SPEC_MARK],
            tf.float32, initializer=xavier_initializer())
        # logits = tf.nn.xw_plus_b(outputs, tf.transpose(self.w), self.b)
        # Note: unlike the xw_plus_b variant above, the conv1d projection does not
        # apply the bias self.b.
        logits = tf.nn.conv1d(outputs, tf.expand_dims(self.w, 0), 1, 'SAME')

    with tf.variable_scope('loss'):
        self.loss = sequence_loss(
            logits=logits, targets=tf.stack(self.pred_events, axis=1))

    with tf.variable_scope('optim'):
        self.global_step = tf.get_variable(
            'global_step', initializer=tf.constant(0, dtype=tf.int64),
            trainable=False)

        if self.is_train:
            self.increment_global_step = tf.assign_add(self.global_step, 1)
            opt = tf.train.AdagradOptimizer(args.learning_rate)
            grads_and_vars = opt.compute_gradients(
                self.loss, var_list=tf.trainable_variables())
            grads = [g for (g, v) in grads_and_vars]
            # Clip gradients by global norm before applying them.
            clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)
            self.train_op = opt.apply_gradients(
                zip(clipped_grads, tf.trainable_variables()),
                global_step=self.global_step)
        else:
            # At inference, only the prediction for the final position is needed.
            self.last_event = tf.argmax(logits[:, -1, :], axis=-1)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
    print('Graph Built.')
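# Usage sketch (assumption): one gradient-clipped training step for the event LM
# above. `Model` and `batch_events` ([batch_size, max_len] int ids) are hypothetical
# names; only inputs_ph, train_op, loss and global_step come from the code. Since
# apply_gradients is given global_step, it already increments the step counter, so
# increment_global_step does not also need to be run here.
model = Model(args, is_train=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {ph: batch_events[:, step] for step, ph in enumerate(model.inputs_ph)}
    _, loss_value, step_value = sess.run(
        [model.train_op, model.loss, model.global_step], feed_dict=feed)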