def build_graph(self):
    with self._graph.as_default():
        rand_seed = self._conf['rand_seed']
        tf.set_random_seed(rand_seed)

        # word embedding
        if self._word_embedding_init is not None:
            word_embedding_initializer = tf.constant_initializer(
                self._word_embedding_init)
        else:
            word_embedding_initializer = tf.random_normal_initializer(
                stddev=0.1)
        self._word_embedding = tf.get_variable(
            name='word_embedding',
            shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
            dtype=tf.float32,
            initializer=word_embedding_initializer)

        # define placeholders
        self.turns = tf.placeholder(
            tf.int32,
            shape=[
                self._conf["batch_size"], self._conf["max_turn_num"],
                self._conf["max_turn_len"]
            ])
        self.tt_turns_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]])
        self.every_turn_len = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_num"]])
        self.response = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_len"]])
        self.response_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]])
        self.label = tf.placeholder(
            tf.float32, shape=[self._conf["batch_size"]])

        # define operations
        # response part
        Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)

        if self._conf['is_positional'] and self._conf['stack_num'] > 0:
            with tf.variable_scope('positional'):
                Hr = op.positional_encoding_vector(Hr, max_timescale=10)
        for index in range(self._conf['stack_num']):
            with tf.variable_scope('self_stack_' + str(index)):
                Hr = layers.block(
                    Hr, Hr, Hr,
                    Q_lengths=self.response_len,
                    K_lengths=self.response_len)

        # context part
        # a list of length max_turn_num, every element is a tensor
        # with shape [batch, max_turn_len]
        list_turn_t = tf.unstack(self.turns, axis=1)
        list_turn_length = tf.unstack(self.every_turn_len, axis=1)

        sim_turns = []
        # for every turn_t calculate matching vector
        for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
            Hu = tf.nn.embedding_lookup(
                self._word_embedding, turn_t)  # [batch, max_turn_len, emb_size]

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional', reuse=True):
                    Hu = op.positional_encoding_vector(Hu, max_timescale=10)

            for index in range(self._conf['stack_num']):
                with tf.variable_scope('self_stack_' + str(index), reuse=True):
                    Hu = layers.block(
                        Hu, Hu, Hu,
                        Q_lengths=t_turn_length,
                        K_lengths=t_turn_length)

                with tf.variable_scope('u_attentd_r_' + str(index)):
                    try:
                        u_a_r = layers.block(
                            Hu, Hr, Hr,
                            Q_lengths=t_turn_length,
                            K_lengths=self.response_len)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        u_a_r = layers.block(
                            Hu, Hr, Hr,
                            Q_lengths=t_turn_length,
                            K_lengths=self.response_len)

                with tf.variable_scope('r_attend_u_' + str(index)):
                    try:
                        r_a_u = layers.block(
                            Hr, Hu, Hu,
                            Q_lengths=self.response_len,
                            K_lengths=t_turn_length)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        r_a_u = layers.block(
                            Hr, Hu, Hu,
                            Q_lengths=self.response_len,
                            K_lengths=t_turn_length)

            u_a_r = tf.stack([u_a_r, Hu], axis=-1)
            r_a_u = tf.stack([r_a_u, Hr], axis=-1)

            # calculate similarity matrix
            with tf.variable_scope('similarity'):
                # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1]
                # divide by sqrt(200) to prevent gradient explosion
                sim = tf.einsum('biks,bjks->bijs', r_a_u,
                                u_a_r) / tf.sqrt(200.0)

            sim_turns.append(sim)

        # cnn and aggregation
        sim = tf.stack(sim_turns, axis=1)
        print('sim shape: %s' % sim.shape)
        with tf.variable_scope('cnn_aggregation'):
            final_info = layers.CNN_3d(sim, 32, 16)
            # for douban
            # final_info = layers.CNN_3d(sim, 16, 16)

        # loss and train
        with tf.variable_scope('loss'):
            self.loss, self.logits = layers.loss(final_info, self.label)

            self.global_step = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step,
                decay_steps=400,
                decay_rate=0.9,
                staircase=True)

            Optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.optimizer = Optimizer.minimize(self.loss)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(
                max_to_keep=self._conf["max_to_keep"])
            self.all_variables = tf.global_variables()
            self.all_operations = self._graph.get_operations()
            self.grads_and_vars = Optimizer.compute_gradients(self.loss)

            for grad, var in self.grads_and_vars:
                if grad is None:  # flag variables that receive no gradient
                    print(var)

            self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                               for grad, var in self.grads_and_vars]
            self.g_updates = Optimizer.apply_gradients(
                self.capped_gvs, global_step=self.global_step)

    return self._graph
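# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): one training step on the graph above.
# `net`, `batch`, and the helper name are assumptions for this sketch; only
# the placeholder and op names defined in build_graph() come from the code.
# ---------------------------------------------------------------------------
import tensorflow as tf

def train_one_step(net, batch):
    """Run a single clipped-gradient update; `batch` holds numpy arrays."""
    graph = net.build_graph()
    with tf.Session(graph=graph) as sess:
        sess.run(net.init)
        feed = {
            net.turns: batch['turns'],                    # [batch, max_turn_num, max_turn_len]
            net.tt_turns_len: batch['tt_turns_len'],      # [batch]
            net.every_turn_len: batch['every_turn_len'],  # [batch, max_turn_num]
            net.response: batch['response'],              # [batch, max_turn_len]
            net.response_len: batch['response_len'],      # [batch]
            net.label: batch['label'],                    # [batch]
        }
        # g_updates applies the clipped gradients and advances global_step
        _, loss = sess.run([net.g_updates, net.loss], feed_dict=feed)
    return loss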
def build_graph(self):
    with self._graph.as_default():
        if self._conf['rand_seed'] is not None:
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)
            print('set tf random seed: %s' % self._conf['rand_seed'])

        # word embedding
        if self._word_embedding_init is not None:
            word_embedding_initializer = tf.constant_initializer(
                self._word_embedding_init)
        else:
            word_embedding_initializer = tf.random_normal_initializer(
                stddev=0.1)
        self._word_embedding = tf.get_variable(
            name='word_embedding',
            shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
            dtype=tf.float32,
            initializer=word_embedding_initializer)

        # define placeholders
        # configured by max_turn_history_num
        self.turns_history = tf.placeholder(
            tf.int32,
            shape=[
                self._conf["batch_size"],
                self._conf["max_turn_history_num"],
                self._conf["max_turn_len"]
            ])
        self.turns = tf.placeholder(
            tf.int32,
            shape=[
                self._conf["batch_size"], self._conf["max_turn_num"],
                self._conf["max_turn_len"]
            ])
        self.tt_turns_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]])
        self.every_turn_len = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_num"]])
        self.response = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_len"]])
        self.response_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]])
        self.label = tf.placeholder(
            tf.float32, shape=[self._conf["batch_size"]])

        # define operations
        # response part
        Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
        turns_history_embedding = tf.nn.embedding_lookup(
            self._word_embedding, self.turns_history)

        if self._conf['is_positional'] and self._conf['stack_num'] > 0:
            with tf.variable_scope('positional'):
                Hr = op.positional_encoding_vector(Hr, max_timescale=10)
        Hr_stack = [Hr]

        # fold the turn axis into the batch axis so self-attention runs over
        # every history turn at once, then unfold afterwards
        _batch_size, _turn_nums, _turn_words, _emb_size = \
            turns_history_embedding.get_shape().as_list()
        turns_history_embedding = tf.reshape(
            turns_history_embedding, [-1, _turn_words, _emb_size])
        for index in range(self._conf['stack_num']):
            turns_history_embedding, _ = self._multihead(
                turns_history_embedding, turns_history_embedding,
                turns_history_embedding)
        turns_history_embedding = tf.reshape(
            turns_history_embedding,
            [_batch_size, _turn_nums, _turn_words, _emb_size])

        for index in range(self._conf['stack_num']):
            with tf.variable_scope('self_stack_' + str(index)):
                Hr = layers.block(
                    Hr, Hr, Hr,
                    Q_lengths=self.response_len,
                    K_lengths=self.response_len)
                Hr_stack.append(Hr)

        with tf.variable_scope('respone_extraction_history'):
            turn_important_inf = []
            # TODO: a fully-connected layer needs to be added here
            for _t in tf.split(turns_history_embedding,
                               self._conf['max_turn_history_num'], 1):
                _t = tf.squeeze(_t)
                # _match_result = layers.attention(
                #     Hr_stack[-1], _t, _t,
                #     self.response_len, self.response_len)
                _match_result = layers.attention(
                    self._dense1(Hr_stack[-1]), _t, _t,
                    self.response_len, self.response_len)
                turn_important_inf.append(tf.expand_dims(_match_result, 1))
            best_turn_match = tf.concat(turn_important_inf, 1)

        with tf.variable_scope('response_extraciton_best_information'):
            # best_information, _ = self._multihead(
            #     Hr_stack[-1], best_turn_match, best_turn_match)
            best_information, _ = self._multihead(
                self._dense2(Hr_stack[-1]), best_turn_match, best_turn_match)
            best_information = layers.FFN(best_information)

        # context part
        # a list of length max_turn_num, every element is a tensor
        # with shape [batch, max_turn_len]
        list_turn_t = tf.unstack(self.turns, axis=1)
        list_turn_length = tf.unstack(self.every_turn_len, axis=1)

        sim_turns = []
        # for every turn_t calculate matching vector
        for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
            Hu = tf.nn.embedding_lookup(
                self._word_embedding, turn_t)  # [batch, max_turn_len, emb_size]

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional', reuse=True):
                    Hu = op.positional_encoding_vector(Hu, max_timescale=10)
            Hu_stack = [Hu]

            for index in range(self._conf['stack_num']):
                with tf.variable_scope('self_stack_' + str(index), reuse=True):
                    Hu = layers.block(
                        Hu, Hu, Hu,
                        Q_lengths=t_turn_length,
                        K_lengths=t_turn_length)
                    Hu_stack.append(Hu)

            r_a_t_stack = []
            t_a_r_stack = []
            for index in range(self._conf['stack_num'] + 1):
                with tf.variable_scope('t_attend_r_' + str(index)):
                    try:
                        t_a_r = layers.block(
                            tf.add(Hu_stack[index], best_information),
                            Hr_stack[index], Hr_stack[index],
                            Q_lengths=t_turn_length,
                            K_lengths=self.response_len)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        t_a_r = layers.block(
                            tf.add(Hu_stack[index], best_information),
                            Hr_stack[index], Hr_stack[index],
                            Q_lengths=t_turn_length,
                            K_lengths=self.response_len)

                with tf.variable_scope('r_attend_t_' + str(index)):
                    try:
                        r_a_t = layers.block(
                            Hr_stack[index],
                            tf.add(Hu_stack[index], best_information),
                            tf.add(Hu_stack[index], best_information),
                            Q_lengths=self.response_len,
                            K_lengths=t_turn_length)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        r_a_t = layers.block(
                            Hr_stack[index],
                            tf.add(Hu_stack[index], best_information),
                            tf.add(Hu_stack[index], best_information),
                            Q_lengths=self.response_len,
                            K_lengths=t_turn_length)

                t_a_r_stack.append(t_a_r)
                r_a_t_stack.append(r_a_t)

            t_a_r_stack.extend(Hu_stack)
            r_a_t_stack.extend(Hr_stack)

            t_a_r = tf.stack(t_a_r_stack, axis=-1)
            r_a_t = tf.stack(r_a_t_stack, axis=-1)

            # calculate similarity matrix
            with tf.variable_scope('similarity'):
                # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1]
                # divide by sqrt(200) to prevent gradient explosion
                sim = tf.einsum('biks,bjks->bijs', t_a_r,
                                r_a_t) / tf.sqrt(200.0)

            sim_turns.append(sim)

        # cnn and aggregation
        sim = tf.stack(sim_turns, axis=1)
        print('sim shape: %s' % sim.shape)
        with tf.variable_scope('cnn_aggregation'):
            final_info = layers.CNN_3d(sim, 32, 16)
            # final_info_dim = final_info.get_shape().as_list()[-1]
            # for douban
            # final_info = layers.CNN_3d(sim, 16, 16)

        # _x = self._conv1d(best_information)
        # _x = self._pool1d(_x)
        # final_info = tf.concat([final_info, best_information], -1)

        # loss and train
        with tf.variable_scope('loss'):
            self.loss, self.logits = layers.loss(final_info, self.label)

            self.global_step = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step,
                decay_steps=400,
                decay_rate=0.9,
                staircase=True)

            Optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.optimizer = Optimizer.minimize(
                self.loss, global_step=self.global_step)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(
                max_to_keep=self._conf["max_to_keep"])
            self.all_variables = tf.global_variables()
            self.all_operations = self._graph.get_operations()
            self.grads_and_vars = Optimizer.compute_gradients(self.loss)

            for grad, var in self.grads_and_vars:
                if grad is None:  # flag variables that receive no gradient
                    print(var)

            self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                               for grad, var in self.grads_and_vars]
            self.g_updates = Optimizer.apply_gradients(
                self.capped_gvs, global_step=self.global_step)

    return self._graph
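# ---------------------------------------------------------------------------
# Illustration (standalone sketch, not part of the model): the history branch
# above folds the turn axis into the batch axis so that one shared attention
# pass covers every history turn, then unfolds the result. The same reshape
# round trip in isolation, with static shapes assumed known:
# ---------------------------------------------------------------------------
import tensorflow as tf

def fold_unfold(x):
    """x: [batch, turns, words, emb] -> [batch*turns, words, emb] -> back."""
    b, t, w, e = x.get_shape().as_list()
    folded = tf.reshape(x, [-1, w, e])       # merge batch and turn axes
    # ... per-sequence attention would run here on `folded` ...
    return tf.reshape(folded, [b, t, w, e])  # restore the original shape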
def build_graph(self):
    with self._graph.as_default():
        rand_seed = self._conf['rand_seed']
        tf.set_random_seed(rand_seed)

        # word embedding
        if self._word_embedding_init is not None:
            word_embedding_initializer = tf.constant_initializer(
                self._word_embedding_init)
        else:
            word_embedding_initializer = tf.random_normal_initializer(
                stddev=0.1)
        self._word_embedding = tf.get_variable(
            name='word_embedding',
            shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
            dtype=tf.float32,
            initializer=word_embedding_initializer,
            trainable=True)

        batch_size = None

        # define placeholders
        self.turns1 = tf.placeholder(
            tf.int32,
            shape=[
                batch_size, self._conf["max_turn_num"],
                self._conf["max_turn_len"]
            ],
            name="turns1")
        self.tt_turns_len1 = tf.placeholder(
            tf.int32, shape=[batch_size], name="tt_turns_len1")
        self.every_turn_len1 = tf.placeholder(
            tf.int32,
            shape=[batch_size, self._conf["max_turn_num"]],
            name="every_turn_len1")
        self.response = tf.placeholder(
            tf.int32,
            shape=[batch_size, self._conf["max_turn_len"]],
            name="response")
        self.response_len = tf.placeholder(
            tf.int32, shape=[batch_size], name="response_len")
        self.keep_rate = tf.placeholder(tf.float32, [], name="keep_rate")
        self.label = tf.placeholder(tf.float32, shape=[batch_size])

        # ==================================== Building Model =============================
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
              "Starting to build model")

        # SMN
        if self.cr_model == "SMN":
            input_x = self.turns1
            input_y = self.response
            with tf.variable_scope('model_cr_smn'):
                final_info_cr = smn_model(
                    input_x, None, input_y, None, self._word_embedding,
                    self.keep_rate, self._conf,
                    x_len=self.every_turn_len1, y_len=self.response_len)
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        # DAM
        elif self.cr_model == "DAM":
            input_x = self.turns1
            input_y = self.response
            with tf.variable_scope('model_cr_dam'):
                final_info_cr = dam_model(
                    input_x, None, input_y, None, self._word_embedding,
                    self.keep_rate, self._conf,
                    x_len=self.every_turn_len1, y_len=self.response_len)
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        # MSN
        elif self.cr_model == "MSN":
            input_x = self.turns1
            input_y = self.response
            with tf.variable_scope('model_cr_msn'):
                final_info_cr, self.final_score = msn_model(
                    input_x, None, input_y, None, self._word_embedding,
                    self.keep_rate, self._conf,
                    x_len=self.every_turn_len1, y_len=self.response_len)
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        # ESIM
        elif self.cr_model == "ESIM":
            input_x = tf.reshape(
                self.turns1,
                [-1, self._conf["max_turn_num"] * self._conf["max_turn_len"]])
            input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                            self._conf["max_turn_len"])
            input_x_mask = tf.reshape(
                input_x_mask,
                [-1, self._conf["max_turn_num"] * self._conf["max_turn_len"]])
            input_y = self.response
            input_y_mask = tf.sequence_mask(self.response_len,
                                            self._conf["max_turn_len"])
            with tf.variable_scope('model_cr_esim'):
                final_info_cr = esim_model(
                    input_x, input_x_mask, input_y, input_y_mask,
                    self._word_embedding, self.keep_rate)
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        # IOI
        elif self.cr_model == "IOI":
            input_x = tf.reshape(
                self.turns1,
                [-1, self._conf["max_turn_num"] * self._conf["max_turn_len"]])
            input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                            self._conf["max_turn_len"])
            input_x_mask = tf.reshape(
                input_x_mask,
                [-1, self._conf["max_turn_num"] * self._conf["max_turn_len"]])
            input_y = self.response
            input_y_mask = tf.sequence_mask(self.response_len,
                                            self._conf["max_turn_len"])
            with tf.variable_scope('model_cr_ioi'):
                final_info_cr, final_info_cr_ioi = ioi_model(
                    input_x, input_x_mask, input_y, input_y_mask,
                    self._word_embedding, self.keep_rate, self._conf)
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())

        # ==================================== Calculating Loss =============================
        self.trainops = {"cr": dict()}
        loss_input = final_info_cr

        for loss_type in ["cr"]:
            with tf.variable_scope('loss_' + loss_type):
                if self.cr_model == "IOI":
                    # one loss per IOI layer, weighted by depth
                    loss_list = []
                    logits_list = []
                    for i, j in enumerate(final_info_cr_ioi):
                        with tf.variable_scope("loss" + str(i)):
                            loss_per, logits_per = layers.loss(j, self.label)
                            loss_list.append(loss_per)
                            logits_list.append(logits_per)
                    self.trainops[loss_type]["loss"] = sum(
                        [((idx + 1) / self._conf["ioi_layer_num"]) * item
                         for idx, item in enumerate(loss_list)])
                    self.trainops[loss_type]["logits"] = sum(logits_list)
                else:
                    (self.trainops[loss_type]["loss"],
                     self.trainops[loss_type]["logits"]) = layers.loss(
                         final_info_cr, self.label)

                self.trainops[loss_type]["global_step"] = tf.Variable(
                    0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.trainops[loss_type]["learning_rate"] = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.trainops[loss_type]["global_step"],
                    decay_steps=self._conf["decay_step"],
                    decay_rate=self._conf["decay_rate"],
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(
                    self.trainops[loss_type]["learning_rate"])
                self.trainops[loss_type]["optimizer"] = Optimizer.minimize(
                    self.trainops[loss_type]["loss"])

                self.trainops[loss_type]["grads_and_vars"] = \
                    Optimizer.compute_gradients(
                        self.trainops[loss_type]["loss"])
                self.trainops[loss_type]["capped_gvs"] = [
                    (tf.clip_by_value(grad, -5, 5), var)
                    for grad, var in self.trainops[loss_type]["grads_and_vars"]
                    if grad is not None
                ]
                self.trainops[loss_type]["g_updates"] = \
                    Optimizer.apply_gradients(
                        self.trainops[loss_type]["capped_gvs"],
                        global_step=self.trainops[loss_type]["global_step"])

        self.all_variables = tf.global_variables()
        self.init = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=self._conf["max_to_keep"])
        self.all_operations = self._graph.get_operations()

    return self._graph
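# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the training ops above are grouped per
# loss type in self.trainops, so a caller selects a loss by key. `sess`,
# `net`, and `feed` are assumptions for this sketch.
# ---------------------------------------------------------------------------
def run_train_step(sess, net, feed, loss_type="cr"):
    ops = net.trainops[loss_type]
    _, loss, step = sess.run(
        [ops["g_updates"], ops["loss"], ops["global_step"]], feed_dict=feed)
    return loss, step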
def build_graph(self):
    with self._graph.as_default():
        rand_seed = self._conf['rand_seed']
        tf.set_random_seed(rand_seed)

        # word embedding
        if self._word_embedding_init is not None:
            word_embedding_initializer = tf.constant_initializer(
                self._word_embedding_init)
        else:
            word_embedding_initializer = tf.random_normal_initializer(
                stddev=0.1)
        self._word_embedding = tf.get_variable(
            name='word_embedding',
            shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
            dtype=tf.float32,
            initializer=word_embedding_initializer,
            trainable=True)

        batch_size = None
        initializer_opt = tf.contrib.layers.variance_scaling_initializer(
            factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32)
        # the assignment below overrides the variance-scaling initializer
        initializer_opt = tf.truncated_normal_initializer(stddev=0.02)

        self.turns_sess_num = self._conf["max_turn_num_hf"] * 2 + 1
        self.turns_q_num = self._conf["max_turn_num"]

        # define placeholders
        self.turns1 = tf.placeholder(
            tf.int32,
            shape=[
                batch_size, self._conf["max_turn_num"],
                self._conf["max_turn_len"]
            ],
            name="turns1")
        self.tt_turns_len1 = tf.placeholder(
            tf.int32, shape=[batch_size], name="tt_turns_len1")
        self.every_turn_len1 = tf.placeholder(
            tf.int32,
            shape=[batch_size, self._conf["max_turn_num"]],
            name="every_turn_len1")
        self.turns2 = tf.placeholder(
            tf.int32,
            shape=[
                batch_size, self._conf["max_turn_num_hf"],
                self._conf["max_turn_len"]
            ],
            name="turns2")
        self.tt_turns_len2 = tf.placeholder(
            tf.int32, shape=[batch_size], name="tt_turns_len2")
        self.every_turn_len2 = tf.placeholder(
            tf.int32,
            shape=[batch_size, self._conf["max_turn_num_hf"]],
            name="every_turn_len2")
        self.turnsf = tf.placeholder(
            tf.int32,
            shape=[
                batch_size, self._conf["max_turn_num_hf"],
                self._conf["max_turn_len"]
            ],
            name="turnsf")
        self.tt_turns_lenf = tf.placeholder(
            tf.int32, shape=[batch_size], name="tt_turns_lenf")
        self.every_turn_lenf = tf.placeholder(
            tf.int32,
            shape=[batch_size, self._conf["max_turn_num_hf"]],
            name="every_turn_lenf")
        self.response = tf.placeholder(
            tf.int32,
            shape=[batch_size, self._conf["max_turn_len"]],
            name="response")
        self.response_len = tf.placeholder(
            tf.int32, shape=[batch_size], name="response_len")
        self.turnsa = tf.placeholder(
            tf.int32,
            shape=[
                batch_size, self._conf["max_turn_len"] * self.turns_sess_num
            ],
            name="turnsa")
        self.turnsa_len = tf.placeholder(
            tf.int32, shape=[batch_size], name="turnsa_len")
        self.turnsq = tf.placeholder(
            tf.int32,
            shape=[
                batch_size, self._conf["max_turn_len"] * self.turns_q_num
            ],
            name="turnsq")
        self.turnsq_len = tf.placeholder(
            tf.int32, shape=[batch_size], name="turnsq_len")
        self.keep_rate = tf.placeholder(tf.float32, [], name="keep_rate")
        self.turns_sess = tf.placeholder(
            tf.int32,
            shape=[
                batch_size, self._conf["max_turn_num_sess"],
                self._conf["max_turn_len"]
            ],
            name="turns_sess")
        self.tt_turns_len_sess = tf.placeholder(
            tf.int32, shape=[batch_size], name="tt_turns_len_sess")
        self.every_turn_len_sess = tf.placeholder(
            tf.int32,
            shape=[batch_size, self._conf["max_turn_num_sess"]],
            name="every_turn_len_sess")
        self.label = tf.placeholder(tf.float32, shape=[batch_size])

        # ==================================== CS Model =============================
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
              "Starting to build CS model")

        input_x = self.turns1
        input_x_len = self.every_turn_len1
        input_x_mask = tf.sequence_mask(input_x_len,
                                        self._conf["max_turn_len"])

        input_xf = self.turnsf
        input_x_lenf = self.every_turn_lenf
        input_xf = tf.concat(
            [tf.expand_dims(self.response, axis=1), input_xf], axis=1)
        input_x_lenf = tf.concat(
            [input_x_lenf, tf.expand_dims(self.response_len, axis=1)], axis=1)
        input_x_maskf = tf.sequence_mask(input_x_lenf,
                                         self._conf["max_turn_len"])

        input_x2 = self.turns2
        input_x_len2 = self.every_turn_len2
        input_x2 = tf.concat(
            [input_x2, tf.expand_dims(self.response, axis=1)], axis=1)
        input_x_len2 = tf.concat(
            [input_x_len2, tf.expand_dims(self.response_len, axis=1)], axis=1)
        input_x_mask2 = tf.sequence_mask(input_x_len2,
                                         self._conf["max_turn_len"])

        with tf.variable_scope('model_crdms'):
            (final_info_cs, final_info_css, self.all_mem_weight_dict,
             self.save_dynamic_dict, self.sim_ori) = cs_model(
                 input_x, input_x_mask, input_x_len,
                 input_x2, input_x_mask2, input_x_len2,
                 input_xf, input_x_maskf, input_x_lenf,
                 self._word_embedding, self._conf)
            final_info_cs = tf.layers.dense(
                final_info_cs, 50,
                kernel_initializer=tf.contrib.layers.xavier_initializer())

        # ==================================== Calculate Loss =============================
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
              "Starting to calculate loss")

        self.trainops = {"cs": dict()}
        all_loss_inouts = [
            ["cs", final_info_cs],
        ]
        for loss_type, loss_input in all_loss_inouts:
            if loss_type != self._conf["train_type"] and loss_type != "cr":
                continue
            with tf.variable_scope('loss_' + loss_type):
                (self.trainops[loss_type]["loss"],
                 self.trainops[loss_type]["logits"]) = layers.loss(
                     loss_input, self.label)

                use_loss_weight = True
                loss_added, logits_added = [], []
                num_loss = len(final_info_css)
                for i, j in enumerate(final_info_css):
                    with tf.variable_scope("losscc" + str(i)):
                        loss_per, logits_per = layers.loss(j, self.label)
                        # down-weight the last four auxiliary losses
                        if num_loss == 6 and i >= 2 and use_loss_weight:
                            loss_per = loss_per * 0.5
                            logits_per = logits_per * 0.5
                        loss_added.append(loss_per)
                        logits_added.append(logits_per)
                if num_loss == 6 and use_loss_weight:
                    num_loss = num_loss - 2
                self.trainops[loss_type]["loss"] += sum(loss_added) / num_loss
                self.trainops[loss_type]["logits"] += sum(
                    logits_added) / num_loss

                self.trainops[loss_type]["global_step"] = tf.Variable(
                    0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.trainops[loss_type]["learning_rate"] = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.trainops[loss_type]["global_step"],
                    decay_steps=self._conf["decay_step"],
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(
                    self.trainops[loss_type]["learning_rate"])
                self.trainops[loss_type]["optimizer"] = Optimizer.minimize(
                    self.trainops[loss_type]["loss"])

                self.trainops[loss_type]["grads_and_vars"] = \
                    Optimizer.compute_gradients(
                        self.trainops[loss_type]["loss"])
                self.trainops[loss_type]["capped_gvs"] = [
                    (tf.clip_by_value(grad, -1, 1), var)
                    for grad, var in self.trainops[loss_type]["grads_and_vars"]
                    if grad is not None
                ]
                self.trainops[loss_type]["g_updates"] = \
                    Optimizer.apply_gradients(
                        self.trainops[loss_type]["capped_gvs"],
                        global_step=self.trainops[loss_type]["global_step"])

        self.all_variables = tf.global_variables()
        self.init = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=self._conf["max_to_keep"])
        self.all_operations = self._graph.get_operations()

    return self._graph
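# ---------------------------------------------------------------------------
# Illustration (standalone): tf.sequence_mask, used above to turn the
# per-turn length tensors into boolean padding masks. With 2-D lengths the
# output gains a trailing maxlen axis:
# ---------------------------------------------------------------------------
import tensorflow as tf

lengths = tf.constant([[2, 3], [1, 0]])     # [batch=2, turns=2]
mask = tf.sequence_mask(lengths, maxlen=4)  # [2, 2, 4], dtype=bool
with tf.Session() as demo_sess:
    print(demo_sess.run(mask))
    # [[[ True  True False False]
    #   [ True  True  True False]]
    #  [[ True False False False]
    #   [False False False False]]]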
def build_graph(self):
    with self._graph.as_default():
        rand_seed = self._conf['rand_seed']
        tf.set_random_seed(rand_seed)

        # word embedding
        if self._word_embedding_init is not None:
            word_embedding_initializer = tf.constant_initializer(
                self._word_embedding_init)
        else:
            word_embedding_initializer = tf.random_normal_initializer(
                stddev=0.1)
        self._word_embedding = tf.get_variable(
            name='word_embedding',
            shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
            dtype=tf.float32,
            initializer=word_embedding_initializer)

        # define placeholders
        self.turns1 = tf.placeholder(
            tf.int32,
            shape=[
                self._conf["batch_size"], self._conf["max_turn_num"],
                self._conf["max_turn_len"]
            ],
            name="turns1")
        self.tt_turns_len1 = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]], name="tt_turns_len1")
        self.every_turn_len1 = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_num"]],
            name="every_turn_len1")
        self.turns2 = tf.placeholder(
            tf.int32,
            shape=[
                self._conf["batch_size"], self._conf["max_turn_num"],
                self._conf["max_turn_len"]
            ],
            name="turns2")
        self.tt_turns_len2 = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]], name="tt_turns_len2")
        self.every_turn_len2 = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_num"]],
            name="every_turn_len2")
        self.response = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_len"]],
            name="response")
        self.response_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]], name="response_len")
        self.keep_rate = tf.placeholder(tf.float32, [], name="keep_rate")
        self.label = tf.placeholder(
            tf.float32, shape=[self._conf["batch_size"]])

        self.turns1_e = tf.nn.embedding_lookup(self._word_embedding,
                                               self.turns1)
        self.turns2_e = tf.nn.embedding_lookup(self._word_embedding,
                                               self.turns2)
        self.response_e = tf.nn.embedding_lookup(self._word_embedding,
                                                 self.response)

        # SMN
        if self.cr_model == "SMN":
            input_x = self.turns1
            input_y = self.response
            final_info_cr = smn_model(
                input_x, None, input_y, None, self._word_embedding,
                self.keep_rate, self._conf,
                x_len=self.every_turn_len1, y_len=self.response_len)
            with tf.variable_scope('final_smn_mlp_cr'):
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        # DAM
        elif self.cr_model == "DAM":
            input_x = self.turns1
            input_y = self.response
            final_info_cr = dam_model(
                input_x, None, input_y, None, self._word_embedding,
                self.keep_rate, self._conf,
                x_len=self.every_turn_len1, y_len=self.response_len)
            with tf.variable_scope('final_esim_mlp_cr'):
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        # IOI
        elif self.cr_model == "IOI":
            input_x = tf.reshape(self.turns1, [self._conf["batch_size"], -1])
            input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                            self._conf["max_turn_len"])
            input_x_mask = tf.reshape(input_x_mask,
                                      [self._conf["batch_size"], -1])
            input_x2 = tf.reshape(self.turns2, [self._conf["batch_size"], -1])
            input_x_mask2 = tf.sequence_mask(self.every_turn_len2,
                                             self._conf["max_turn_len"])
            input_x_mask2 = tf.reshape(input_x_mask2,
                                       [self._conf["batch_size"], -1])
            input_y = self.response
            input_y_mask = tf.sequence_mask(self.response_len,
                                            self._conf["max_turn_len"])
            final_info_cr, final_info_cr_ioi = ioi_model(
                input_x, input_x_mask, input_y, input_y_mask,
                self._word_embedding, self.keep_rate, self._conf)
            with tf.variable_scope('final_esim_mlp_cr'):
                final_info_cr = tf.layers.dense(
                    final_info_cr, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())

        if self.cc_model == "cc":
            input_x = self.turns1
            input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                            self._conf["max_turn_len"])
            # input_x_mask = tf.reshape(
            #     input_x_mask,
            #     [-1, self._conf["max_turn_num"] * self._conf["max_turn_len"]])
            input_x_len = self.every_turn_len1
            input_x2 = self.turns2
            input_x_mask2 = tf.sequence_mask(self.every_turn_len2,
                                             self._conf["max_turn_len"])
            # input_x_mask2 = tf.reshape(
            #     input_x_mask2,
            #     [-1, self._conf["max_turn_num_s"] * self._conf["max_turn_len"]])
            input_x_len2 = self.every_turn_len2
            final_info_cc, self.att_weight_print = cc_model(
                input_x, input_x_mask, input_x_len,
                input_x2, input_x_mask2, input_x_len2,
                self._word_embedding, self._conf, con_c=self.con_c)
            with tf.variable_scope('final_mlp_cc'):
                final_info_cc = tf.layers.dense(
                    final_info_cc, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        elif self.cc_model == "onesent":
            input_x = self.turns1
            input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                            self._conf["max_turn_len"])
            input_x_len = self.every_turn_len1
            input_x2 = self.turns2
            input_x_mask2 = tf.sequence_mask(self.every_turn_len2,
                                             self._conf["max_turn_len"])
            input_x_len2 = self.every_turn_len2
            final_info_cc = cc_model(
                input_x, input_x_mask, input_x_len,
                input_x2, input_x_mask2, input_x_len2,
                self._word_embedding, self._conf, con_c=True)
            with tf.variable_scope('final_mlp_onesent_cc'):
                final_info_cc = tf.layers.dense(
                    final_info_cc, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        elif self.cc_model == "bimpm":
            input_x = tf.reshape(self.turns1, [self._conf["batch_size"], -1])
            input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                            self._conf["max_turn_len"])
            input_x_mask = tf.reshape(input_x_mask,
                                      [self._conf["batch_size"], -1])
            input_y = tf.reshape(self.turns2, [self._conf["batch_size"], -1])
            input_y_mask = tf.sequence_mask(self.every_turn_len2,
                                            self._conf["max_turn_len"])
            input_y_mask = tf.reshape(input_y_mask,
                                      [self._conf["batch_size"], -1])
            with tf.variable_scope('final_bimpm_cc_cr'):
                final_info_cc = bimpm_model(
                    input_x, input_x_mask, input_y, input_y_mask,
                    self._word_embedding, self.keep_rate)
                final_info_cc = tf.layers.dense(
                    final_info_cc, 50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())

        # loss and train
        with tf.variable_scope('loss_cc'):
            self.loss_cc, self.logits_cc = layers.loss(
                final_info_cc, self.label)
            self.global_step_cc = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate_cc = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step_cc,
                decay_steps=5000,
                decay_rate=0.9,
                staircase=True)
            Optimizer_cc = tf.train.AdamOptimizer(self.learning_rate_cc)
            self.optimizer_cc = Optimizer_cc.minimize(self.loss_cc)
            # self.all_operations = self._graph.get_operations()
            self.grads_and_vars_cc = Optimizer_cc.compute_gradients(
                self.loss_cc)
            self.capped_gvs_cc = [(tf.clip_by_value(grad, -1, 1), var)
                                  for grad, var in self.grads_and_vars_cc
                                  if grad is not None]
            self.g_updates_cc = Optimizer_cc.apply_gradients(
                self.capped_gvs_cc, global_step=self.global_step_cc)

        with tf.variable_scope('loss_cr'):
            if self.cr_model == "IOI":
                loss_list = []
                logits_list = []
                for i, j in enumerate(final_info_cr_ioi):
                    with tf.variable_scope("loss" + str(i)):
                        loss_per, logits_per = layers.loss(j, self.label)
                        loss_list.append(loss_per)
                        logits_list.append(logits_per)
                self.loss_cr = sum([((idx + 1) / 7.0) * item
                                    for idx, item in enumerate(loss_list)])
                self.logits_cr = sum(logits_list)
            else:
                self.loss_cr, self.logits_cr = layers.loss(
                    final_info_cr, self.label)
            self.global_step_cr = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate_cr = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step_cr,
                decay_steps=10000,
                decay_rate=0.9,
                staircase=True)
            Optimizer_cr = tf.train.AdamOptimizer(self.learning_rate_cr)
            self.optimizer_cr = Optimizer_cr.minimize(self.loss_cr)
            self.grads_and_vars_cr = Optimizer_cr.compute_gradients(
                self.loss_cr)
            self.capped_gvs_cr = [(tf.clip_by_value(grad, -1, 1), var)
                                  for grad, var in self.grads_and_vars_cr
                                  if grad is not None]
            self.g_updates_cr = Optimizer_cr.apply_gradients(
                self.capped_gvs_cr, global_step=self.global_step_cr)

        with tf.variable_scope('loss_ccr'):
            if self._conf["fusion"] == "fusion":
                # learned gate: final = g * cc + (1 - g) * cr
                final_att = tf.concat([final_info_cc, final_info_cr], axis=1)
                final_att = tf.layers.dense(
                    final_att, 1,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="nosave")
                final_att = tf.nn.sigmoid(final_att)
                self.final_att_print = final_att
                final_info_ccr = final_info_cc * final_att + \
                    final_info_cr * (1 - final_att)
            elif self._conf["fusion"] == "con":
                # print(final_info_cr.shape)
                final_att = tf.concat([final_info_cc, final_info_cr], axis=1)
                final_att = tf.layers.dense(
                    final_att, final_info_cr.shape[-1],
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="nosave")
                final_att = tf.nn.sigmoid(final_att)
                self.final_att_print = final_att
                final_info_ccr = tf.concat([final_info_cr, final_info_cc],
                                           axis=1)
            elif self._conf["fusion"] == "none":
                final_info_ccr = final_info_cc + final_info_cr
            else:
                assert False

            self.loss_ccr, self.logits_ccr = layers.loss(
                final_info_ccr, self.label)
            self.loss_ccr += self.loss_cr
            self.logits_ccr += self.logits_cr
            self.global_step_ccr = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate_ccr = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step_ccr,
                decay_steps=5000,
                decay_rate=0.9,
                staircase=True)
            Optimizer_ccr = tf.train.AdamOptimizer(self.learning_rate_ccr)
            self.optimizer_ccr = Optimizer_ccr.minimize(self.loss_ccr)
            self.grads_and_vars_ccr = Optimizer_ccr.compute_gradients(
                self.loss_ccr)
            self.capped_gvs_ccr = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars_ccr
                                   if grad is not None]
            self.g_updates_ccr = Optimizer_ccr.apply_gradients(
                self.capped_gvs_ccr, global_step=self.global_step_ccr)

        self.all_variables = tf.global_variables()
        self.init = tf.global_variables_initializer()
        self.saver_load = tf.train.Saver(
            max_to_keep=self._conf["max_to_keep"])
        self.saver_save = self.saver_load
        self.all_operations = self._graph.get_operations()

    return self._graph
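# ---------------------------------------------------------------------------
# Illustration (standalone numpy sketch): the "fusion" branch above computes
# a learned gate g = sigmoid(W [cc; cr]) and blends the two feature vectors
# as g * cc + (1 - g) * cr. W is random here, whereas the model learns it.
# ---------------------------------------------------------------------------
import numpy as np

def gated_fusion(cc, cr, w):
    """cc, cr: [batch, d]; w: [2d, 1]; returns fused [batch, d]."""
    gate_logit = np.concatenate([cc, cr], axis=1) @ w  # [batch, 1]
    g = 1.0 / (1.0 + np.exp(-gate_logit))              # sigmoid
    return g * cc + (1.0 - g) * cr

cc = np.random.randn(4, 50)
cr = np.random.randn(4, 50)
w = np.random.randn(100, 1)
print(gated_fusion(cc, cr, w).shape)  # (4, 50)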
def build_graph(self):
    with self._graph.as_default():
        if self._conf['rand_seed'] is not None:
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)
            print('set tf random seed: %s' % self._conf['rand_seed'])

        # word embedding
        if self._word_embedding_init is not None:
            word_embedding_initializer = tf.constant_initializer(
                self._word_embedding_init)
        else:
            word_embedding_initializer = tf.random_normal_initializer(
                stddev=0.1)
        self._word_embedding = tf.get_variable(
            name='word_embedding',
            shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
            dtype=tf.float32,
            initializer=word_embedding_initializer)

        # define placeholders
        self.turns = tf.placeholder(
            tf.int32,
            shape=[
                self._conf["batch_size"], self._conf["max_turn_num"],
                self._conf["max_turn_len"]
            ])
        self.tt_turns_len = tf.placeholder(  # turn_num
            tf.int32, shape=[self._conf["batch_size"]])
        self.every_turn_len = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_num"]])
        self.turns_intent = tf.placeholder(
            tf.float32,
            shape=[
                self._conf["batch_size"], self._conf["max_turn_num"],
                self._conf["intent_size"]
            ])
        self.response = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_len"]])
        self.response_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]])
        self.response_intent = tf.placeholder(
            tf.float32,
            shape=[self._conf["batch_size"], self._conf["intent_size"]])
        self.label = tf.placeholder(
            tf.float32, shape=[self._conf["batch_size"]])

        # define operations
        # response part
        Hr = tf.nn.embedding_lookup(
            self._word_embedding,
            self.response)  # [batch_size, max_turn_len, embed_size]

        if self._conf['is_positional'] and self._conf['stack_num'] > 0:
            with tf.variable_scope('positional'):
                Hr = op.positional_encoding_vector(Hr, max_timescale=10)
        Hr_stack = [Hr]  # 1st element of Hr_stack is the original embedding

        # self attention over the response
        for index in range(self._conf['stack_num']):
            with tf.variable_scope('self_stack_' + str(index)):
                # [batch, max_turn_len, emb_size]
                Hr = layers.block(  # attentive module
                    Hr, Hr, Hr,
                    Q_lengths=self.response_len,
                    K_lengths=self.response_len)
                # Hr is still [batch_size, max_turn_len, embed_size]
                Hr_stack.append(Hr)
        # after self attention, len(Hr_stack) == 1 + stack_num

        # context part
        # a list of length max_turn_num, every element is a tensor
        # with shape [batch, max_turn_len]
        list_turn_t = tf.unstack(self.turns, axis=1)
        list_turn_length = tf.unstack(self.every_turn_len, axis=1)
        list_turn_intent = tf.unstack(self.turns_intent, axis=1)

        sim_turns = []
        attention_turns = []  # intent based attention on each turn
        # for every turn_t calculate matching vector
        turn_index = 0
        for turn_t, t_turn_length, t_intent in zip(
                list_turn_t, list_turn_length, list_turn_intent):
            print('current turn_index : ', turn_index)
            turn_index += 1
            Hu = tf.nn.embedding_lookup(
                self._word_embedding, turn_t)  # [batch, max_turn_len, emb_size]

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional', reuse=True):
                    Hu = op.positional_encoding_vector(Hu, max_timescale=10)
            Hu_stack = [Hu]  # 1st element of Hu_stack is the original embedding

            # self attention over the context turn
            for index in range(self._conf['stack_num']):
                with tf.variable_scope('self_stack_' + str(index), reuse=True):
                    # [batch, max_turn_len, emb_size]
                    Hu = layers.block(  # attentive module
                        Hu, Hu, Hu,
                        Q_lengths=t_turn_length,
                        K_lengths=t_turn_length)
                    Hu_stack.append(Hu)
            # after self attention, len(Hu_stack) == 1 + stack_num

            # cross attention
            r_a_t_stack = []
            t_a_r_stack = []
            for index in range(self._conf['stack_num'] + 1):
                with tf.variable_scope('t_attend_r_' + str(index)):
                    try:
                        # [batch, max_turn_len, emb_size]
                        t_a_r = layers.block(  # attentive module
                            Hu_stack[index], Hr_stack[index], Hr_stack[index],
                            Q_lengths=t_turn_length,
                            K_lengths=self.response_len)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        t_a_r = layers.block(
                            Hu_stack[index], Hr_stack[index], Hr_stack[index],
                            Q_lengths=t_turn_length,
                            K_lengths=self.response_len)

                with tf.variable_scope('r_attend_t_' + str(index)):
                    try:
                        # [batch, max_turn_len, emb_size]
                        r_a_t = layers.block(  # attentive module
                            Hr_stack[index], Hu_stack[index], Hu_stack[index],
                            Q_lengths=self.response_len,
                            K_lengths=t_turn_length)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        r_a_t = layers.block(
                            Hr_stack[index], Hu_stack[index], Hu_stack[index],
                            Q_lengths=self.response_len,
                            K_lengths=t_turn_length)

                t_a_r_stack.append(t_a_r)
                r_a_t_stack.append(r_a_t)

            # 3D aggregation
            t_a_r_stack.extend(
                Hu_stack)  # half from self-attention, half from cross-attention
            r_a_t_stack.extend(
                Hr_stack)  # half from self-attention, half from cross-attention
            # after extend, len(t_a_r_stack) == 2 * (stack_num + 1)

            t_a_r = tf.stack(t_a_r_stack, axis=-1)
            r_a_t = tf.stack(r_a_t_stack, axis=-1)
            # after stack, t_a_r and r_a_t are
            # [batch, max_turn_len, embed_size, 2*(stack_num+1)]

            with tf.variable_scope('intent_based_attention',
                                   reuse=tf.AUTO_REUSE):
                # share parameters across different turns
                # there are 3 different ways to implement intent based
                # attention; implement all three variations and compare their
                # effectiveness as model ablation analysis
                # let I_u_t and I_r_k be intent vectors in [12, 1]
                # 1. dot: w * [I_u_t, I_r_k], where w is [24, 1]
                # 2. bilinear: I_u_t' * w * I_r_k, where w is [12, 12]
                # 3. outprod: I_u_t * I_r_k' -> [12, 12] outer product ->
                #    flatten to [144, 1] -> w * outprod, where w is [1, 144]
                attention_logits = layers.attention_intent(
                    t_intent, self.response_intent,
                    self._conf['intent_attention_type'])
                attention_turns.append(attention_logits)

            # calculate similarity matrix
            with tf.variable_scope('similarity'):
                # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                # divide by sqrt(200) to prevent gradient explosion
                # A_biks * B_bjks -> C_bijs: k indexes embed_size and is
                # eliminated by the dot product inside einsum
                sim = tf.einsum('biks,bjks->bijs', t_a_r,
                                r_a_t) / tf.sqrt(200.0)

            # sim is appended as-is here; the intent-based attention weights
            # are applied after stacking (see the einsum below) to produce
            # the weighted stack
            sim_turns.append(sim)

        attention_logits = tf.stack(attention_turns,
                                    axis=1)  # [batch, max_turn_num]
        print('[attention_logits] after stack attention_logits.shape: %s' %
              attention_logits.shape)
        # add mask in attention following the way in BERT
        # real turn_num is in self.tt_turns_len [batch]
        # returns a mask tensor with shape [batch, conf['max_turn_num']]
        attention_mask = tf.sequence_mask(
            self.tt_turns_len, self._conf['max_turn_num'], dtype=tf.float32)
        print('[attention_mask] attention_mask.shape: %s' %
              attention_mask.shape)
        # Since attention_mask is 1.0 for positions we want to attend and 0.0
        # for masked positions, this operation creates a tensor which is 0.0
        # for positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - attention_mask) * -10000.0
        # Since we are adding it to the raw scores before the softmax, this
        # is effectively the same as removing these positions entirely.
        attention_logits += adder
        attention = tf.nn.softmax(
            attention_logits)  # softmax along dim=-1, [batch, max_turn_num]
        print('[attention] attention.shape: %s' % attention.shape)
        self.attention = attention  # printed later for visualization

        # cnn and aggregation by a 3D CNN layer; shapes for reference:
        # [3d cnn aggregation] sim shape: (32, 9, 180, 180, 10)
        # conv_0 shape: (32, 9, 180, 180, 16)
        # pooling_0 shape: (32, 3, 60, 60, 16)
        # conv_1 shape: (32, 3, 60, 60, 16)
        # pooling_1 shape: (32, 1, 20, 20, 16)
        # [3d cnn aggregation] final_info: (32, 6400)  # [batch, feature_size]

        # [batch, max_turn_num, max_turn_len, max_turn_len, 2*(stack_num+1)]
        sim = tf.stack(sim_turns, axis=1)
        # multiply sim by the intent attention score of each turn
        sim = tf.einsum('bijks,bi->bijks', sim, attention)
        print('[3d cnn aggregation] sim shape: %s' % sim.shape)
        with tf.variable_scope('cnn_aggregation'):
            final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'],
                                       self._conf['cnn_3d_oc1'])
            # for udc
            # final_info = layers.CNN_3d(sim, 32, 16)
            # for douban
            # final_info = layers.CNN_3d(sim, 16, 16)
        print('[3d cnn aggregation] final_info: %s' % final_info.shape)

        # loss and train
        with tf.variable_scope('loss'):
            self.loss, self.logits = layers.loss(final_info, self.label)

            self.global_step = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step,
                decay_steps=400,
                decay_rate=0.9,
                staircase=True)

            Optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.optimizer = Optimizer.minimize(
                self.loss, global_step=self.global_step)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(
                max_to_keep=self._conf["max_to_keep"])
            self.all_variables = tf.global_variables()
            self.all_operations = self._graph.get_operations()
            self.grads_and_vars = Optimizer.compute_gradients(self.loss)

            for grad, var in self.grads_and_vars:
                if grad is None:  # flag variables that receive no gradient
                    print(var)

            self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                               for grad, var in self.grads_and_vars]
            self.g_updates = Optimizer.apply_gradients(
                self.capped_gvs, global_step=self.global_step)

    return self._graph
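# ---------------------------------------------------------------------------
# Illustration (standalone numpy check): the BERT-style additive mask above.
# Padded turns get a -10000.0 logit, so softmax assigns them ~0 weight.
# ---------------------------------------------------------------------------
import numpy as np

logits = np.array([2.0, 1.0, 0.5, 0.3])  # one example, max_turn_num = 4
mask = np.array([1.0, 1.0, 0.0, 0.0])    # only the first two turns are real
masked = logits + (1.0 - mask) * -10000.0
probs = np.exp(masked) / np.exp(masked).sum()
print(probs)  # ~[0.73, 0.27, 0.0, 0.0]: padded turns get no attention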
def create_network(self):
    mask_cache = dict() if self.use_mask_cache else None

    response_emb = fluid.layers.embedding(
        input=self.response,
        size=[self._vocab_size + 1, self._emb_size],
        is_sparse=self.use_sparse_embedding,
        param_attr=fluid.ParamAttr(
            name=self.word_emb_name,
            initializer=fluid.initializer.Normal(scale=0.1)))

    # response part
    Hr = response_emb
    Hr_stack = [Hr]

    for index in six.moves.xrange(self._stack_num):
        Hr = layers.block(
            name="response_self_stack" + str(index),
            query=Hr,
            key=Hr,
            value=Hr,
            d_key=self._emb_size,
            q_mask=self.response_mask,
            k_mask=self.response_mask,
            mask_cache=mask_cache)
        Hr_stack.append(Hr)

    # context part
    sim_turns = []
    for t in six.moves.xrange(self._max_turn_num):
        Hu = fluid.layers.embedding(
            input=self.turns_data[t],
            size=[self._vocab_size + 1, self._emb_size],
            is_sparse=self.use_sparse_embedding,
            param_attr=fluid.ParamAttr(
                name=self.word_emb_name,
                initializer=fluid.initializer.Normal(scale=0.1)))
        Hu_stack = [Hu]

        for index in six.moves.xrange(self._stack_num):
            # share parameters
            Hu = layers.block(
                name="turn_self_stack" + str(index),
                query=Hu,
                key=Hu,
                value=Hu,
                d_key=self._emb_size,
                q_mask=self.turns_mask[t],
                k_mask=self.turns_mask[t],
                mask_cache=mask_cache)
            Hu_stack.append(Hu)

        # cross attention
        r_a_t_stack = []
        t_a_r_stack = []
        for index in six.moves.xrange(self._stack_num + 1):
            t_a_r = layers.block(
                name="t_attend_r_" + str(index),
                query=Hu_stack[index],
                key=Hr_stack[index],
                value=Hr_stack[index],
                d_key=self._emb_size,
                q_mask=self.turns_mask[t],
                k_mask=self.response_mask,
                mask_cache=mask_cache)
            r_a_t = layers.block(
                name="r_attend_t_" + str(index),
                query=Hr_stack[index],
                key=Hu_stack[index],
                value=Hu_stack[index],
                d_key=self._emb_size,
                q_mask=self.response_mask,
                k_mask=self.turns_mask[t],
                mask_cache=mask_cache)
            t_a_r_stack.append(t_a_r)
            r_a_t_stack.append(r_a_t)

        t_a_r_stack.extend(Hu_stack)
        r_a_t_stack.extend(Hr_stack)

        if self.use_stack_op:
            t_a_r = fluid.layers.stack(t_a_r_stack, axis=1)
            r_a_t = fluid.layers.stack(r_a_t_stack, axis=1)
        else:
            for index in six.moves.xrange(len(t_a_r_stack)):
                t_a_r_stack[index] = fluid.layers.unsqueeze(
                    input=t_a_r_stack[index], axes=[1])
                r_a_t_stack[index] = fluid.layers.unsqueeze(
                    input=r_a_t_stack[index], axes=[1])
            t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
            r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)

        # sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
        sim = fluid.layers.matmul(
            x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0))
        sim_turns.append(sim)

    if self.use_stack_op:
        sim = fluid.layers.stack(sim_turns, axis=2)
    else:
        for index in six.moves.xrange(len(sim_turns)):
            sim_turns[index] = fluid.layers.unsqueeze(
                input=sim_turns[index], axes=[2])
        # sim shape: [batch_size, 2*(stack_num+1), max_turn_num,
        #             max_turn_len, max_turn_len]
        sim = fluid.layers.concat(input=sim_turns, axis=2)

    final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num)
    loss, logits = layers.loss(final_info, self.label)
    return loss, logits
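# ---------------------------------------------------------------------------
# Note (standalone numpy check): fluid.layers.matmul with transpose_y=True
# and alpha=1/sqrt(200) above computes the same scaled similarity as the
# tf.einsum(...) / tf.sqrt(200.0) used in the TensorFlow versions, one stack
# slice at a time:
# ---------------------------------------------------------------------------
import numpy as np

x = np.random.randn(2, 5, 8)  # [batch, len1, emb]
y = np.random.randn(2, 7, 8)  # [batch, len2, emb]
a = np.matmul(x, y.transpose(0, 2, 1)) / np.sqrt(200.0)
b = np.einsum('bik,bjk->bij', x, y) / np.sqrt(200.0)
print(np.allclose(a, b))  # True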
def build_graph(self):
    with self._graph.as_default():
        if self._conf['rand_seed'] is not None:
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)
            print('set tf random seed: %s' % self._conf['rand_seed'])

        # word embedding, loaded via an assign op from a placeholder
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self._word_embedding = tf.get_variable(
                'word_embedding',
                shape=(self._conf['vocab_size'], self._conf['emb_size']),
                dtype=tf.float32,
                trainable=False)
            self.emb_placeholder = tf.placeholder(
                tf.float32,
                shape=[self._conf['vocab_size'], self._conf['emb_size']])
            self.emb_init = self._word_embedding.assign(self.emb_placeholder)

        # define placeholders
        self.turns = tf.placeholder(  # context data
            tf.int32,
            shape=[
                None, self._conf["max_turn_num"], self._conf["max_turn_len"]
            ])
        self.tt_turns_len = tf.placeholder(  # utterance num of context
            tf.int32, shape=[None])
        self.every_turn_len = tf.placeholder(  # length of each utterance in context
            tf.int32, shape=[None, self._conf["max_turn_num"]])
        self.response = tf.placeholder(  # response data
            tf.int32, shape=[None, self._conf["max_turn_len"]])
        self.response_len = tf.placeholder(  # response len
            tf.int32, shape=[None])
        self.label = tf.placeholder(  # scalar label
            tf.float32, shape=[None])
        self.dropout_keep_prob = tf.placeholder(
            tf.float32, name="dropout_keep_prob")

        # define operations
        # build response embedding
        Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
        Hr = tf.nn.dropout(Hr, self.dropout_keep_prob)

        if self._conf['is_positional']:
            with tf.variable_scope('positional'):
                Hr = op.positional_encoding_vector(Hr, max_timescale=10)
        with tf.variable_scope('attention_cnn_block'):
            hr_conv_list = layers.agdr_block(
                Hr, self._conf['repeat_times'], self._conf['delation_list'],
                self._conf['dcnn_filter_width'], self._conf['dcnn_channel'],
                self.dropout_keep_prob)

        list_turn_t = tf.unstack(self.turns, axis=1)
        list_turn_length = tf.unstack(self.every_turn_len, axis=1)

        reuse = None
        sim_turns = []
        # for every turn_t, build embedding and calculate matching vector
        for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
            Hu = tf.nn.embedding_lookup(self._word_embedding, turn_t)
            Hu = tf.nn.dropout(Hu, self.dropout_keep_prob)

            if self._conf['is_positional']:
                with tf.variable_scope('positional', reuse=True):
                    Hu = op.positional_encoding_vector(Hu, max_timescale=10)

            # multi-level sim matrix of response and each utterance
            sim_matrix = [layers.Word_Sim(Hr, Hu)]

            with tf.variable_scope('attention_cnn_block', reuse=True):
                hu_conv_list = layers.agdr_block(
                    Hu, self._conf['repeat_times'],
                    self._conf['delation_list'],
                    self._conf['dcnn_filter_width'],
                    self._conf['dcnn_channel'], self.dropout_keep_prob)

            for index in range(len(hu_conv_list)):
                with tf.variable_scope('segment_sim'):
                    sim_matrix.append(
                        layers.Word_Sim(hr_conv_list[index],
                                        hu_conv_list[index]))

            sim_matrix = tf.stack(sim_matrix, axis=-1,
                                  name='one_matrix_stack')

            with tf.variable_scope('cnn_aggregation', reuse=tf.AUTO_REUSE):
                matching_vector = layers.CNN_2d(sim_matrix, 32, 16,
                                                self.dropout_keep_prob)
            if not reuse:
                reuse = True

            sim_turns.append(matching_vector)

        # aggregation with a gru
        sim = tf.stack(sim_turns, axis=1, name='matching_stack')
        with tf.variable_scope("sent_rnn"):
            sent_rnn_outputs, _ = layers.bigru_sequence(
                sim, 64, None, self.dropout_keep_prob)  # TODO: CHECK

        # attention at sentence level
        sent_atten_inputs = tf.concat(sent_rnn_outputs, 2)
        with tf.variable_scope("sent_atten"):
            rev_outs, alphas_sents = layers.intro_attention(
                sent_atten_inputs, 50)

        # loss and train
        with tf.variable_scope('loss'):
            self.loss, self.logits = layers.loss(
                rev_outs, self.label, is_clip=True)

            self.global_step = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step,
                decay_steps=5000,
                decay_rate=0.96,
                staircase=True)

            Optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.optimizer = Optimizer.minimize(
                self.loss, global_step=self.global_step)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(
                max_to_keep=self._conf["max_to_keep"])
            self.all_variables = tf.global_variables()
            self.all_operations = self._graph.get_operations()
            self.grads_and_vars = Optimizer.compute_gradients(self.loss)

            for grad, var in self.grads_and_vars:
                if grad is None:  # flag variables that receive no gradient
                    print(var)

            self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                               for grad, var in self.grads_and_vars]
            self.g_updates = Optimizer.apply_gradients(
                self.capped_gvs, global_step=self.global_step)

        # summaries
        grad_summaries = []
        for g, v in self.grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram(
                    "gradient/{}/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar(
                    "gradient/{}/sparsity".format(v.name),
                    tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        logit_summary = tf.summary.histogram("{}".format(self.logits.name),
                                             self.logits)

        # loss summaries
        loss_summary = tf.summary.scalar("loss", self.loss)

        # train and dev summaries
        self.train_summary_op = tf.summary.merge(
            [loss_summary, logit_summary, grad_summaries_merged])
        self.dev_summary_op = tf.summary.merge([loss_summary])

    return self._graph
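# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): consuming the merged summaries above with
# a tf.summary.FileWriter. `sess`, `net`, `feed`, and the log directory are
# assumptions for this sketch.
# ---------------------------------------------------------------------------
import tensorflow as tf

def train_step_with_summary(sess, net, feed, writer):
    _, loss, step, summary = sess.run(
        [net.g_updates, net.loss, net.global_step, net.train_summary_op],
        feed_dict=feed)
    writer.add_summary(summary, global_step=step)  # log for TensorBoard
    return loss

# writer = tf.summary.FileWriter('./logs', graph=sess.graph)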