def testSequenceLoss(self):
    self.config_default_values()
    with self.cached_session(use_gpu=True):
        average_loss_per_example = loss.sequence_loss(
            self.logits, self.targets, self.weights,
            average_across_timesteps=True,
            average_across_batch=True)
        res = self.evaluate(average_loss_per_example)
        self.assertAllClose(self.expected_loss, res)

        average_loss_per_sequence = loss.sequence_loss(
            self.logits, self.targets, self.weights,
            average_across_timesteps=False,
            average_across_batch=True)
        res = self.evaluate(average_loss_per_sequence)
        compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
        self.assertAllClose(compare_per_sequence, res)

        average_loss_per_batch = loss.sequence_loss(
            self.logits, self.targets, self.weights,
            average_across_timesteps=True,
            average_across_batch=False)
        res = self.evaluate(average_loss_per_batch)
        compare_per_batch = np.full((self.batch_size), self.expected_loss)
        self.assertAllClose(compare_per_batch, res)

        total_loss = loss.sequence_loss(
            self.logits, self.targets, self.weights,
            average_across_timesteps=False,
            average_across_batch=False)
        res = self.evaluate(total_loss)
        compare_total = np.full((self.batch_size, self.sequence_length),
                                self.expected_loss)
        self.assertAllClose(compare_total, res)
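# The four calls above differ only in how the per-token loss matrix is
# reduced. A minimal numpy sketch of those reduction semantics, assuming
# sequence_loss divides weighted sums by the summed weights (hypothetical
# values, not the library implementation):
import numpy as np

per_token_loss = np.full((2, 3), 1.6)                      # [batch, time]
weights = np.ones_like(per_token_loss)

weighted = per_token_loss * weights
per_example = weighted.sum(axis=1) / weights.sum(axis=1)   # timesteps averaged
per_timestep = weighted.sum(axis=0) / weights.sum(axis=0)  # batch averaged
scalar = weighted.sum() / weights.sum()                    # both averaged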
def testSequenceLoss(self):
    with self.session(use_gpu=True) as sess:
        with variable_scope.variable_scope(
                'root', initializer=init_ops.constant_initializer(0.5)):
            batch_size = 2
            sequence_length = 3
            number_of_classes = 5
            logits = [
                constant_op.constant(i + 0.5, shape=[batch_size, number_of_classes])
                for i in range(sequence_length)
            ]
            logits = array_ops.stack(logits, axis=1)
            targets = [
                constant_op.constant(i, dtypes.int32, shape=[batch_size])
                for i in range(sequence_length)
            ]
            targets = array_ops.stack(targets, axis=1)
            weights = [
                constant_op.constant(1.0, shape=[batch_size])
                for _ in range(sequence_length)
            ]
            weights = array_ops.stack(weights, axis=1)

            average_loss_per_example = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=True,
                average_across_batch=True)
            res = sess.run(average_loss_per_example)
            self.assertAllClose(1.60944, res)

            average_loss_per_sequence = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=False,
                average_across_batch=True)
            res = sess.run(average_loss_per_sequence)
            compare_per_sequence = np.ones((sequence_length)) * 1.60944
            self.assertAllClose(compare_per_sequence, res)

            average_loss_per_batch = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=True,
                average_across_batch=False)
            res = sess.run(average_loss_per_batch)
            compare_per_batch = np.ones((batch_size)) * 1.60944
            self.assertAllClose(compare_per_batch, res)

            total_loss = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=False,
                average_across_batch=False)
            res = sess.run(total_loss)
            compare_total = np.ones((batch_size, sequence_length)) * 1.60944
            self.assertAllClose(compare_total, res)
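# Why 1.60944: within each timestep every class gets the same logit
# (i + 0.5), so the softmax is uniform over the 5 classes and the per-token
# cross-entropy is ln(5) regardless of the target. A quick check:
import numpy as np

logits = np.full(5, 0.5)                          # identical logits per class
log_probs = logits - np.log(np.exp(logits).sum())
print(-log_probs[0])                              # ln(5) ~= 1.6094379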
def testZeroWeights(self):
    self.config_default_values()
    weights = [
        constant_op.constant(0.0, shape=[self.batch_size])
        for _ in range(self.sequence_length)
    ]
    weights = array_ops.stack(weights, axis=1)
    with self.cached_session(use_gpu=True):
        average_loss_per_example = loss.sequence_loss(
            self.logits, self.targets, weights,
            average_across_timesteps=True,
            average_across_batch=True)
        res = self.evaluate(average_loss_per_example)
        self.assertAllClose(0.0, res)

        average_loss_per_sequence = loss.sequence_loss(
            self.logits, self.targets, weights,
            average_across_timesteps=False,
            average_across_batch=True)
        res = self.evaluate(average_loss_per_sequence)
        compare_per_sequence = np.zeros((self.sequence_length))
        self.assertAllClose(compare_per_sequence, res)

        average_loss_per_batch = loss.sequence_loss(
            self.logits, self.targets, weights,
            average_across_timesteps=True,
            average_across_batch=False)
        res = self.evaluate(average_loss_per_batch)
        compare_per_batch = np.zeros((self.batch_size))
        self.assertAllClose(compare_per_batch, res)

        total_loss = loss.sequence_loss(
            self.logits, self.targets, weights,
            average_across_timesteps=False,
            average_across_batch=False)
        res = self.evaluate(total_loss)
        compare_total = np.zeros((self.batch_size, self.sequence_length))
        self.assertAllClose(compare_total, res)
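# All-zero weights make both the weighted loss and the weight total zero, so
# the averaged variants hinge on the division being guarded. A sketch of one
# common guard (an epsilon in the denominator); whether sequence_loss uses
# exactly this constant is an assumption here:
import numpy as np

def safe_average(weighted_loss, weights, epsilon=1e-12):
    # sum / (sum + eps) keeps the all-zero-weight case finite and 0.0,
    # matching the assertions in the test above.
    return weighted_loss.sum() / (weights.sum() + epsilon)

print(safe_average(np.zeros((2, 3)), np.zeros((2, 3))))  # 0.0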
def build(self, inputs, for_deploy):
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_size = 1 if (not for_deploy or self.conf.variants == "score") else sum(self.conf.beam_splits)
    conf.keep_prob = conf.keep_prob if not for_deploy else 1.0

    self.enc_str_inps = inputs["enc_inps:0"]
    self.dec_str_inps = inputs["dec_inps:0"]
    self.enc_lens = inputs["enc_lens:0"]
    self.dec_lens = inputs["dec_lens:0"]
    #self.down_wgts = inputs["down_wgts:0"]

    with tf.name_scope("TableLookup"):
        # lookup tables
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                                value_dtype=tf.int64,
                                                default_value=UNK_ID,
                                                shared_name="in_table",
                                                name="in_table",
                                                checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                                 value_dtype=tf.string,
                                                 default_value="_UNK",
                                                 shared_name="out_table",
                                                 name="out_table",
                                                 checkpoint=True)
        self.enc_inps = self.in_table.lookup(self.enc_str_inps)
        self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.output_vocab_size, conf.embedding_size])

    with tf.name_scope("Embed") as scope:
        dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1])
        with ops.device("/cpu:0"):
            self.emb_inps = embedding_lookup_unique(self.embedding, self.enc_inps)
            emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    # output projection (w, b)
    with tf.variable_scope("OutProj"):
        if conf.out_layer_size:
            w = tf.get_variable("proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype)
        elif conf.bidirectional:
            w = tf.get_variable("proj_w", [conf.num_units * 2, conf.output_vocab_size], dtype=dtype)
        else:
            w = tf.get_variable("proj_w", [conf.num_units, conf.output_vocab_size], dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)

    graphlg.info("Creating dynamic rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(
        conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps,
        self.enc_lens, keep_prob=conf.keep_prob, bidi=conf.bidirectional,
        name_scope="DynRNNEncoder")
    batch_size = tf.shape(self.enc_outs)[0]

    # to modify the output states of all encoder layers for dec init
    final_enc_states = self.enc_states

    with tf.name_scope("DynRNNDecode") as scope:
        with tf.name_scope("ShapeToBeam") as scope:
            beam_memory = tf.reshape(tf.tile(self.enc_outs, [1, 1, self.beam_size]),
                                     [-1, conf.input_max_len, mem_size])
            beam_memory_lens = tf.squeeze(
                tf.reshape(tf.tile(tf.expand_dims(self.enc_lens, 1), [1, self.beam_size]),
                           [-1, 1]), 1)

            def _to_beam(t):
                return tf.reshape(tf.tile(t, [1, self.beam_size]), [-1, int(t.get_shape()[1])])

            beam_init_states = tf.contrib.framework.nest.map_structure(_to_beam, final_enc_states)

        max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2
        cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size,
                        num_layers=conf.num_layers, attn_type=self.conf.attention,
                        memory=beam_memory, mem_lens=beam_memory_lens,
                        max_mem_size=max_mem_size, addmem=self.conf.addmem,
                        keep_prob=conf.keep_prob, dtype=tf.float32,
                        name_scope="AttnCell")
        dec_init_state = DecStateInit(all_enc_states=beam_init_states,
                                      decoder_cell=cell,
                                      batch_size=batch_size * self.beam_size,
                                      init_type=conf.dec_init_type,
                                      use_proj=conf.use_init_proj)

        if not for_deploy:
            hp_train = helper.ScheduledEmbeddingTrainingHelper(
                inputs=emb_dec_inps, sequence_length=self.dec_lens,
                embedding=self.embedding,
                sampling_probability=self.conf.sample_prob,
                out_proj=(w, b))
            output_layer = layers_core.Dense(self.conf.out_layer_size,
                                             use_bias=True) if self.conf.out_layer_size else None
            my_decoder = basic_decoder.BasicDecoder(cell=cell, helper=hp_train,
                                                    initial_state=dec_init_state,
                                                    output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, impute_finished=True,
                maximum_iterations=conf.output_max_len + 1, scope=scope)
        elif self.conf.variants == "score":
            hp_train = helper.ScheduledEmbeddingTrainingHelper(
                inputs=emb_dec_inps, sequence_length=self.dec_lens,
                embedding=self.embedding, sampling_probability=0.0,
                out_proj=(w, b))
            output_layer = layers_core.Dense(self.conf.out_layer_size,
                                             use_bias=True) if self.conf.out_layer_size else None
            my_decoder = score_decoder.ScoreDecoder(cell=cell, helper=hp_train,
                                                    out_proj=(w, b),
                                                    initial_state=dec_init_state,
                                                    output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=True)
        else:
            hp_infer = helper.GreedyEmbeddingHelper(
                embedding=self.embedding,
                start_tokens=tf.ones(shape=[batch_size * self.beam_size], dtype=tf.int32),
                end_token=EOS_ID, out_proj=(w, b))
            output_layer = layers_core.Dense(self.conf.out_layer_size,
                                             use_bias=True) if self.conf.out_layer_size else None
            dec_init_state = beam_decoder.BeamState(
                tf.zeros([batch_size * self.beam_size]), dec_init_state,
                tf.zeros([batch_size * self.beam_size], tf.int32))
            my_decoder = beam_decoder.BeamDecoder(cell=cell, helper=hp_infer,
                                                  out_proj=(w, b),
                                                  initial_state=dec_init_state,
                                                  beam_splits=self.conf.beam_splits,
                                                  max_res_num=self.conf.max_res_num,
                                                  output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=True)

    if not for_deploy:
        # Project decoder outputs to logits
        outputs = cell_outs.rnn_output
        L = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, int(w.shape[0])])
        outputs = tf.matmul(outputs, w) + b
        logits = tf.reshape(outputs, [-1, L, int(w.shape[1])])

        # branch 1 for debugging, doesn't have to be called
        with tf.name_scope("DebugOutputs") as scope:
            self.outputs = tf.argmax(logits, axis=2)
            self.outputs = tf.reshape(self.outputs, [-1, L])
            self.outputs = self.out_table.lookup(tf.cast(self.outputs, tf.int64))

        with tf.name_scope("Loss") as scope:
            tars = tf.slice(self.dec_inps, [0, 1], [-1, L])
            # wgts could take a more elaborate form, e.g. partially
            # down-weighting a sequence; here every non-padding label simply
            # gets weight 1.0
            wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1, reverse=True)
            #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
            loss_matrix = loss.sequence_loss(logits=logits, targets=tars, weights=wgts,
                                             average_across_timesteps=False,
                                             average_across_batch=False)
            self.loss = see_loss = tf.reduce_sum(loss_matrix) / tf.reduce_sum(wgts)

        with tf.name_scope(self.model_kind):
            tf.summary.scalar("loss", see_loss)

        graph_nodes = {
            "loss": self.loss,
            "inputs": {},
            "outputs": {},
            "debug_outputs": self.outputs
        }
    elif self.conf.variants == "score":
        L = tf.shape(cell_outs.logprobs)[1]
        one_hot = tf.one_hot(tf.slice(self.dec_inps, [0, 1], [-1, L]),
                             depth=self.conf.output_vocab_size,
                             axis=-1, on_value=1.0, off_value=0.0)
        outputs = tf.reduce_sum(cell_outs.logprobs * one_hot, 2)
        outputs = tf.reduce_sum(outputs, axis=1)
        graph_nodes = {
            "loss": None,
            "inputs": {
                "enc_inps:0": self.enc_str_inps,
                "enc_lens:0": self.enc_lens,
                "dec_inps:0": self.dec_str_inps,
                "dec_lens:0": self.dec_lens
            },
            "outputs": {"logprobs": outputs},
            "visualize": None
        }
    else:
        L = tf.shape(cell_outs.beam_ends)[1]
        beam_symbols = cell_outs.beam_symbols
        beam_parents = cell_outs.beam_parents
        beam_ends = cell_outs.beam_ends
        beam_end_parents = cell_outs.beam_end_parents
        beam_end_probs = cell_outs.beam_end_probs
        alignments = cell_outs.alignments

        beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L])
        beam_end_parents = tf.reshape(tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
        beam_end_probs = tf.reshape(tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

        ## Creating tail_ids
        batch_size = tf.Print(batch_size, [batch_size], message="BATCH")
        batch_offset = tf.expand_dims(
            tf.cumsum(tf.ones([batch_size, self.beam_size], dtype=tf.int32) * self.beam_size,
                      axis=0, exclusive=True), 2)
        offset2 = tf.expand_dims(
            tf.cumsum(tf.ones([batch_size, self.beam_size * 2], dtype=tf.int32) * self.beam_size,
                      axis=0, exclusive=True), 2)

        out_len = tf.shape(beam_symbols)[1]
        self.beam_symbol_strs = tf.reshape(self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
                                           [batch_size, self.beam_size, -1])
        self.beam_parents = tf.reshape(beam_parents, [batch_size, self.beam_size, -1]) - batch_offset
        self.beam_ends = tf.reshape(beam_ends, [batch_size, self.beam_size * 2, -1])
        self.beam_end_parents = tf.reshape(beam_end_parents,
                                           [batch_size, self.beam_size * 2, -1]) - offset2
        self.beam_end_probs = tf.reshape(beam_end_probs, [batch_size, self.beam_size * 2, -1])
        self.beam_attns = tf.reshape(alignments, [batch_size, self.beam_size, out_len, -1])

        graph_nodes = {
            "loss": None,
            "inputs": {
                "enc_inps:0": self.enc_str_inps,
                "enc_lens:0": self.enc_lens
            },
            "outputs": {
                "beam_symbols": self.beam_symbol_strs,
                "beam_parents": self.beam_parents,
                "beam_ends": self.beam_ends,
                "beam_end_parents": self.beam_end_parents,
                "beam_end_probs": self.beam_end_probs,
                "beam_attns": self.beam_attns
            },
            "visualize": {}
        }
    return graph_nodes
def build(self, inputs, for_deploy):
    scope = ""
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_splits = conf.beam_splits
    self.beam_size = 1 if not for_deploy else sum(self.beam_splits)

    self.enc_str_inps = inputs["enc_inps:0"]
    self.dec_str_inps = inputs["dec_inps:0"]
    self.enc_lens = inputs["enc_lens:0"]
    self.dec_lens = inputs["dec_lens:0"]
    self.down_wgts = inputs["down_wgts:0"]

    with tf.name_scope("TableLookup"):
        # Input maps
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                                value_dtype=tf.int64,
                                                default_value=UNK_ID,
                                                shared_name="in_table",
                                                name="in_table",
                                                checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                                 value_dtype=tf.string,
                                                 default_value="_UNK",
                                                 shared_name="out_table",
                                                 name="out_table",
                                                 checkpoint=True)
        # lookup
        self.enc_inps = self.in_table.lookup(self.enc_str_inps)
        self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    graphlg.info("Preparing decoder inps...")
    dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1])

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.output_vocab_size, conf.embedding_size])

    with tf.name_scope("Embed") as scope:
        dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1])
        with ops.device("/cpu:0"):
            self.emb_inps = embedding_lookup_unique(self.embedding, self.enc_inps)
            emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    graphlg.info("Creating dynamic x rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(
        conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps,
        self.enc_lens, keep_prob=1.0, bidi=conf.bidirectional,
        name_scope="DynRNNEncoder")
    batch_size = tf.shape(self.enc_outs)[0]

    if self.conf.attention:
        init_h = self.enc_states[-1].h
    else:
        mechanism = dynamic_attention_wrapper.LuongAttention(
            num_units=conf.num_units, memory=self.enc_outs,
            max_mem_size=self.conf.input_max_len,
            memory_sequence_length=self.enc_lens)
        init_h = mechanism(self.enc_states[-1].h)

    if isinstance(self.enc_states[-1], LSTMStateTuple):
        enc_state = LSTMStateTuple(self.enc_states[-1].c, init_h)

    hidden_units = int(math.sqrt(mem_size * self.conf.enc_latent_dim))
    z, mu_prior, logvar_prior = PriorNet([enc_state], hidden_units,
                                         self.conf.enc_latent_dim, stddev=1.0,
                                         prior_type=conf.prior_type)
    KLD = 0.0

    # Different graphs for training and inference time
    if not for_deploy:
        # Y inputs for posterior z
        with tf.name_scope("YEncode"):
            y_emb_inps = tf.slice(emb_dec_inps, [0, 1, 0], [-1, -1, -1])
            y_enc_outs, y_enc_states, y_mem_size, y_enc_state_size = DynRNN(
                conf.cell_model, conf.num_units, conf.num_layers, y_emb_inps,
                self.dec_lens, keep_prob=1.0, bidi=False, name_scope="y_enc")
            y_enc_state = y_enc_states[-1]
            z, KLD, l2 = CreateVAE([enc_state, y_enc_state],
                                   self.conf.enc_latent_dim,
                                   mu_prior, logvar_prior)

    # project z + x_thinking_state to decoder state
    raw_dec_states = [z, enc_state]

    # add BOW loss
    #num_hidden_units = int(math.sqrt(conf.output_vocab_size * int(decision_state.shape[1])))
    #bow_l1 = layers_core.Dense(num_hidden_units, use_bias=True, name="bow_hidden", activation=tf.tanh)
    #bow_l2 = layers_core.Dense(conf.output_vocab_size, use_bias=True, name="bow_out", activation=None)
    #bow = bow_l2(bow_l1(decision_state))
    #y_dec_inps = tf.slice(self.dec_inps, [0, 1], [-1, -1])
    #bow_y = tf.reduce_sum(tf.one_hot(y_dec_inps, on_value=1.0, off_value=0.0, axis=-1, depth=conf.output_vocab_size), axis=1)
    #batch_bow_losses = tf.reduce_sum(bow_y * (-1.0) * tf.nn.log_softmax(bow), axis=1)

    max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2

    with tf.name_scope("ShapeToBeam") as scope:
        def _to_beam(t):
            beam_t = tf.reshape(tf.tile(t, [1, self.beam_size]), [-1, int(t.get_shape()[1])])
            return beam_t
        beam_raw_dec_states = tf.contrib.framework.nest.map_structure(_to_beam, raw_dec_states)
        beam_memory = tf.reshape(tf.tile(self.enc_outs, [1, 1, self.beam_size]),
                                 [-1, conf.input_max_len, mem_size])
        beam_memory_lens = tf.squeeze(
            tf.reshape(tf.tile(tf.expand_dims(self.enc_lens, 1), [1, self.beam_size]),
                       [-1, 1]), 1)

    cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size,
                    num_layers=conf.num_layers, attn_type=self.conf.attention,
                    memory=beam_memory, mem_lens=beam_memory_lens,
                    max_mem_size=max_mem_size, addmem=self.conf.addmem,
                    keep_prob=1.0, dtype=tf.float32, name_scope="AttnCell")

    # Fit decision states to the shape of the attention decoder cell states
    zero_attn_states = DecStateInit(beam_raw_dec_states, cell, batch_size * self.beam_size)

    # Output projection
    with tf.variable_scope("OutProj"):
        graphlg.info("Creating out_proj...")
        if conf.out_layer_size:
            w = tf.get_variable("proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype)
        else:
            w = tf.get_variable("proj_w", [mem_size, conf.output_vocab_size], dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)
        self.out_proj = (w, b)

    if not for_deploy:
        inputs = {}
        dec_init_state = zero_attn_states
        hp_train = helper.ScheduledEmbeddingTrainingHelper(
            inputs=emb_dec_inps, sequence_length=self.dec_lens,
            embedding=self.embedding, sampling_probability=0.0,
            out_proj=self.out_proj)
        output_layer = layers_core.Dense(self.conf.out_layer_size,
                                         use_bias=True) if self.conf.out_layer_size else None
        my_decoder = basic_decoder.BasicDecoder(cell=cell, helper=hp_train,
                                                initial_state=dec_init_state,
                                                output_layer=output_layer)
        cell_outs, final_state = decoder.dynamic_decode(
            decoder=my_decoder, impute_finished=False,
            maximum_iterations=conf.output_max_len + 1, scope=scope)

        outputs = cell_outs.rnn_output
        L = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, int(self.out_proj[0].shape[0])])
        outputs = tf.matmul(outputs, self.out_proj[0]) + self.out_proj[1]
        logits = tf.reshape(outputs, [-1, L, int(self.out_proj[0].shape[1])])

        # branch 1 for debugging, doesn't have to be called
        #m = tf.shape(self.outputs)[0]
        #self.mask = tf.zeros([m, int(w.shape[1])])
        #for i in [3]:
        #    self.mask = self.mask + tf.one_hot(indices=tf.ones([m], dtype=tf.int32) * i, on_value=100.0, depth=int(w.shape[1]))
        #self.outputs = self.outputs - self.mask
        with tf.name_scope("DebugOutputs") as scope:
            self.outputs = tf.argmax(logits, axis=2)
            self.outputs = tf.reshape(self.outputs, [-1, L])
            self.outputs = self.out_table.lookup(tf.cast(self.outputs, tf.int64))

        # branch 2 for loss
        with tf.name_scope("Loss") as scope:
            tars = tf.slice(self.dec_inps, [0, 1], [-1, L])
            wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1, reverse=True)
            #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
            self.loss = loss.sequence_loss(logits=logits, targets=tars, weights=wgts,
                                           average_across_timesteps=False,
                                           average_across_batch=False)
            batch_wgt = tf.reduce_sum(self.down_wgts) + 1e-12
            #bow_loss = tf.reduce_sum(batch_bow_losses * self.down_wgts) / batch_wgt
            example_losses = tf.reduce_sum(self.loss, 1)
            see_loss = tf.reduce_sum(example_losses / tf.cast(self.dec_lens, tf.float32)
                                     * self.down_wgts) / batch_wgt
            KLD = tf.reduce_sum(KLD * self.down_wgts) / batch_wgt
            self.loss = tf.reduce_sum((example_losses + self.conf.kld_ratio * KLD)
                                      / tf.cast(self.dec_lens, tf.float32)
                                      * self.down_wgts) / batch_wgt

        with tf.name_scope(self.model_kind):
            tf.summary.scalar("loss", see_loss)
            tf.summary.scalar("kld", KLD)
            #tf.summary.scalar("bow", bow_loss)

        graph_nodes = {
            "loss": self.loss,
            "inputs": inputs,
            "debug_outputs": self.outputs,
            "outputs": {},
            "visualize": None
        }
        return graph_nodes
    else:
        hp_infer = helper.GreedyEmbeddingHelper(
            embedding=self.embedding,
            start_tokens=tf.ones(shape=[batch_size * self.beam_size], dtype=tf.int32),
            end_token=EOS_ID, out_proj=self.out_proj)
        output_layer = layers_core.Dense(self.conf.out_layer_size,
                                         use_bias=True) if self.conf.out_layer_size else None
        dec_init_state = beam_decoder.BeamState(
            tf.zeros([batch_size * self.beam_size]), zero_attn_states,
            tf.zeros([batch_size * self.beam_size], tf.int32))
        my_decoder = beam_decoder.BeamDecoder(cell=cell, helper=hp_infer,
                                              out_proj=self.out_proj,
                                              initial_state=dec_init_state,
                                              beam_splits=self.beam_splits,
                                              max_res_num=self.conf.max_res_num,
                                              output_layer=output_layer)
        cell_outs, final_state = decoder.dynamic_decode(
            decoder=my_decoder, scope=scope,
            maximum_iterations=self.conf.output_max_len)

        L = tf.shape(cell_outs.beam_ends)[1]
        beam_symbols = cell_outs.beam_symbols
        beam_parents = cell_outs.beam_parents
        beam_ends = cell_outs.beam_ends
        beam_end_parents = cell_outs.beam_end_parents
        beam_end_probs = cell_outs.beam_end_probs
        alignments = cell_outs.alignments

        beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L])
        beam_end_parents = tf.reshape(tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
        beam_end_probs = tf.reshape(tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

        # Creating tail_ids
        batch_size = tf.Print(batch_size, [batch_size], message="CVAERNN batch")
        #beam_symbols = tf.Print(cell_outs.beam_symbols, [tf.shape(cell_outs.beam_symbols)], message="beam_symbols")
        #beam_parents = tf.Print(cell_outs.beam_parents, [tf.shape(cell_outs.beam_parents)], message="beam_parents")
        #beam_ends = tf.Print(cell_outs.beam_ends, [tf.shape(cell_outs.beam_ends)], message="beam_ends")
        #beam_end_parents = tf.Print(cell_outs.beam_end_parents, [tf.shape(cell_outs.beam_end_parents)], message="beam_end_parents")
        #beam_end_probs = tf.Print(cell_outs.beam_end_probs, [tf.shape(cell_outs.beam_end_probs)], message="beam_end_probs")
        #alignments = tf.Print(cell_outs.alignments, [tf.shape(cell_outs.alignments)], message="beam_attns")

        batch_offset = tf.expand_dims(
            tf.cumsum(tf.ones([batch_size, self.beam_size], dtype=tf.int32) * self.beam_size,
                      axis=0, exclusive=True), 2)
        offset2 = tf.expand_dims(
            tf.cumsum(tf.ones([batch_size, self.beam_size * 2], dtype=tf.int32) * self.beam_size,
                      axis=0, exclusive=True), 2)

        out_len = tf.shape(beam_symbols)[1]
        self.beam_symbol_strs = tf.reshape(self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
                                           [batch_size, self.beam_size, -1])
        self.beam_parents = tf.reshape(beam_parents, [batch_size, self.beam_size, -1]) - batch_offset
        self.beam_ends = tf.reshape(beam_ends, [batch_size, self.beam_size * 2, -1])
        self.beam_end_parents = tf.reshape(beam_end_parents,
                                           [batch_size, self.beam_size * 2, -1]) - offset2
        self.beam_end_probs = tf.reshape(beam_end_probs, [batch_size, self.beam_size * 2, -1])
        self.beam_attns = tf.reshape(alignments, [batch_size, self.beam_size, out_len, -1])
        #cell_outs.alignments
        #self.outputs = tf.concat([outputs_str, tf.cast(cell_outs.beam_parents, tf.string)], 1)

        #ones = tf.ones([batch_size, self.beam_size], dtype=tf.int32)
        #aux_matrix = tf.cumsum(ones * self.beam_size, axis=0, exclusive=True)
        #tm_beam_parents_reverse = tf.reverse(tf.transpose(cell_outs.beam_parents), axis=[0])
        #beam_probs = final_state[1]
        #def traceback(prev_out, curr_input):
        #    return tf.gather(curr_input, prev_out)
        #
        #tail_ids = tf.reshape(tf.cumsum(ones, axis=1, exclusive=True) + aux_matrix, [-1])
        #tm_symbol_index_reverse = tf.scan(traceback, tm_beam_parents_reverse, initializer=tail_ids)
        ## Create beam index for symbols, and other info
        #tm_symbol_index = tf.concat([tf.expand_dims(tail_ids, 0), tm_symbol_index_reverse], axis=0)
        #tm_symbol_index = tf.reverse(tm_symbol_index, axis=[0])
        #tm_symbol_index = tf.slice(tm_symbol_index, [1, 0], [-1, -1])
        #symbol_index = tf.expand_dims(tf.transpose(tm_symbol_index), axis=2)
        #symbol_index = tf.concat([symbol_index, tf.cumsum(tf.ones_like(symbol_index), exclusive=True, axis=1)], axis=2)
        ## index alignments and output symbols
        #alignments = tf.gather_nd(cell_outs.alignments, symbol_index)
        #symbol_ids = tf.gather_nd(cell_outs.beam_symbols, symbol_index)
        ## outputs and other info
        #self.others = [alignments, beam_probs]
        #self.outputs = self.out_table.lookup(tf.cast(symbol_ids, tf.int64))

        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens
        }
        outputs = {
            "beam_symbols": self.beam_symbol_strs,
            "beam_parents": self.beam_parents,
            "beam_ends": self.beam_ends,
            "beam_end_parents": self.beam_end_parents,
            "beam_end_probs": self.beam_end_probs,
            "beam_attns": self.beam_attns
        }
        graph_nodes = {
            "loss": None,
            "inputs": inputs,
            "outputs": outputs,
            "visualize": {"z": z}
        }
        return graph_nodes
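# CreateVAE above returns a KL term between the posterior q(z|x, y) and the
# prior from PriorNet. Its internals aren't shown in this file; for diagonal
# Gaussians the closed form is the standard one, sketched here in numpy:
import numpy as np

def gaussian_kld(mu_q, logvar_q, mu_p, logvar_p):
    # KL(N(mu_q, var_q) || N(mu_p, var_p)), summed over the latent dims.
    return 0.5 * np.sum(
        logvar_p - logvar_q
        + (np.exp(logvar_q) + (mu_q - mu_p) ** 2) / np.exp(logvar_p)
        - 1.0,
        axis=-1)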
def testSequenceLoss(self):
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                'root', initializer=init_ops.constant_initializer(0.5)):
            batch_size = 2
            sequence_length = 3
            number_of_classes = 5
            logits = [
                constant_op.constant(i + 0.5, shape=[batch_size, number_of_classes])
                for i in range(sequence_length)
            ]
            logits = array_ops.stack(logits, axis=1)
            targets = [
                constant_op.constant(i, dtypes.int32, shape=[batch_size])
                for i in range(sequence_length)
            ]
            targets = array_ops.stack(targets, axis=1)
            weights = [
                constant_op.constant(1.0, shape=[batch_size])
                for _ in range(sequence_length)
            ]
            weights = array_ops.stack(weights, axis=1)

            average_loss_per_example = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=True,
                average_across_batch=True)
            res = sess.run(average_loss_per_example)
            self.assertAllClose(1.60944, res)

            average_loss_per_sequence = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=False,
                average_across_batch=True)
            res = sess.run(average_loss_per_sequence)
            compare_per_sequence = np.ones((sequence_length)) * 1.60944
            self.assertAllClose(compare_per_sequence, res)

            average_loss_per_batch = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=True,
                average_across_batch=False)
            res = sess.run(average_loss_per_batch)
            compare_per_batch = np.ones((batch_size)) * 1.60944
            self.assertAllClose(compare_per_batch, res)

            total_loss = loss.sequence_loss(
                logits, targets, weights,
                average_across_timesteps=False,
                average_across_batch=False)
            res = sess.run(total_loss)
            compare_total = np.ones((batch_size, sequence_length)) * 1.60944
            self.assertAllClose(compare_total, res)
def __init__(self, num_symbols, num_embed_units, num_units, num_layers,
             embed, entity_embed=None, num_entities=0, num_trans_units=100,
             learning_rate=0.0001, learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0, num_samples=512, max_length=60,
             output_alignments=True, use_lstm=False):
    self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps')        # batch*len
    self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens')        # batch
    self.responses = tf.placeholder(tf.string, (None, None), 'dec_inps')    # batch*len
    self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens')    # batch
    self.entities = tf.placeholder(tf.string, (None, None), 'entities')     # batch
    self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks')  # batch
    self.triples = tf.placeholder(tf.string, (None, None, 3), 'triples')    # batch
    self.posts_triple = tf.placeholder(tf.int32, (None, None, 1), 'enc_triples')  # batch
    self.responses_triple = tf.placeholder(tf.string, (None, None, 3), 'dec_triples')  # batch
    self.match_triples = tf.placeholder(tf.int32, (None, None), 'match_triples')  # batch

    encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
    triple_num = tf.shape(self.triples)[1]

    #use_triples = tf.reduce_sum(tf.cast(tf.greater_equal(self.match_triples, 0), tf.float32), axis=-1)
    one_hot_triples = tf.one_hot(self.match_triples, triple_num)
    use_triples = tf.reduce_sum(one_hot_triples, axis=[2])

    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_UNK',
                                         shared_name="out_table",
                                         name="out_table",
                                         checkpoint=True)
    self.entity2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=NONE_ID,
                                         shared_name="entity_in_table",
                                         name="entity_in_table",
                                         checkpoint=True)
    self.index2entity = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_NONE',
                                         shared_name="entity_out_table",
                                         name="entity_out_table",
                                         checkpoint=True)

    # build the vocab table (string to index)
    self.posts_word_id = self.symbol2index.lookup(self.posts)    # batch*len
    self.posts_entity_id = self.entity2index.lookup(self.posts)  # batch*len
    #self.posts_word_id = tf.Print(self.posts_word_id, ['use_triples', use_triples, 'one_hot_triples', one_hot_triples], summarize=1e6)
    self.responses_target = self.symbol2index.lookup(self.responses)  # batch*len

    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(self.responses)[1]
    self.responses_word_id = tf.concat(
        [tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
         tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]], 1)  # batch*len
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True, axis=1), [-1, decoder_len])

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding with pre-trained word vectors
        self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed)
    if entity_embed is None:
        # initialize the embedding randomly
        self.entity_trans = tf.get_variable('entity_embed', [num_entities, num_trans_units],
                                            tf.float32, trainable=False)
    else:
        # initialize the embedding with pre-trained entity vectors
        self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32,
                                            initializer=entity_embed, trainable=False)

    self.entity_trans_transformed = tf.layers.dense(self.entity_trans, num_trans_units,
                                                    activation=tf.tanh,
                                                    name='trans_transformation')
    padding_entity = tf.get_variable('entity_padding_embed', [7, num_trans_units],
                                     dtype=tf.float32, initializer=tf.zeros_initializer())
    self.entity_embed = tf.concat([padding_entity, self.entity_trans_transformed], axis=0)

    triples_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.triples)),
        [encoder_batch_size, triple_num, 3 * num_trans_units])
    entities_word_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.embed, self.symbol2index.lookup(self.entities)),
        [encoder_batch_size, -1, num_embed_units])

    self.encoder_input = tf.nn.embedding_lookup(self.embed, self.posts_word_id)      # batch*len*unit
    self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_word_id)  # batch*len*unit

    encoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])

    # rnn encoder
    encoder_output, encoder_state = dynamic_rnn(encoder_cell, self.encoder_input,
                                                self.posts_length, dtype=tf.float32,
                                                scope="encoder")

    # get output projection function
    output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = \
        output_projection_layer(num_units, num_symbols, num_samples)

    with tf.variable_scope('decoder'):
        # get attention function
        attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
            = prepare_attention(encoder_output, 'bahdanau', num_units,
                                imem=triples_embedding,
                                output_alignments=output_alignments)  # 'luong', num_units
        decoder_fn_train = attention_decoder_fn_train(
            encoder_state, attention_keys_init, attention_values_init,
            attention_score_fn_init, attention_construct_fn_init,
            output_alignments=output_alignments,
            max_length=tf.reduce_max(self.responses_length))
        self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
            decoder_cell, decoder_fn_train, self.decoder_input,
            self.responses_length, scope="decoder_rnn")
        if output_alignments:
            self.alignments = tf.transpose(alignments_ta.stack(), perm=[1, 0, 2])
            #self.alignments = tf.Print(self.alignments, [self.alignments], summarize=1e8)
            self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(
                self.decoder_output, self.responses_target, self.decoder_mask,
                self.alignments, triples_embedding, use_triples, one_hot_triples)
            self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')
            #self.decoder_loss = tf.Print(self.decoder_loss, ['decoder_loss', self.decoder_loss], summarize=1e6)
        else:
            self.decoder_loss, self.sentence_ppx = sequence_loss(
                self.decoder_output, self.responses_target, self.decoder_mask)
            self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')

    with tf.variable_scope('decoder', reuse=True):
        # get attention function
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
            = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True,
                                imem=triples_embedding,
                                output_alignments=output_alignments)  # 'luong', num_units
        decoder_fn_inference = attention_decoder_fn_inference(
            output_fn, encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID,
            EOS_ID, max_length, num_symbols, imem=entities_word_embedding,
            selector_fn=selector_fn)
        self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
            decoder_cell, decoder_fn_inference, scope="decoder_rnn")

        if output_alignments:
            output_len = tf.shape(self.decoder_distribution)[1]
            output_ids = tf.transpose(output_ids_ta.gather(tf.range(output_len)))
            word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols), tf.int64)
            entity_ids = tf.reshape(
                tf.clip_by_value(-output_ids, 0, num_symbols)
                + tf.reshape(tf.range(encoder_batch_size)
                             * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])
            entities = tf.reshape(tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
                                  [-1, output_len])
            words = self.index2symbol.lookup(word_ids)
            self.generation = tf.where(output_ids > 0, words, entities, name='generation')
        else:
            self.generation_index = tf.argmax(self.decoder_distribution, 2)
            self.generation = self.index2symbol.lookup(self.generation_index, name='generation')

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.params = tf.global_variables()

    # calculate the gradient of parameters
    #opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.lr = opt._lr
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    tf.summary.scalar('decoder_loss', self.decoder_loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=3,
                                pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
    self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                      max_to_keep=1000, pad_step_number=True)
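# decoder_mask above turns lengths into a 0/1 mask without any comparison
# op: one-hot the last valid step, then take a reversed cumulative sum.
# A numpy sketch of the same trick:
import numpy as np

decoder_len = 5
lengths = np.array([3, 5])
one_hot = np.eye(decoder_len)[lengths - 1]           # mark the last valid step
mask = np.cumsum(one_hot[:, ::-1], axis=1)[:, ::-1]  # reversed cumsum
print(mask)
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 1.]]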
def __init__(self,
             num_symbols,        # vocabulary size
             num_embed_units,    # word embedding size
             num_units,          # units per RNN layer
             num_layers,         # number of RNN layers
             embed,              # word embedding
             entity_embed=None,
             num_entities=0,
             num_trans_units=100,
             learning_rate=0.0001,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0,
             num_samples=500,    # number of samples for sampled softmax
             max_length=60,
             mem_use=True,
             output_alignments=True,
             use_lstm=False):
    self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps')        # batch_size * encoder_len
    self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens')        # batch_size
    self.responses = tf.placeholder(tf.string, (None, None), 'dec_inps')    # batch_size * decoder_len
    self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens')    # batch_size
    self.entities = tf.placeholder(tf.string, (None, None, None),
                                   'entities')       # batch_size * triple_num * triple_len
    self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks')  # unused
    self.triples = tf.placeholder(tf.string, (None, None, None, 3),
                                  'triples')         # batch_size * triple_num * triple_len * 3
    self.posts_triple = tf.placeholder(tf.int32, (None, None, 1),
                                       'enc_triples')  # batch_size * encoder_len
    self.responses_triple = tf.placeholder(tf.string, (None, None, 3),
                                           'dec_triples')  # batch_size * decoder_len * 3
    self.match_triples = tf.placeholder(tf.int32, (None, None, None),
                                        'match_triples')  # batch_size * decoder_len * triple_num

    # encoder_batch_size and the encoder_len of the encoder
    encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
    # triple_num: number of knowledge graphs per post (padded)
    triple_num = tf.shape(self.triples)[1]
    # triple_len: number of related entities per knowledge graph (padded)
    triple_len = tf.shape(self.triples)[2]

    # knowledge triples actually used
    one_hot_triples = tf.one_hot(self.match_triples,
                                 triple_len)  # batch_size * decoder_len * triple_num * triple_len
    # marks with 1 the timesteps whose response token uses a knowledge triple
    use_triples = tf.reduce_sum(one_hot_triples, axis=[2, 3])  # batch_size * decoder_len

    # hash table mapping words to indices
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,     # type of the key tensor
        value_dtype=tf.int64,    # type of the value tensor
        default_value=UNK_ID,    # default value for missing keys
        shared_name="in_table",  # if non-empty, the table is shared under this name across sessions
        name="in_table",         # op name
        checkpoint=True)         # if True, table contents are saved to and restored from
                                 # checkpoints; if shared_name is empty for a checkpointed
                                 # table, it is shared using the table node name

    # hash table mapping indices to words
    self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_UNK',
                                         shared_name="out_table",
                                         name="out_table",
                                         checkpoint=True)
    # hash table mapping entities to indices
    self.entity2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=NONE_ID,
                                         shared_name="entity_in_table",
                                         name="entity_in_table",
                                         checkpoint=True)
    # hash table mapping indices to entities
    self.index2entity = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_NONE',
                                         shared_name="entity_out_table",
                                         name="entity_out_table",
                                         checkpoint=True)

    # map post strings to word ids
    self.posts_word_id = self.symbol2index.lookup(self.posts)    # batch_size * encoder_len
    # map post strings to entity ids
    self.posts_entity_id = self.entity2index.lookup(self.posts)  # batch_size * encoder_len
    # map response strings to word ids
    self.responses_target = self.symbol2index.lookup(self.responses)  # batch_size * decoder_len

    # batch_size and decoder_len of the decoder
    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(self.responses)[1]
    # drop the last column of responses_target and prepend GO_ID
    self.responses_word_id = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
        tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
    ], 1)  # batch_size * decoder_len

    # response mask: one-hot encode the lengths, then take a reversed
    # cumulative sum along each row, giving 1 at positions within the length
    # and 0 elsewhere; the final reshape is probably unnecessary
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True, axis=1),
        [-1, decoder_len])  # batch_size * decoder_len

    # initialize the word and entity embeddings: use the given values when
    # provided, otherwise initialize randomly
    if embed is None:
        self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32)
    else:
        self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed)
    if entity_embed is None:
        self.entity_trans = tf.get_variable('entity_embed',
                                            [num_entities, num_trans_units],
                                            tf.float32, trainable=False)
    else:
        self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32,
                                            initializer=entity_embed, trainable=False)

    # a dense layer over the entity embeddings, size num_trans_units, tanh
    # activation; why an extra dense layer is needed here is unclear
    self.entity_trans_transformed = tf.layers.dense(self.entity_trans,
                                                    num_trans_units,
                                                    activation=tf.tanh,
                                                    name='trans_transformation')
    # a 7 * num_trans_units zero-initialized array
    padding_entity = tf.get_variable('entity_padding_embed',
                                     [7, num_trans_units], dtype=tf.float32,
                                     initializer=tf.zeros_initializer())
    # prepend padding_entity to entity_trans_transformed; what the padding
    # rows are for is not obvious
    self.entity_embed = tf.concat([padding_entity, self.entity_trans_transformed], axis=0)

    # tf.nn.embedding_lookup adds a dimension, so reshape to drop the extra one
    triples_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.triples)),
        [encoder_batch_size, triple_num, -1, 3 * num_trans_units])
    entities_word_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.embed, self.symbol2index.lookup(self.entities)),
        [encoder_batch_size, -1, num_embed_units])  # [batch_size, triple_num*triple_len, num_embed_units]

    # split into head, relation and tail
    head, relation, tail = tf.split(triples_embedding, [num_trans_units] * 3, axis=3)

    # static graph attention
    with tf.variable_scope('graph_attention'):
        # concatenate head and tail
        head_tail = tf.concat([head, tail], axis=3)  # batch_size * triple_num * triple_len * 200
        # tanh(dot(W, head_tail))
        head_tail_transformed = tf.layers.dense(
            head_tail, num_trans_units, activation=tf.tanh,
            name='head_tail_transform')  # batch_size * triple_num * triple_len * 100
        # dot(W, relation)
        relation_transformed = tf.layers.dense(
            relation, num_trans_units,
            name='relation_transform')  # batch_size * triple_num * triple_len * 100
        # elementwise product then sum, i.e. the inner product of the two vectors:
        # dot(transpose(dot(W, relation)), tanh(dot(W, head_tail)))
        e_weight = tf.reduce_sum(relation_transformed * head_tail_transformed,
                                 axis=3)  # batch_size * triple_num * triple_len
        # alpha weight of each triple in the graph
        alpha_weight = tf.nn.softmax(e_weight)  # batch_size * triple_num * triple_len
        # tf.expand_dims gives alpha_weight one more dim:
        # batch_size * triple_num * triple_len * 1; summing over axis 2
        # produces a 100-dim graph vector for each graph
        graph_embed = tf.reduce_sum(tf.expand_dims(alpha_weight, 3) * head_tail,
                                    axis=2)  # batch_size * triple_num * 100

    """
    [0, 1, 2, ..., encoder_batch_size] is reshaped into an
    encoder_batch_size * 1 * 1 matrix: [[[0]], [[1]], [[2]], ...]
    tf.tile expands it along dim 1 to encoder_batch_size * encoder_len * 1:
    [[[0], [0], ...], ...]
    It is then concatenated with posts_triple along dim 2 to form the
    indices matrix:
    [
      [[0 0], [0 0], [0 0], [0 0], [0 1], [0 0], [0 2], [0 0], ... encoder_len],
      [[1 0], [1 0], [1 0], [1 0], [1 1], [1 0], [1 2], [1 0], ... encoder_len],
      [[2 0], [2 0], [2 0], [2 0], [2 1], [2 0], [2 2], [2 0], ... encoder_len],
      ... batch_size
    ]
    tf.gather_nd looks up the graph vectors in graph_embed at these indices
    and scatters them back into the shape of the indices matrix:
    encoder_batch_size * encoder_len * 100
    """
    graph_embed_input = tf.gather_nd(
        graph_embed,
        tf.concat([
            tf.tile(tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32), [-1, 1, 1]),
                    [1, encoder_len, 1]),
            self.posts_triple
        ], axis=2))

    # turn responses_triple into entity embeddings, batch_size * decoder_len * 300
    triple_embed_input = tf.reshape(
        tf.nn.embedding_lookup(self.entity_embed,
                               self.entity2index.lookup(self.responses_triple)),
        [batch_size, decoder_len, 3 * num_trans_units])

    # turn posts_word_id into word embeddings
    post_word_input = tf.nn.embedding_lookup(self.embed, self.posts_word_id)  # batch_size * encoder_len * 300
    # turn responses_word_id into word embeddings
    response_word_input = tf.nn.embedding_lookup(self.embed, self.responses_word_id)  # batch_size * decoder_len * 300

    # concatenate post_word_input and graph_embed_input along the last dim
    self.encoder_input = tf.concat([post_word_input, graph_embed_input],
                                   axis=2)  # batch_size * encoder_len * 400
    # concatenate response_word_input and triple_embed_input along the last dim
    self.decoder_input = tf.concat([response_word_input, triple_embed_input],
                                   axis=2)  # batch_size * decoder_len * 600

    # build the deep RNNs
    encoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])

    # rnn encoder
    encoder_output, encoder_state = dynamic_rnn(encoder_cell, self.encoder_input,
                                                self.posts_length, dtype=tf.float32,
                                                scope="encoder")

    # the vocabulary is too large for the RNN output dimension to match it,
    # so a projection maps from the low-dimensional output to the vocabulary;
    # returns: output function, selector function, sequence loss,
    # sampled sequence loss, and total loss
    output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = \
        output_projection_layer(num_units, num_symbols, num_samples)

    # decoder used for training
    with tf.variable_scope('decoder'):
        # prepare the attention functions:
        # attention_keys_init: attention keys
        # attention_values_init: attention values
        # attention_score_fn_init: computes the attention context
        # attention_construct_fn_init: concatenates all contexts
        attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
            = prepare_attention(encoder_output, 'bahdanau', num_units,
                                imem=(graph_embed, triples_embedding),
                                output_alignments=output_alignments and mem_use)  # 'luong', num_units
        # returns the function that processes the decoder input at each
        # training timestep
        decoder_fn_train = attention_decoder_fn_train(
            encoder_state, attention_keys_init, attention_values_init,
            attention_score_fn_init, attention_construct_fn_init,
            output_alignments=output_alignments and mem_use,
            max_length=tf.reduce_max(self.responses_length))
        # outputs, final state, and the TensorArray of alignments
        self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
            decoder_cell, decoder_fn_train, self.decoder_input,
            self.responses_length, scope="decoder_rnn")
        if output_alignments:
            # stack the alignments out of the TensorArray for total_loss
            self.alignments = tf.transpose(alignments_ta.stack(), perm=[1, 0, 2])
            self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(
                self.decoder_output, self.responses_target, self.decoder_mask,
                self.alignments, triples_embedding, use_triples, one_hot_triples)
            # make sentence_ppx a named op
            self.sentence_ppx = tf.identity(self.sentence_ppx, name='ppx_loss')
        else:
            self.decoder_loss = sequence_loss(self.decoder_output,
                                              self.responses_target,
                                              self.decoder_mask)

    # decoder used for inference
    with tf.variable_scope('decoder', reuse=True):
        # get the attention functions
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
            = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True,
                                imem=(graph_embed, triples_embedding),
                                output_alignments=output_alignments and mem_use)  # 'luong', num_units
        decoder_fn_inference = attention_decoder_fn_inference(
            output_fn, encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID,
            EOS_ID, max_length, num_symbols,
            imem=(entities_word_embedding,
                  tf.reshape(triples_embedding,
                             [encoder_batch_size, -1, 3 * num_trans_units])),
            selector_fn=selector_fn)
        # imem: tuple of the entity word embeddings and the triple embeddings,
        # ([batch_size, triple_num*triple_len, num_embed_units],
        #  [encoder_batch_size, triple_num*triple_len, 3*num_trans_units])
        self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
            decoder_cell, decoder_fn_inference, scope="decoder_rnn")

        output_len = tf.shape(self.decoder_distribution)[1]  # decoder_len
        output_ids = tf.transpose(output_ids_ta.gather(tf.range(output_len)))  # [batch_size, decoder_len]

        # clip the output ids to the valid range
        word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols),
                           tf.int64)  # [batch_size, decoder_len]

        # position of each selected entity word inside entities:
        # 1. tf.shape(entities_word_embedding)[1] = triple_num * triple_len
        # 2. tf.range(encoder_batch_size): [batch_size]
        # 3. tf.reshape(tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]):
        #    [batch_size, 1], the per-example offset into entities
        # 4. tf.clip_by_value(-output_ids, 0, num_symbols): [batch_size, decoder_len],
        #    the relative position of each entity word
        # 5. entity_ids: [batch_size * decoder_len], the absolute position in
        #    entities after adding the offset
        entity_ids = tf.reshape(
            tf.clip_by_value(-output_ids, 0, num_symbols)
            + tf.reshape(tf.range(encoder_batch_size)
                         * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])

        # the entity words actually used:
        # 1. entities: [batch_size, triple_num, triple_len]
        # 2. tf.reshape(self.entities, [-1]): [batch_size * triple_num * triple_len]
        # 3. tf.gather: [batch_size * decoder_len]
        # 4. entities: [batch_size, output_len]
        entities = tf.reshape(tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
                              [-1, output_len])

        words = self.index2symbol.lookup(word_ids)  # turn ids into actual words
        # output_ids > 0 is a bool tensor; where True, take the word at that
        # position, otherwise the entity
        self.generation = tf.where(output_ids > 0, words, entities)
        self.generation = tf.identity(self.generation, name='generation')

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    # number of parameter updates so far
    self.global_step = tf.Variable(0, trainable=False)
    # parameters to train
    self.params = tf.global_variables()

    # choose the optimizer
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.lr = opt._lr

    # gradients of decoder_loss w.r.t. params
    gradients = tf.gradients(self.decoder_loss, self.params)
    # gradient clipping
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    tf.summary.scalar('decoder_loss', self.decoder_loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=3,
                                pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
    self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                      max_to_keep=1000, pad_step_number=True)
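# The graph_attention block above scores each triple by the inner product of
# a transformed relation with a transformed [head; tail], softmaxes within
# the graph, and pools [head; tail] with those weights into one vector per
# graph. A numpy sketch with hypothetical weight matrices (w_ht, w_r stand
# in for the dense layers' kernels):
import numpy as np

def graph_attention(head, relation, tail, w_ht, w_r):
    # head/relation/tail: [triple_num, triple_len, d]; w_ht: [2d, d]; w_r: [d, d]
    head_tail = np.concatenate([head, tail], axis=-1)
    ht_t = np.tanh(head_tail @ w_ht)                 # tanh(W_ht [h; t])
    r_t = relation @ w_r                             # W_r r
    e = (r_t * ht_t).sum(-1)                         # inner product per triple
    alpha = np.exp(e) / np.exp(e).sum(-1, keepdims=True)
    return (alpha[..., None] * head_tail).sum(1)     # one vector per graph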
def build(self, for_deploy, variants=""): conf = self.conf name = self.name job_type = self.job_type dtype = self.dtype self.beam_size = 1 if (not for_deploy or variants=="score") else sum(self.conf.beam_splits) # Input maps self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64, default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True) self.out_table = lookup.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value="_UNK", shared_name="out_table", name="out_table", checkpoint=True) graphlg.info("Creating placeholders...") self.enc_str_inps = tf.placeholder(tf.string, shape=(None, conf.input_max_len), name="enc_inps") self.enc_lens = tf.placeholder(tf.int32, shape=[None], name="enc_lens") self.dec_str_inps = tf.placeholder(tf.string, shape=[None, conf.output_max_len + 2], name="dec_inps") self.dec_lens = tf.placeholder(tf.int32, shape=[None], name="dec_lens") self.down_wgts = tf.placeholder(tf.float32, shape=[None], name="down_wgts") # lookup self.enc_inps = self.in_table.lookup(self.enc_str_inps) self.dec_inps = self.in_table.lookup(self.dec_str_inps) # Create encode graph and get attn states graphlg.info("Creating embeddings and embedding enc_inps.") with ops.device("/cpu:0"): self.embedding = variable_scope.get_variable("embedding", [conf.output_vocab_size, conf.embedding_size]) self.emb_inps = embedding_lookup_unique(self.embedding, self.enc_inps) graphlg.info("Creating dynamic rnn...") self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps, self.enc_lens, keep_prob=1.0, bidi=conf.bidirectional) memory = tf.reshape(tf.concat([self.enc_outs] * self.beam_size, 2), [-1, conf.input_max_len, mem_size]) memory_lens = tf.squeeze(tf.reshape(tf.concat([tf.expand_dims(self.enc_lens, 1)] * self.beam_size, 1), [-1, 1]), 1) batch_size = tf.shape(self.enc_outs)[0] graphlg.info("Creating out_proj...") if conf.out_layer_size: w = tf.get_variable("proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype) elif conf.bidirectional: w = tf.get_variable("proj_w", [conf.num_units * 2, conf.output_vocab_size], dtype=dtype) else: w = tf.get_variable("proj_w", [conf.num_units, conf.output_vocab_size], dtype=dtype) b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype) self.out_proj = (w, b) graphlg.info("Preparing decoder inps...") dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1]) with ops.device("/cpu:0"): emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps) # Attention with variable_scope.variable_scope("decoder", dtype=dtype) as scope: decoder_cell = CreateMultiRNNCell(conf.cell_model, mem_size, conf.num_layers, conf.output_keep_prob) max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2 if conf.attention == "Luo": mechanism = dynamic_attention_wrapper.LuongAttention(num_units=mem_size, memory=memory, max_mem_size=max_mem_size, memory_sequence_length=memory_lens) elif conf.attention == "Bah": mechanism = dynamic_attention_wrapper.BahdanauAttention(num_units=mem_size, memory=memory, max_mem_size=max_mem_size, memory_sequence_length=memory_lens) else: print "Unknown attention stype, must be Luo or Bah" exit(0) attn_cell = DynamicAttentionWrapper(cell=decoder_cell, attention_mechanism=mechanism, attention_size=mem_size, addmem=self.conf.addmem) # Zeros for initial state zero_attn_states = attn_cell.zero_state(dtype=tf.float32, batch_size=batch_size * self.beam_size) init_probs = 
tf.zeros([batch_size * self.beam_size]) #init_probs = tf.Print(init_probs, [tf.shape(init_probs)]) # Encoder states for initial state, with vae init_states = [] KLDs = tf.zeros([batch_size * self.beam_size]) zs = [] for i, each in enumerate(self.enc_states): if isinstance(each, LSTMStateTuple): new_c = tf.reshape(tf.concat([each.c] * self.beam_size, 1), [-1, mem_size]) new_h = tf.reshape(tf.concat([each.h] * self.beam_size, 1), [-1, mem_size]) #vae_c, KLD_c, l2_c = CreateVAE(new_c, self.conf.enc_latent_dim) vae_h, KLD, l2 = CreateVAE(new_h, self.conf.enc_latent_dim, stddev=self.conf.stddev, name="vae", reuse=(i!=0)) #vae_h, KLD, l2 = CreateVAE(each.h, self.conf.enc_latent_dim, stddev=self.conf.stddev, name="vae", reuse=(i!=0)) zs.append(tf.concat([each.c, vae_h], 1)) beam_vea_h = tf.reshape(tf.tile(vae_h, [1, self.beam_size]), [-1, mem_size]) new_c = tf.reshape(tf.tile(each.c, [1, self.beam_size]), [-1, mem_size]) init_states.append(LSTMStateTuple(new_c, vae_h)) KLDs += KLD else: zs.append(each) state = tf.reshape(tf.concat([each] * self.beam_size, 1), [-1, mem_size]) vae_state, KLD, l2 = CreateVAE(state, self.conf.enc_latent_dim, name="vae", stddev=self.conf.stddev, reuse=(i!=0)) init_states.append(vae_state) KLDs += KLD z = tf.concat(zs, 1) zero_attn_states = DynamicAttentionWrapperState(tuple(init_states), zero_attn_states.attention, zero_attn_states.newmem, zero_attn_states.alignments) if not for_deploy: dec_init_state = zero_attn_states hp_train = helper.ScheduledEmbeddingTrainingHelper(inputs=emb_dec_inps, sequence_length=self.dec_lens, embedding=self.embedding, sampling_probability=0.0, out_proj=self.out_proj) output_layer = layers_core.Dense(self.conf.out_layer_size, use_bias=True) if self.conf.out_layer_size else None my_decoder = basic_decoder.BasicDecoder(cell=attn_cell, helper=hp_train, initial_state=dec_init_state, output_layer=output_layer) cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder, impute_finished=True, maximum_iterations=conf.output_max_len + 1, scope=scope) outputs = cell_outs.rnn_output L = tf.shape(outputs)[1] outputs = tf.reshape(outputs, [-1, int(self.out_proj[0].shape[0])]) outputs = tf.matmul(outputs, self.out_proj[0]) + self.out_proj[1] logits = tf.reshape(outputs, [-1, L, int(self.out_proj[0].shape[1])]) # branch 1 for debugging, doesn't have to be called #m = tf.shape(self.outputs)[0] #self.mask = tf.zeros([m, int(w.shape[1])]) #for i in [3]: # self.mask = self.mask + tf.one_hot(indices=tf.ones([m], dtype=tf.int32) * i, on_value=100.0, depth=int(w.shape[1])) #self.outputs = self.outputs - self.mask self.outputs = tf.argmax(logits, axis=2) self.outputs = tf.reshape(self.outputs, [-1, L]) self.outputs = self.out_table.lookup(tf.cast(self.outputs, tf.int64)) # branch 2 for loss tars = tf.slice(self.dec_inps, [0, 1], [-1, L]) wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1, reverse=True) batch_wgt = tf.reduce_sum(self.down_wgts) + 1e-12 #wgts = wgts * tf.expand_dims(self.down_wgts, 1) self.loss = loss.sequence_loss(logits=logits, targets=tars, weights=wgts, average_across_timesteps=False, average_across_batch=False) example_losses = tf.reduce_sum(self.loss, 1) see_loss = tf.reduce_sum(example_losses / tf.cast(self.dec_lens, tf.float32) * self.down_wgts) / batch_wgt KLD = tf.reduce_sum(KLDs * self.down_wgts) / batch_wgt self.loss = tf.reduce_sum(example_losses * self.down_wgts) / batch_wgt + KLD tf.summary.scalar("loss", see_loss) tf.summary.scalar("kld", KLD) graph_nodes = { "loss":self.loss, "inputs":{}, "outputs":{}, 
"debug_ouputs":self.outputs } #saver return graph_nodes else: inputs = { "enc_inps:0":self.enc_str_inps, "enc_lens:0":self.enc_lens } if variants == "score": dec_init_state = zero_attn_states hp_train = helper.ScheduledEmbeddingTrainingHelper(inputs=emb_dec_inps, sequence_length=self.dec_lens, embedding=self.embedding, sampling_probability=0.0, out_proj=self.out_proj) output_layer = layers_core.Dense(self.conf.out_layer_size, use_bias=True) if self.conf.out_layer_size else None my_decoder = score_decoder.ScoreDecoder(cell=attn_cell, helper=hp_train, out_proj=self.out_proj, initial_state=dec_init_state, output_layer=output_layer) cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder, scope=scope, maximum_iterations=self.conf.output_max_len, impute_finished=False) L = tf.shape(cell_outs.logprobs)[1] one_hot = tf.one_hot(tf.slice(self.dec_inps, [0, 1], [-1, L]), depth=self.conf.output_vocab_size, axis=-1, on_value=1.0, off_value=0.0) outputs = tf.reduce_sum(cell_outs.logprobs * one_hot, 2) outputs = tf.reduce_sum(outputs, axis=1) graph_nodes = { "loss":None, "inputs":inputs, "outputs":{"logprobs":outputs}, "visualize":None } return graph_nodes else: dec_init_state = beam_decoder.BeamState(tf.zeros([batch_size * self.beam_size]), zero_attn_states, tf.zeros([batch_size * self.beam_size], tf.int32)) #dec_init_state = nest.map_structure(lambda x:tf.Print(x, [tf.shape(x)], message=str(x)+"dec_init"), dec_init_state) hp_infer = helper.GreedyEmbeddingHelper(embedding=self.embedding, start_tokens=tf.ones(shape=[batch_size * self.beam_size], dtype=tf.int32), end_token=EOS_ID, out_proj=self.out_proj) output_layer = layers_core.Dense(self.conf.out_layer_size, use_bias=True) if self.conf.out_layer_size else None my_decoder = beam_decoder.BeamDecoder(cell=attn_cell, helper=hp_infer, out_proj=self.out_proj, initial_state=dec_init_state, beam_splits=self.conf.beam_splits, max_res_num=self.conf.max_res_num, output_layer=output_layer) cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder, scope=scope, maximum_iterations=self.conf.output_max_len, impute_finished=True) L = tf.shape(cell_outs.beam_ends)[1] beam_symbols = cell_outs.beam_symbols beam_parents = cell_outs.beam_parents beam_ends = cell_outs.beam_ends beam_end_parents = cell_outs.beam_end_parents beam_end_probs = cell_outs.beam_end_probs alignments = cell_outs.alignments beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L]) beam_end_parents = tf.reshape(tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L]) beam_end_probs = tf.reshape(tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L]) ## Creating tail_ids batch_size = tf.Print(batch_size, [batch_size], message="VAERNN batch") #beam_symbols = tf.Print(cell_outs.beam_symbols, [tf.shape(cell_outs.beam_symbols)], message="beam_symbols") #beam_parents = tf.Print(cell_outs.beam_parents, [tf.shape(cell_outs.beam_parents)], message="beam_parents") #beam_ends = tf.Print(cell_outs.beam_ends, [tf.shape(cell_outs.beam_ends)], message="beam_ends") #beam_end_parents = tf.Print(cell_outs.beam_end_parents, [tf.shape(cell_outs.beam_end_parents)], message="beam_end_parents") #beam_end_probs = tf.Print(cell_outs.beam_end_probs, [tf.shape(cell_outs.beam_end_probs)], message="beam_end_probs") #alignments = tf.Print(cell_outs.alignments, [tf.shape(cell_outs.alignments)], message="beam_attns") batch_offset = tf.expand_dims(tf.cumsum(tf.ones([batch_size, self.beam_size], dtype=tf.int32) * self.beam_size, axis=0, exclusive=True), 2) offset2 = tf.expand_dims(tf.cumsum(tf.ones([batch_size, 
self.beam_size * 2], dtype=tf.int32) * self.beam_size, axis=0, exclusive=True), 2) out_len = tf.shape(beam_symbols)[1] self.beam_symbol_strs = tf.reshape(self.out_table.lookup(tf.cast(beam_symbols, tf.int64)), [batch_size, self.beam_size, -1]) self.beam_parents = tf.reshape(beam_parents, [batch_size, self.beam_size, -1]) - batch_offset self.beam_ends = tf.reshape(beam_ends, [batch_size, self.beam_size * 2, -1]) self.beam_end_parents = tf.reshape(beam_end_parents, [batch_size, self.beam_size * 2, -1]) - offset2 self.beam_end_probs = tf.reshape(beam_end_probs, [batch_size, self.beam_size * 2, -1]) self.beam_attns = tf.reshape(alignments, [batch_size, self.beam_size, out_len, -1]) outputs = { "beam_symbols":self.beam_symbol_strs, "beam_parents":self.beam_parents, "beam_ends":self.beam_ends, "beam_end_parents":self.beam_end_parents, "beam_end_probs":self.beam_end_probs, "beam_attns":self.beam_attns } graph_nodes = { "loss":None, "inputs":inputs, "outputs":outputs, "visualize":{"z":z} } return graph_nodes
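# The beam-search branch above returns raw per-step records rather than finished
# hypotheses: beam_symbols[b, k, t] is the symbol emitted by beam slot k at step t,
# and beam_parents[b, k, t] points at the slot this one extended at the previous
# step (made batch-local above by subtracting batch_offset). A client therefore has
# to backtrack through the parent pointers to recover an actual sequence. Below is a
# minimal numpy sketch of that backtracking under the stated assumption about the
# pointer semantics; it is illustrative, not the repo's own post-processing code.
import numpy as np

def backtrack(beam_symbols, beam_parents, b, slot):
    """Recover the symbol sequence that ends in beam `slot` of example `b`."""
    T = beam_symbols.shape[2]
    seq, k = [], slot
    for t in range(T - 1, -1, -1):
        seq.append(beam_symbols[b, k, t])   # symbol emitted by slot k at step t
        k = beam_parents[b, k, t]           # move to the slot it extended at t-1
    return seq[::-1]

# toy check: batch=1, beam=2, T=3
syms = np.array([[["a", "c", "e"], ["b", "d", "f"]]])
pars = np.array([[[0, 1, 0], [0, 0, 1]]])
print(backtrack(syms, pars, 0, 0))  # ['b', 'c', 'e']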
def build(self, inputs, for_deploy): scope = "" conf = self.conf dtype = self.dtype beam_size = 1 if not for_deploy else sum(conf.beam_splits) with tf.name_scope("WordEmbedding"): # Input maps self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64, default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True) self.out_table = lookup.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value="_UNK", shared_name="out_table", name="out_table", checkpoint=True) enc_inps = self.in_table.lookup(inputs["enc_inps:0"]) dec_inps = self.in_table.lookup(inputs["dec_inps:0"]) graphlg.info("Creating embeddings and embedding enc_inps.") with tf.device("/cpu:0"): self.embedding = variable_scope.get_variable("embedding", [conf.output_vocab_size, conf.embedding_size]) emb_inps = embedding_lookup_unique(self.embedding, enc_inps) emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps) emb_dec_next_inps = tf.slice(emb_dec_inps, [0, 0, 0], [-1, conf.output_max_len + 1, -1]) batch_size = tf.shape(enc_inps)[0] # Create encode graph and get attn states graphlg.info("Creating dynamic x rnn...") enc_outs, enc_states, mem_size, enc_state_size = DynEncode(conf.cell_model, conf.num_units, conf.num_layers, emb_inps, inputs["enc_lens:0"], keep_prob=1.0, bidi=conf.bidirectional, name_scope="DynEncodeX") with tf.variable_scope("AttnEncState") as scope2: mechanism = Luong1_2(num_units=conf.num_units, memory=enc_outs, max_mem_size=conf.input_max_len, memory_sequence_length=inputs["enc_lens:0"], name=scope2.original_name_scope) if isinstance(enc_states[-1], LSTMStateTuple): #score = tf.expand_dims(tf.nn.softmax(mechanism(enc_states[-1].h)), 1) score = tf.expand_dims(mechanism(enc_states[-1].h, ()), 1) attention_h = tf.squeeze(tf.matmul(score, enc_outs), 1) enc_state = LSTMStateTuple(enc_states[-1].c, attention_h) else: #score = tf.expand_dims(tf.nn.softmax(mechanism(enc_states[-1])), 1) score = tf.expand_dims(mechanism(enc_states[-1], ()), 1) enc_state = tf.squeeze(tf.matmul(score, enc_outs), 1) hidden_units = int(math.sqrt(mem_size * conf.enc_latent_dim)) z, mu_prior, logvar_prior = Ptheta([enc_state], hidden_units, conf.enc_latent_dim, stddev=1, prior_type=conf.prior_type, name_scope="EncToPtheta") KLD = 0.0 # Y inputs for posterior z when training if not for_deploy: #with tf.name_scope("variational_distribution") as scope: y_emb_inps = tf.slice(emb_dec_inps, [0, 1, 0], [-1, -1, -1]) y_enc_outs, y_enc_states, y_mem_size, y_enc_state_size = DynEncode(conf.cell_model, conf.num_units, conf.num_layers, y_emb_inps, inputs["dec_lens:0"], keep_prob=conf.keep_prob, bidi=False, name_scope="DynEncodeY") z, KLD, l2 = VAE([enc_state, y_enc_states[-1]], conf.enc_latent_dim, mu_prior, logvar_prior, name_scope="VAE") # project z + x_thinking_state to decoder state with tf.name_scope("GatedZState"): if isinstance(enc_state, LSTMStateTuple): h_gate = tf.layers.dense(z, int(enc_state.h.get_shape()[1]), use_bias=True, name="z_gate_h", activation=tf.sigmoid) c_gate = tf.layers.dense(z, int(enc_state.c.get_shape()[1]), use_bias=True, name="z_gate_c", activation=tf.sigmoid) raw_dec_states = tf.concat([c_gate * enc_state.c, h_gate * enc_state.h, z], 1) #raw_dec_states = LSTMStateTuple(tf.concat([c_gate * enc_state.c, z], 1), tf.concat([h_gate * enc_state.h, z], 1)) else: gate = tf.layers.dense(z, int(enc_state.get_shape()[1]), use_bias=True, name="z_gate", activation=tf.sigmoid) raw_dec_states = tf.concat([gate * enc_state, z], 1) # add BOW loss #num_hidden_units = 
int(math.sqrt(conf.output_vocab_size * int(decision_state.shape[1]))) #bow_l1 = layers_core.Dense(num_hidden_units, use_bias=True, name="bow_hidden", activation=tf.tanh) #bow_l2 = layers_core.Dense(conf.output_vocab_size, use_bias=True, name="bow_out", activation=None) #bow = bow_l2(bow_l1(decision_state)) #y_dec_inps = tf.slice(self.dec_inps, [0, 1], [-1, -1]) #bow_y = tf.reduce_sum(tf.one_hot(y_dec_inps, on_value=1.0, off_value=0.0, axis=-1, depth=conf.output_vocab_size), axis=1) #batch_bow_losses = tf.reduce_sum(bow_y * (-1.0) * tf.nn.log_softmax(bow), axis=1) max_mem_size = conf.input_max_len + conf.output_max_len + 2 with tf.name_scope("ShapeToBeam"): beam_raw_dec_states = nest.map_structure(lambda x:tile_batch(x, beam_size), raw_dec_states) beam_memory = nest.map_structure(lambda x:tile_batch(x, beam_size), enc_outs) beam_memory_lens = tf.squeeze(nest.map_structure(lambda x:tile_batch(x, beam_size), tf.expand_dims(inputs["enc_lens:0"], 1)), 1) beam_z = nest.map_structure(lambda x:tile_batch(x, beam_size), z) #def _to_beam(t): # beam_t = tf.reshape(tf.tile(t, [1, beam_size]), [-1, int(t.get_shape()[1])]) # return beam_t #with tf.name_scope("ShapeToBeam") as scope: # beam_raw_dec_states = tf.contrib.framework.nest.map_structure(_to_beam, raw_dec_states) # beam_memory = tf.reshape(tf.tile(self.enc_outs, [1, 1, beam_size]), [-1, conf.input_max_len, mem_size]) # beam_memory_lens = tf.squeeze(tf.reshape(tf.tile(tf.expand_dims(inputs["enc_lens:0"], 1), [1, beam_size]), [-1, 1]), 1) # beam_z = tf.contrib.framework.nest.map_structure(_to_beam, z) #cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size, num_layers=conf.num_layers, # attn_type=conf.attention, memory=beam_memory, mem_lens=beam_memory_lens, # max_mem_size=max_mem_size, addmem=conf.addmem, z=beam_z, keep_prob=conf.keep_prob, # dtype=tf.float32) #with tf.variable_scope("DynDecode/AttnCell") as dyn_scope: decoder_multi_rnn_cells = CreateMultiRNNCell(conf.cell_model, num_units=mem_size, num_layers=conf.num_layers, output_keep_prob=conf.keep_prob) zero_cell_states = DecCellStateInit(beam_raw_dec_states, decoder_multi_rnn_cells, name="InitCell") attn_cell = AttnCellWrapper(cell=decoder_multi_rnn_cells, cell_init_states=zero_cell_states, attn_type=conf.attention, attn_size=mem_size, memory=beam_memory, mem_lens=beam_memory_lens, max_mem_size=max_mem_size, addmem=conf.addmem, z=beam_z, dtype=tf.float32, name="AttnWrapper") if self.conf.attention: dec_init_state = None else: dec_init_state = beam_decoder.BeamState(tf.zeros_like(beam_memory_lens, tf.float32), zero_cell_states, tf.zeros_like(beam_memory_lens)) with tf.variable_scope("OutProj"): graphlg.info("Creating out_proj...") if conf.out_layer_size: w = tf.get_variable("proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype) else: w = tf.get_variable("proj_w", [mem_size, conf.output_vocab_size], dtype=dtype) b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype) out_proj = (w, b) if not for_deploy: hp_train = helper1_2.ScheduledEmbeddingTrainingHelper(inputs=emb_dec_next_inps, sequence_length=inputs["dec_lens:0"], embedding=self.embedding, sampling_probability=0.0, out_proj=out_proj) output_layer = layers_core.Dense(conf.out_layer_size, use_bias=True) if conf.out_layer_size else None my_decoder = basic_decoder1_2.BasicDecoder(cell=attn_cell, helper=hp_train, initial_state=dec_init_state, output_layer=output_layer) cell_outs, final_state, seq_len = decoder1_2.dynamic_decode(decoder=my_decoder, impute_finished=True, 
maximum_iterations=conf.output_max_len + 1) #cell_outs = tf.Print(cell_outs, [tf.shape(cell_outs)], message="cell_outs_shape") with tf.name_scope("Logits"): L = tf.shape(cell_outs.rnn_output)[1] rnn_output = tf.reshape(cell_outs.rnn_output, [-1, int(out_proj[0].shape[0])]) rnn_output = tf.matmul(rnn_output, out_proj[0]) + out_proj[1] logits = tf.reshape(rnn_output, [-1, L, int(out_proj[0].shape[1])]) with tf.name_scope("DebugOutputs") as scope: outputs = tf.argmax(logits, axis=2) outputs = tf.reshape(outputs, [-1, L]) outputs = self.out_table.lookup(tf.cast(outputs, tf.int64)) # branch 2 for loss with tf.name_scope("Loss") as scope: tars = tf.slice(dec_inps, [0, 1], [-1, L]) # wgts may take a more complicated form, for example a partial down-weighting of a sequence, # but here we just use 1.0 weights for every non-padding label wgts = tf.cumsum(tf.one_hot(inputs["dec_lens:0"], L), axis=1, reverse=True) #wgts = wgts * tf.expand_dims(self.down_wgts, 1) loss_matrix = loss.sequence_loss(logits=logits, targets=tars, weights=wgts, average_across_timesteps=False, average_across_batch=False) #bow_loss = tf.reduce_sum(batch_bow_losses * self.down_wgts) / batch_wgt example_total_wgts = tf.reduce_sum(wgts, 1) total_wgts = tf.reduce_sum(example_total_wgts) example_losses = tf.reduce_sum(loss_matrix, 1) see_loss = tf.reduce_sum(example_losses) / total_wgts KLD = tf.reduce_sum(KLD * example_total_wgts) / total_wgts self.loss = tf.reduce_sum(example_losses + conf.kld_ratio * KLD) / total_wgts with tf.name_scope(self.model_kind): tf.summary.scalar("loss", see_loss) tf.summary.scalar("kld", KLD) #tf.summary.scalar("bow", bow_loss) for each in tf.trainable_variables(): tf.summary.histogram(each.name, each) graph_nodes = { "loss":self.loss, "inputs":inputs, "debug_outputs":outputs, "outputs":{}, "visualize":None } return graph_nodes else: beam_batch_size = tf.shape(beam_memory_lens)[0] hp_infer = helper1_2.GreedyEmbeddingHelper(embedding=self.embedding, start_tokens=tf.ones([beam_batch_size], dtype=tf.int32), end_token=EOS_ID, out_proj=out_proj) output_layer = layers_core.Dense(conf.out_layer_size, use_bias=True) if conf.out_layer_size else None my_decoder = beam_decoder.BeamDecoder(cell=attn_cell, helper=hp_infer, out_proj=out_proj, initial_state=dec_init_state, beam_splits=conf.beam_splits, max_res_num=conf.max_res_num, output_layer=output_layer) #cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder, scope=scope, maximum_iterations=conf.output_max_len) cell_outs, final_state, seq_len = decoder1_2.dynamic_decode(decoder=my_decoder, impute_finished=True, maximum_iterations=conf.output_max_len + 1) L = tf.shape(cell_outs.beam_ends)[1] beam_symbols = cell_outs.beam_symbols beam_parents = cell_outs.beam_parents beam_ends = cell_outs.beam_ends beam_end_parents = cell_outs.beam_end_parents beam_end_probs = cell_outs.beam_end_probs alignments = cell_outs.alignments beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L]) beam_end_parents = tf.reshape(tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L]) beam_end_probs = tf.reshape(tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L]) # Creating tail_ids batch_size = beam_batch_size // beam_size batch_size = tf.Print(batch_size, [batch_size], message="BATCH") #beam_symbols = tf.Print(cell_outs.beam_symbols, [tf.shape(cell_outs.beam_symbols)], message="beam_symbols") #beam_parents = tf.Print(cell_outs.beam_parents, [tf.shape(cell_outs.beam_parents)], message="beam_parents") #beam_ends = tf.Print(cell_outs.beam_ends, [tf.shape(cell_outs.beam_ends)], 
message="beam_ends") #beam_end_parents = tf.Print(cell_outs.beam_end_parents, [tf.shape(cell_outs.beam_end_parents)], message="beam_end_parents") #beam_end_probs = tf.Print(cell_outs.beam_end_probs, [tf.shape(cell_outs.beam_end_probs)], message="beam_end_probs") #alignments = tf.Print(cell_outs.alignments, [tf.shape(cell_outs.alignments)], message="beam_attns") batch_offset = tf.expand_dims(tf.cumsum(tf.ones([batch_size, beam_size], dtype=tf.int32) * beam_size, axis=0, exclusive=True), 2) offset2 = tf.expand_dims(tf.cumsum(tf.ones([batch_size, beam_size * 2], dtype=tf.int32) * beam_size, axis=0, exclusive=True), 2) out_len = tf.shape(beam_symbols)[1] self.beam_symbol_strs = tf.reshape(self.out_table.lookup(tf.cast(beam_symbols, tf.int64)), [batch_size, beam_size, -1]) self.beam_parents = tf.reshape(beam_parents, [batch_size, beam_size, -1]) - batch_offset self.beam_ends = tf.reshape(beam_ends, [batch_size, beam_size * 2, -1]) self.beam_end_parents = tf.reshape(beam_end_parents, [batch_size, beam_size * 2, -1]) - offset2 self.beam_end_probs = tf.reshape(beam_end_probs, [batch_size, beam_size * 2, -1]) self.beam_attns = tf.reshape(alignments, [batch_size, beam_size, out_len, -1]) #cell_outs.alignments #self.outputs = tf.concat([outputs_str, tf.cast(cell_outs.beam_parents, tf.string)], 1) #ones = tf.ones([batch_size, self.beam_size], dtype=tf.int32) #aux_matrix = tf.cumsum(ones * self.beam_size, axis=0, exclusive=True) #tm_beam_parents_reverse = tf.reverse(tf.transpose(cell_outs.beam_parents), axis=[0]) #beam_probs = final_state[1] #def traceback(prev_out, curr_input): # return tf.gather(curr_input, prev_out) # #tail_ids = tf.reshape(tf.cumsum(ones, axis=1, exclusive=True) + aux_matrix, [-1]) #tm_symbol_index_reverse = tf.scan(traceback, tm_beam_parents_reverse, initializer=tail_ids) ## Create beam index for symbols, and other info #tm_symbol_index = tf.concat([tf.expand_dims(tail_ids, 0), tm_symbol_index_reverse], axis=0) #tm_symbol_index = tf.reverse(tm_symbol_index, axis=[0]) #tm_symbol_index = tf.slice(tm_symbol_index, [1, 0], [-1, -1]) #symbol_index = tf.expand_dims(tf.transpose(tm_symbol_index), axis=2) #symbol_index = tf.concat([symbol_index, tf.cumsum(tf.ones_like(symbol_index), exclusive=True, axis=1)], axis=2) ## index alignments and output symbols #alignments = tf.gather_nd(cell_outs.alignments, symbol_index) #symbol_ids = tf.gather_nd(cell_outs.beam_symbols, symbol_index) ## outputs and other info #self.others = [alignments, beam_probs] #self.outputs = self.out_table.lookup(tf.cast(symbol_ids, tf.int64)) outputs = { "beam_symbols":self.beam_symbol_strs, "beam_parents":self.beam_parents, "beam_ends":self.beam_ends, "beam_end_parents":self.beam_end_parents, "beam_end_probs":self.beam_end_probs, "beam_attns":self.beam_attns } infer_inputs = {} infer_inputs["enc_inps:0"] = inputs["enc_inps:0"] infer_inputs["enc_lens:0"] = inputs["enc_lens:0"] graph_nodes = { "loss":None, "inputs":infer_inputs, "outputs":outputs, "visualize":{"z":z} } return graph_nodes
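# Every training branch in this file builds its padding mask with the same
# one-liner: tf.cumsum(tf.one_hot(dec_lens, L), axis=1, reverse=True). A reversed
# cumulative sum over a one-hot row placed at index dec_len yields 1.0 at every
# position <= dec_len (so the mask also covers the EOS step) and 0.0 afterwards,
# i.e. exactly the [batch, L] weight matrix sequence_loss expects. A small numpy
# sketch of the same trick:
import numpy as np

def length_mask(dec_lens, L):
    one_hot = np.eye(L)[dec_lens]  # [batch, L], 1.0 at index dec_len
    # reversed cumsum: out[i] = sum over j >= i of one_hot[j]
    return np.cumsum(one_hot[:, ::-1], axis=1)[:, ::-1]

print(length_mask(np.array([2, 4]), 5))
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 1.]]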
def __init__( self, num_symbols, # vocabulary size num_embed_units, # word-embedding size num_units, # units per RNN layer num_layers, # number of RNN layers embed, # word embeddings entity_embed=None, # entity + relation embeddings num_entities=0, # total number of entities + relations num_trans_units=100, # dimensionality of the entity embeddings memory_units=100, learning_rate=0.0001, # learning rate learning_rate_decay_factor=0.95, # learning-rate decay; this scheme is not actually used max_gradient_norm=5.0, # num_samples=500, # number of samples for sampled softmax max_length=60, mem_use=True, output_alignments=True, use_lstm=False): self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps') # [batch_size, encoder_len] self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens') # [batch_size] self.responses = tf.placeholder( tf.string, (None, None), 'dec_inps') # [batch_size, decoder_len] self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens') # [batch_size] self.entities = tf.placeholder( tf.string, (None, None, None), 'entities') # [batch_size, triple_num, triple_len] self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks') # unused self.triples = tf.placeholder( tf.string, (None, None, None, 3), 'triples') # [batch_size, triple_num, triple_len, 3] self.posts_triple = tf.placeholder( tf.int32, (None, None, 1), 'enc_triples') # [batch_size, encoder_len, 1] self.responses_triple = tf.placeholder( tf.string, (None, None, 3), 'dec_triples') # [batch_size, decoder_len, 3] self.match_triples = tf.placeholder( tf.int32, (None, None, None), 'match_triples') # [batch_size, decoder_len, triple_num] # encoder batch_size and encoder_len encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts)) triple_num = tf.shape(self.triples)[1] # number of knowledge graphs triple_len = tf.shape(self.triples)[2] # number of knowledge triples # knowledge triples actually used one_hot_triples = tf.one_hot( self.match_triples, triple_len) # [batch_size, decoder_len, triple_num, triple_len] # marks with 1 the timesteps whose response word used a knowledge triple use_triples = tf.reduce_sum(one_hot_triples, axis=[2, 3]) # [batch_size, decoder_len] # hash table mapping words to indices self.symbol2index = MutableHashTable( key_dtype=tf.string, # dtype of the key tensor value_dtype=tf.int64, # dtype of the value tensor default_value=UNK_ID, # default value for missing keys shared_name= "in_table", # If non-empty, this table will be shared under the given name across multiple sessions name="in_table", # op name checkpoint=True ) # if True, the contents of the table are saved to and restored from checkpoints. If shared_name is empty for a checkpointed table, it is shared using the table node name. 
# hash table mapping indices to words self.index2symbol = MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value='_UNK', shared_name="out_table", name="out_table", checkpoint=True) # hash table mapping entities to indices self.entity2index = MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64, default_value=NONE_ID, shared_name="entity_in_table", name="entity_in_table", checkpoint=True) # hash table mapping indices to entities self.index2entity = MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value='_NONE', shared_name="entity_out_table", name="entity_out_table", checkpoint=True) self.posts_word_id = self.symbol2index.lookup( self.posts) # [batch_size, encoder_len] self.posts_entity_id = self.entity2index.lookup( self.posts) # [batch_size, encoder_len] self.responses_target = self.symbol2index.lookup( self.responses) # [batch_size, decoder_len] # decoder batch_size and decoder_len batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape( self.responses)[1] # drop the last column of responses_target and prepend GO_ID as the first column self.responses_word_id = tf.concat([ tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID, tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0] ], 1) # [batch_size, decoder_len] # build the response mask self.decoder_mask = tf.reshape( tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len), reverse=True, axis=1), [-1, decoder_len]) # [batch_size, decoder_len] # initialize the word and entity embeddings: use the tensors passed in if given, otherwise initialize randomly if embed is None: self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32) else: self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed) if entity_embed is None: # the entity embeddings are not updated during training self.entity_trans = tf.get_variable( 'entity_embed', [num_entities, num_trans_units], tf.float32, trainable=False) else: self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32, initializer=entity_embed, trainable=False) # pass the entity embeddings through a dense layer self.entity_trans_transformed = tf.layers.dense( self.entity_trans, num_trans_units, activation=tf.tanh, name='trans_transformation') # add embeddings for the 7 symbols ['_NONE', '_PAD_H', '_PAD_R', '_PAD_T', '_NAF_H', '_NAF_R', '_NAF_T'] padding_entity = tf.get_variable('entity_padding_embed', [7, num_trans_units], dtype=tf.float32, initializer=tf.zeros_initializer()) self.entity_embed = tf.concat( [padding_entity, self.entity_trans_transformed], axis=0) # triples_embedding: [batch_size, triple_num, triple_len, 3*num_trans_units] embeddings of the knowledge-graph triples triples_embedding = tf.reshape( tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.triples)), [encoder_batch_size, triple_num, -1, 3 * num_trans_units]) # entities_word_embedding: [batch_size, triple_num*triple_len, num_embed_units] word embeddings of all entities used in the knowledge graphs entities_word_embedding = tf.reshape( tf.nn.embedding_lookup(self.embed, self.symbol2index.lookup(self.entities)), [encoder_batch_size, -1, num_embed_units]) # split the knowledge triples into head, relation and tail [batch_size, triple_num, triple_len, num_trans_units] head, relation, tail = tf.split(triples_embedding, [num_trans_units] * 3, axis=3) # static graph attention with tf.variable_scope('graph_attention'): # concatenate head and tail [batch_size, triple_num, triple_len, 2*num_trans_units] head_tail = tf.concat([head, tail], axis=3) # feed head_tail through a dense layer [batch_size, triple_num, triple_len, num_trans_units] head_tail_transformed = tf.layers.dense(head_tail, num_trans_units, activation=tf.tanh, name='head_tail_transform') # feed relation through a dense layer [batch_size, triple_num, triple_len, num_trans_units] relation_transformed = tf.layers.dense(relation, num_trans_units, name='relation_transform') # the inner product of the head_tail and relation vectors gives the attention logits over the triples e_weight = tf.reduce_sum( 
relation_transformed * head_tail_transformed, axis=3) # [batch_size, triple_num, triple_len] alpha_weight = tf.nn.softmax( e_weight) # [batch_size, triple_num, triple_len] # tf.expand_dims gives alpha_weight one extra dimension [batch_size, triple_num, triple_len, 1] # summing over dimension 2 yields the static graph representation graph_embed = tf.reduce_sum( tf.expand_dims(alpha_weight, 3) * head_tail, axis=2) # [batch_size, triple_num, 2*num_trans_units] """graph_embed_input 1. First the 1-D range list [0, 1, 2, ..., encoder_batch_size) is reshaped into a 3-D [encoder_batch_size, 1, 1] matrix: [[[0]], [[1]], [[2]], ...] 2. tf.tile then replicates dimension 1 encoder_len times, giving [encoder_batch_size, encoder_len, 1]: [[[0], [0], ...], ...] 3. Concatenating with posts_triple: [batch_size, encoder_len, 1] along dimension 2 forms an indices matrix [batch_size, encoder_len, 2]: [ [[0 0], [0 0], [0 0], [0 0], [0 1], [0 0], [0 2], [0 0], ...encoder_len], [[1 0], [1 0], [1 0], [1 0], [1 1], [1 0], [1 2], [1 0], ...encoder_len], [[2 0], [2 0], [2 0], [2 0], [2 1], [2 0], [2 2], [2 0], ...encoder_len], ...batch_size ] 4. tf.gather_nd uses these indices to look up graph_embed: [batch_size, triple_num, 2*num_trans_units] and fills the results back into the indices matrix. The last dimension of the indices matrix is 2; an entry such as [0, 2] means the first batch item used graph 2 at this timestep, so that knowledge graph's static vector is filled in at position [0, 2]. The result has shape [encoder_batch_size, encoder_len, 2*num_trans_units]: the static graph vector used at every timestep. """ # graph_embed_input = tf.gather_nd(graph_embed, tf.concat( # [tf.tile(tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32), [-1, 1, 1]), [1, encoder_len, 1]), # self.posts_triple], # axis=2)) # turn responses_triple into entity embeddings [batch_size, decoder_len, 300], marking which triple's embedding each response timestep used # triple_embed_input = tf.reshape( # tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.responses_triple)), # [batch_size, decoder_len, 3 * num_trans_units]) post_word_input = tf.nn.embedding_lookup( self.embed, self.posts_word_id) # [batch_size, encoder_len, num_embed_units] response_word_input = tf.nn.embedding_lookup( self.embed, self.responses_word_id ) # [batch_size, decoder_len, num_embed_units] # concatenating post_word_input and graph_embed_input forms the encoder input [batch_size, encoder_len, num_embed_units+2*num_trans_units] # self.encoder_input = tf.concat([post_word_input, graph_embed_input], axis=2) # concatenating response_word_input and triple_embed_input forms the decoder input [batch_size, decoder_len, num_embed_units+3*num_trans_units] # self.decoder_input = tf.concat([response_word_input, triple_embed_input], axis=2) encoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) decoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) # rnn encoder # encoder_state: [num_layers, 2, batch_size, num_units] for LSTM; [num_layers, batch_size, num_units] for GRU encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell, post_word_input, self.posts_length, dtype=tf.float32, scope="encoder") # self.encoder_state_shape = tf.shape(encoder_state) ######## memory network ### response_encoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) response_encoder_output, response_encoder_state = tf.nn.dynamic_rnn( response_encoder_cell, response_word_input, self.responses_length, dtype=tf.float32, scope="response_encoder") # graph_embed: [batch_size, triple_num, 2*num_trans_units] static graph vectors # encoder_state: [num_layers, batch_size, num_units] with tf.variable_scope("post_memory_network"): # transform the static knowledge graph into memory input vectors m post_input = tf.layers.dense(graph_embed, memory_units, use_bias=False, name="post_weight_a") post_input = tf.tile( tf.reshape(post_input, (1, encoder_batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # transform the static knowledge into memory output vectors c post_output = 
tf.layers.dense(graph_embed, memory_units, use_bias=False, name="post_weight_c") post_output = tf.tile( tf.reshape(post_output, (1, encoder_batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # transform the question into state vectors u encoder_hidden_state = tf.reshape( tf.concat(encoder_state, axis=0), (num_layers, encoder_batch_size, num_units)) post_state = tf.layers.dense(encoder_hidden_state, memory_units, use_bias=False, name="post_weight_b") post_state = tf.tile( tf.reshape(post_state, (num_layers, encoder_batch_size, 1, memory_units)), multiples=( 1, 1, triple_num, 1)) # [num_layers, batch_size, triple_num, memory_units] # probabilities p post_p = tf.reshape( tf.nn.softmax(tf.reduce_sum(post_state * post_input, axis=3)), (num_layers, encoder_batch_size, triple_num, 1)) # [num_layers, batch_size, triple_num, 1] # output o post_o = tf.reduce_sum( post_output * post_p, axis=2) # [num_layers, batch_size, memory_units] post_xstar = tf.concat( [ tf.layers.dense(post_o, memory_units, use_bias=False, name="post_weight_r"), encoder_state ], axis=2) # [num_layers, batch_size, num_units+memory_units] with tf.variable_scope("response_memory_network"): # transform the static knowledge graph into memory input vectors m response_input = tf.layers.dense(graph_embed, memory_units, use_bias=False, name="response_weight_a") response_input = tf.tile( tf.reshape(response_input, (1, batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # transform the static knowledge into memory output vectors c response_output = tf.layers.dense(graph_embed, memory_units, use_bias=False, name="response_weight_c") response_output = tf.tile( tf.reshape(response_output, (1, batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # transform the question into state vectors u response_hidden_state = tf.reshape( tf.concat(response_encoder_state, axis=0), (num_layers, batch_size, num_units)) response_state = tf.layers.dense(response_hidden_state, memory_units, use_bias=False, name="response_weight_b") response_state = tf.tile( tf.reshape(response_state, (num_layers, batch_size, 1, memory_units)), multiples=( 1, 1, triple_num, 1)) # [num_layers, batch_size, triple_num, memory_units] # probabilities p response_p = tf.reshape( tf.nn.softmax( tf.reduce_sum(response_state * response_input, axis=3)), (num_layers, batch_size, triple_num, 1)) # [num_layers, batch_size, triple_num, 1] # output o response_o = tf.reduce_sum( response_output * response_p, axis=2) # [num_layers, batch_size, memory_units] response_ystar = tf.concat( [ tf.layers.dense(response_o, memory_units, use_bias=False, name="response_weight_r"), response_encoder_state ], axis=2) # [num_layers, batch_size, num_units+memory_units] with tf.variable_scope("memory_network"): memory_hidden_state = tf.layers.dense(tf.concat( [post_xstar, response_ystar], axis=2), num_units, use_bias=False, activation=tf.tanh, name="output_weight") memory_hidden_state = tf.reshape( memory_hidden_state, (num_layers * batch_size, num_units)) # [num_layers, batch_size, num_units] memory_hidden_state = tuple( tf.split(memory_hidden_state, [batch_size] * num_layers, axis=0)) # self.memory_hidden_state_shape = tf.shape(memory_hidden_state) ######## ### output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss =\ output_projection_layer(num_units, num_symbols, num_samples) ######## decoder used for training ### with tf.variable_scope('decoder'): attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \ = 
prepare_attention(encoder_output, 'bahdanau', num_units, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use) # function that handles each timestep's output and the next timestep's input during training decoder_fn_train = attention_decoder_fn_train( memory_hidden_state, attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init, output_alignments=output_alignments and mem_use, max_length=tf.reduce_max(self.responses_length)) self.decoder_output, _, alignments_ta = dynamic_rnn_decoder( decoder_cell, decoder_fn_train, response_word_input, self.responses_length, scope="decoder_rnn") if output_alignments: self.alignments = tf.transpose(alignments_ta.stack(), perm=[1, 0, 2, 3]) self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss( self.decoder_output, self.responses_target, self.decoder_mask, self.alignments, triples_embedding, use_triples, one_hot_triples) self.sentence_ppx = tf.identity(self.sentence_ppx, name='ppx_loss') else: self.decoder_loss = sequence_loss(self.decoder_output, self.responses_target, self.decoder_mask) ######## ### ######## decoder used for inference ### with tf.variable_scope('decoder', reuse=True): attention_keys, attention_values, attention_score_fn, attention_construct_fn \ = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use) decoder_fn_inference = \ attention_decoder_fn_inference(output_fn, memory_hidden_state, attention_keys, attention_values, attention_score_fn, attention_construct_fn, self.embed, GO_ID, EOS_ID, max_length, num_symbols, imem=(entities_word_embedding, # imem: ([batch_size,triple_num*triple_len,num_embed_units], tf.reshape(triples_embedding, [encoder_batch_size, -1, 3*num_trans_units])), # [encoder_batch_size, triple_num*triple_len, 3*num_trans_units]) tuple of the entity word embeddings and the triple embeddings selector_fn=selector_fn) # decoder_distribution: [batch_size, decoder_len, num_symbols] # output_ids_ta: tensorarray: decoder_len [batch_size] self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder( decoder_cell, decoder_fn_inference, scope="decoder_rnn") output_len = tf.shape(self.decoder_distribution)[1] # decoder_len output_ids = tf.transpose( output_ids_ta.gather( tf.range(output_len))) # [batch_size, decoder_len] # clip the value range of the outputs, since negative values mean an entity word was used word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols), tf.int64) # [batch_size, decoder_len] # compute the actual positions of the entity words within entities [batch_size, decoder_len] # 1. tf.shape(entities_word_embedding)[1] = triple_num*triple_len # 2. tf.range(encoder_batch_size): [batch_size] # 3. tf.reshape(tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]): [batch_size, 1] base addresses of the entity words within entities # 4. tf.clip_by_value(-output_ids, 0, num_symbols): [batch_size, decoder_len] offsets of the entity words within entities # 5. entity_ids: [batch_size, decoder_len] actual positions of the entity words within entities entity_ids = tf.reshape( tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape( tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1]) # compute the entity words actually used [batch_size, decoder_len] # 1. entities: [batch_size, triple_num, triple_len] # 2. tf.reshape(self.entities, [-1]): [batch_size*triple_num*triple_len] # 3. tf.gather: [batch_size*decoder_len] # 4. entities: [batch_size, decoder_len] entities = tf.reshape( tf.gather(tf.reshape(self.entities, [-1]), entity_ids), [-1, output_len]) words = self.index2symbol.lookup(word_ids) # convert the ids into actual words # output_ids > 0 is a bool tensor; positions that are True are replaced by the word at that position in words self.generation = tf.where(output_ids > 0, words, entities) self.generation = 
tf.identity( self.generation, name='generation') # [batch_size, decoder_len] ######## ### # set up training self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32) # the decayed learning rate is not actually used self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) # number of parameter updates self.global_step = tf.Variable(0, trainable=False) # parameters to train self.params = tf.global_variables() # choose the optimizer opt = tf.train.AdamOptimizer(learning_rate=learning_rate) self.lr = opt._lr # gradients of decoder_loss with respect to params gradients = tf.gradients(self.decoder_loss, self.params) # gradient clipping clipped_gradients, self.gradient_norm = tf.clip_by_global_norm( gradients, max_gradient_norm) self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step) # log the loss tf.summary.scalar('decoder_loss', self.decoder_loss) for each in tf.trainable_variables(): tf.summary.histogram(each.name, each) # track how the variables evolve during training self.merged_summary_op = tf.summary.merge_all() self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0) self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=1000, pad_step_number=True)
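# The static graph attention above scores each triple by the inner product of a
# transformed relation vector with a transformed [head; tail] vector, then takes
# the alpha-weighted sum of [head; tail] as the per-graph vector. A minimal numpy
# sketch of that scoring, with random matrices standing in for the two
# tf.layers.dense transforms (biases omitted for brevity); shapes follow the
# comments above, and this is an illustration rather than the model's own code:
import numpy as np

rng = np.random.default_rng(0)
triple_num, triple_len, d = 2, 4, 8  # toy sizes; d plays the role of num_trans_units
head, rel, tail = (rng.normal(size=(triple_num, triple_len, d)) for _ in range(3))

head_tail = np.concatenate([head, tail], axis=-1)            # [num, len, 2d]
W_ht, W_r = rng.normal(size=(2 * d, d)), rng.normal(size=(d, d))
ht_t = np.tanh(head_tail @ W_ht)                             # dense + tanh
rel_t = rel @ W_r                                            # dense, linear
e = np.sum(rel_t * ht_t, axis=-1)                            # attention logits [num, len]
alpha = np.exp(e) / np.exp(e).sum(axis=-1, keepdims=True)    # softmax over triple_len
graph_embed = np.sum(alpha[..., None] * head_tail, axis=1)   # [num, 2d]
print(graph_embed.shape)  # (2, 16)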
def build(self, for_deploy, variants=""): conf = self.conf name = self.name job_type = self.job_type dtype = self.dtype self.beam_size = 1 if (not for_deploy or variants == "score") else sum( self.conf.beam_splits) graphlg.info("Creating placeholders...") self.enc_str_inps = tf.placeholder(tf.string, shape=(None, conf.input_max_len), name="enc_inps") self.enc_lens = tf.placeholder(tf.int32, shape=[None], name="enc_lens") self.dec_str_inps = tf.placeholder( tf.string, shape=[None, conf.output_max_len + 2], name="dec_inps") self.dec_lens = tf.placeholder(tf.int32, shape=[None], name="dec_lens") self.down_wgts = tf.placeholder(tf.float32, shape=[None], name="down_wgts") with tf.name_scope("TableLookup"): # lookup tables self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64, default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True) self.out_table = lookup.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value="_UNK", shared_name="out_table", name="out_table", checkpoint=True) self.enc_inps = self.in_table.lookup(self.enc_str_inps) self.dec_inps = self.in_table.lookup(self.dec_str_inps) # Create encode graph and get attn states graphlg.info("Creating embeddings and embedding enc_inps.") with ops.device("/cpu:0"): self.embedding = variable_scope.get_variable( "embedding", [conf.output_vocab_size, conf.embedding_size]) with tf.name_scope("Embed") as scope: dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1]) with ops.device("/cpu:0"): self.emb_inps = embedding_lookup_unique( self.embedding, self.enc_inps) emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps) # output projector (w, b) with tf.variable_scope("OutProj"): if conf.out_layer_size: w = tf.get_variable( "proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype) elif conf.bidirectional: w = tf.get_variable( "proj_w", [conf.num_units * 2, conf.output_vocab_size], dtype=dtype) else: w = tf.get_variable("proj_w", [conf.num_units, conf.output_vocab_size], dtype=dtype) b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype) graphlg.info("Creating dynamic rnn...") self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN( conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps, self.enc_lens, keep_prob=1.0, bidi=conf.bidirectional, name_scope="DynRNNEncoder") batch_size = tf.shape(self.enc_outs)[0] # Run a VAE on each layer of the encoder state final_enc_states = [] KLDs = 0.0 for each in self.enc_states: z, KLD, l2 = CreateVAE([each], self.conf.enc_latent_dim, name_scope="VAE") if isinstance(each, LSTMStateTuple): final_enc_states.append( LSTMStateTuple(each.c, tf.concat([each.h, z], 1))) else: final_enc_states.append(tf.concat([z, each], 1)) KLDs += KLD / self.conf.num_layers with tf.name_scope("DynRNNDecode") as scope: with tf.name_scope("ShapeToBeam") as scope: beam_memory = tf.reshape( tf.tile(self.enc_outs, [1, 1, self.beam_size]), [-1, conf.input_max_len, mem_size]) beam_memory_lens = tf.squeeze( tf.reshape( tf.tile(tf.expand_dims(self.enc_lens, 1), [1, self.beam_size]), [-1, 1]), 1) def _to_beam(t): return tf.reshape(tf.tile(t, [1, self.beam_size]), [-1, int(t.get_shape()[1])]) beam_init_states = tf.contrib.framework.nest.map_structure( _to_beam, final_enc_states) max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2 cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size, num_layers=conf.num_layers, attn_type=self.conf.attention, memory=beam_memory, 
mem_lens=beam_memory_lens, max_mem_size=max_mem_size, addmem=self.conf.addmem, keep_prob=conf.keep_prob, dtype=tf.float32, name_scope="AttnCell") dec_init_state = DecStateInit(all_enc_states=beam_init_states, decoder_cell=cell, batch_size=batch_size * self.beam_size, init_type="each2each") if not for_deploy: hp_train = helper.ScheduledEmbeddingTrainingHelper( inputs=emb_dec_inps, sequence_length=self.dec_lens, embedding=self.embedding, sampling_probability=self.conf.sample_prob, out_proj=(w, b)) output_layer = layers_core.Dense( self.conf.out_layer_size, use_bias=True) if self.conf.out_layer_size else None my_decoder = basic_decoder.BasicDecoder( cell=cell, helper=hp_train, initial_state=dec_init_state, output_layer=output_layer) cell_outs, final_state = decoder.dynamic_decode( decoder=my_decoder, impute_finished=False, maximum_iterations=conf.output_max_len + 1, scope=scope) elif variants == "score": hp_train = helper.ScheduledEmbeddingTrainingHelper( inputs=emb_dec_inps, sequence_length=self.dec_lens, embedding=self.embedding, sampling_probability=0.0, out_proj=(w, b)) output_layer = layers_core.Dense( self.conf.out_layer_size, use_bias=True) if self.conf.out_layer_size else None my_decoder = score_decoder.ScoreDecoder( cell=cell, helper=hp_train, out_proj=(w, b), initial_state=dec_init_state, output_layer=output_layer) cell_outs, final_state = decoder.dynamic_decode( decoder=my_decoder, scope=scope, maximum_iterations=self.conf.output_max_len, impute_finished=False) else: hp_infer = helper.GreedyEmbeddingHelper( embedding=self.embedding, start_tokens=tf.ones(shape=[batch_size * self.beam_size], dtype=tf.int32), end_token=EOS_ID, out_proj=(w, b)) output_layer = layers_core.Dense( self.conf.out_layer_size, use_bias=True) if self.conf.out_layer_size else None my_decoder = beam_decoder.BeamDecoder( cell=cell, helper=hp_infer, out_proj=(w, b), initial_state=dec_init_state, beam_splits=self.conf.beam_splits, max_res_num=self.conf.max_res_num, output_layer=output_layer) cell_outs, final_state = decoder.dynamic_decode( decoder=my_decoder, scope=scope, maximum_iterations=self.conf.output_max_len, impute_finished=True) if not for_deploy: outputs = cell_outs.rnn_output # Outputs projected to logits L = tf.shape(outputs)[1] outputs = tf.reshape(outputs, [-1, int(w.shape[0])]) outputs = tf.matmul(outputs, w) + b logits = tf.reshape(outputs, [-1, L, int(w.shape[1])]) # branch 1 for debugging; does not need to be run with tf.name_scope("DebugOutputs") as scope: self.outputs = tf.argmax(logits, axis=2) self.outputs = tf.reshape(self.outputs, [-1, L]) self.outputs = self.out_table.lookup( tf.cast(self.outputs, tf.int64)) with tf.name_scope("Loss") as scope: tars = tf.slice(self.dec_inps, [0, 1], [-1, L]) wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1, reverse=True) #wgts = wgts * tf.expand_dims(self.down_wgts, 1) self.loss = loss.sequence_loss(logits=logits, targets=tars, weights=wgts, average_across_timesteps=False, average_across_batch=False) example_losses = tf.reduce_sum(self.loss, 1) batch_wgt = tf.reduce_sum(self.down_wgts) see_KLD = tf.reduce_sum(KLDs * self.down_wgts) / batch_wgt see_loss = tf.reduce_sum(example_losses / tf.cast( self.dec_lens, tf.float32) * self.down_wgts) / batch_wgt # not averaged over length self.loss = tf.reduce_sum( (example_losses + self.conf.kld_ratio * KLDs) * self.down_wgts) / batch_wgt with tf.name_scope(self.model_kind): tf.summary.scalar("loss", see_loss) tf.summary.scalar("kld", see_KLD) graph_nodes = { "loss": 
self.loss, "inputs": {}, "outputs": {}, "debug_outputs": self.outputs } elif variants == "score": L = tf.shape(cell_outs.logprobs)[1] one_hot = tf.one_hot(tf.slice(self.dec_inps, [0, 1], [-1, L]), depth=self.conf.output_vocab_size, axis=-1, on_value=1.0, off_value=0.0) outputs = tf.reduce_sum(cell_outs.logprobs * one_hot, 2) outputs = tf.reduce_sum(outputs, axis=1) inputs = { "enc_inps:0": self.enc_str_inps, "enc_lens:0": self.enc_lens, "dec_inps:0": self.dec_str_inps, "dec_lens:0": self.dec_lens } graph_nodes = { "loss": None, "inputs": inputs, "outputs": { "logprobs": outputs }, "visualize": None } else: L = tf.shape(cell_outs.beam_ends)[1] beam_symbols = cell_outs.beam_symbols beam_parents = cell_outs.beam_parents beam_ends = cell_outs.beam_ends beam_end_parents = cell_outs.beam_end_parents beam_end_probs = cell_outs.beam_end_probs alignments = cell_outs.alignments beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L]) beam_end_parents = tf.reshape( tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L]) beam_end_probs = tf.reshape( tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L]) ## Creating tail_ids batch_size = tf.Print(batch_size, [batch_size], message="VAERNN2 batch") batch_offset = tf.expand_dims( tf.cumsum( tf.ones([batch_size, self.beam_size], dtype=tf.int32) * self.beam_size, axis=0, exclusive=True), 2) offset2 = tf.expand_dims( tf.cumsum( tf.ones([batch_size, self.beam_size * 2], dtype=tf.int32) * self.beam_size, axis=0, exclusive=True), 2) out_len = tf.shape(beam_symbols)[1] self.beam_symbol_strs = tf.reshape( self.out_table.lookup(tf.cast(beam_symbols, tf.int64)), [batch_size, self.beam_size, -1]) self.beam_parents = tf.reshape( beam_parents, [batch_size, self.beam_size, -1]) - batch_offset self.beam_ends = tf.reshape(beam_ends, [batch_size, self.beam_size * 2, -1]) self.beam_end_parents = tf.reshape( beam_end_parents, [batch_size, self.beam_size * 2, -1]) - offset2 self.beam_end_probs = tf.reshape( beam_end_probs, [batch_size, self.beam_size * 2, -1]) self.beam_attns = tf.reshape( alignments, [batch_size, self.beam_size, out_len, -1]) inputs = { "enc_inps:0": self.enc_str_inps, "enc_lens:0": self.enc_lens } outputs = { "beam_symbols": self.beam_symbol_strs, "beam_parents": self.beam_parents, "beam_ends": self.beam_ends, "beam_end_parents": self.beam_end_parents, "beam_end_probs": self.beam_end_probs, "beam_attns": self.beam_attns } graph_nodes = { "loss": None, "inputs": inputs, "outputs": outputs, "visualize": { "z": z } } return graph_nodes
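# Both VAE builds tile encoder tensors out to the beam with the same idiom:
# tf.reshape(tf.tile(t, [1, beam_size]), [-1, dim]). Tiling along axis 1 and then
# flattening repeats each example's row beam_size times contiguously, so all beam
# slots of one example stay adjacent -- which is exactly what the batch_offset
# arithmetic above assumes when it subtracts per-example base offsets from the
# parent ids. numpy equivalent of _to_beam:
import numpy as np

def to_beam(t, beam_size):
    return np.tile(t, (1, beam_size)).reshape(-1, t.shape[1])

t = np.array([[1., 2.], [3., 4.]])
print(to_beam(t, 3))
# [[1. 2.]
#  [1. 2.]
#  [1. 2.]
#  [3. 4.]
#  [3. 4.]
#  [3. 4.]]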
def __init__(self, num_symbols, num_embed_units, num_units, num_layers, embed, entity_embed=None, num_entities=0, num_trans_units=100, learning_rate=0.0001, learning_rate_decay_factor=0.95, max_gradient_norm=5.0, num_samples=500, max_length=60, mem_use=True, output_alignments=True, use_lstm=False): # placeholders for the input data self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps') # batch*len self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens') # batch self.responses = tf.placeholder(tf.string, (None, None), 'dec_inps') # batch*len self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens') # batch self.entities = tf.placeholder(tf.string, (None, None, None), 'entities') # batch self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks') # batch self.triples = tf.placeholder(tf.string, (None, None, None, 3), 'triples') # batch self.posts_triple = tf.placeholder(tf.int32, (None, None, 1), 'enc_triples') # batch self.responses_triple = tf.placeholder(tf.string, (None, None, 3), 'dec_triples') # batch self.match_triples = tf.placeholder(tf.int32, (None, None, None), 'match_triples') # batch encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts)) triple_num = tf.shape(self.triples)[1] triple_len = tf.shape(self.triples)[2] one_hot_triples = tf.one_hot(self.match_triples, triple_len) use_triples = tf.reduce_sum(one_hot_triples, axis=[2, 3]) # build the vocabulary lookup tables (index to string, string to index) self.symbol2index = MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64, default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True) self.index2symbol = MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value='_UNK', shared_name="out_table", name="out_table", checkpoint=True) self.entity2index = MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64, default_value=NONE_ID, shared_name="entity_in_table", name="entity_in_table", checkpoint=True) self.index2entity = MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value='_NONE', shared_name="entity_out_table", name="entity_out_table", checkpoint=True) self.posts_word_id = self.symbol2index.lookup(self.posts) # batch*len self.posts_entity_id = self.entity2index.lookup( self.posts) # batch*len #self.posts_word_id = tf.Print(self.posts_word_id, ['use_triples', use_triples, 'one_hot_triples', one_hot_triples], summarize=1e6) self.responses_target = self.symbol2index.lookup( self.responses) # batch*len batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape( self.responses)[1] self.responses_word_id = tf.concat([ tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID, tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0] ], 1) # batch*len self.decoder_mask = tf.reshape( tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len), reverse=True, axis=1), [-1, decoder_len]) # build the word-embedding table (index to vector) if embed is None: # randomly initialize the word embeddings self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32) else: # initialize from pre-trained word vectors (GloVe or Word2Vec) self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed) if entity_embed is None: # randomly initialize the entity embeddings self.entity_trans = tf.get_variable( 'entity_embed', [num_entities, num_trans_units], tf.float32, trainable=False) else: # initialize from pre-trained entity embeddings self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32, initializer=entity_embed, trainable=False) self.entity_trans_transformed = tf.layers.dense( self.entity_trans, 
num_trans_units, activation=tf.tanh, name='trans_transformation') padding_entity = tf.get_variable('entity_padding_embed', [7, num_trans_units], dtype=tf.float32, initializer=tf.zeros_initializer()) self.entity_embed = tf.concat( [padding_entity, self.entity_trans_transformed], axis=0) triples_embedding = tf.reshape( tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.triples)), [encoder_batch_size, triple_num, -1, 3 * num_trans_units]) entities_word_embedding = tf.reshape( tf.nn.embedding_lookup(self.embed, self.symbol2index.lookup(self.entities)), [encoder_batch_size, -1, num_embed_units]) head, relation, tail = tf.split(triples_embedding, [num_trans_units] * 3, axis=3) # static attention of the knowledge-fusion layer with tf.variable_scope('graph_attention'): # concatenate head and tail head_tail = tf.concat([head, tail], axis=3) # merge head and tail into one vector head_tail_transformed = tf.layers.dense(head_tail, num_trans_units, activation=tf.tanh, name='head_tail_transform') # relation vector relation_transformed = tf.layers.dense(relation, num_trans_units, name='relation_transform') # compute the attention weights from relation and head_tail e_weight = tf.reduce_sum(relation_transformed * head_tail_transformed, axis=3) # normalize the attention weights alpha_weight = tf.nn.softmax(e_weight) # weighted sum of head_tail with the attention weights graph_embed = tf.reduce_sum(tf.expand_dims(alpha_weight, 3) * head_tail, axis=2) graph_embed_input = tf.gather_nd( graph_embed, tf.concat([ tf.tile( tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32), [-1, 1, 1]), [1, encoder_len, 1]), self.posts_triple ], axis=2)) triple_embed_input = tf.reshape( tf.nn.embedding_lookup( self.entity_embed, self.entity2index.lookup(self.responses_triple)), [batch_size, decoder_len, 3 * num_trans_units]) post_word_input = tf.nn.embedding_lookup( self.embed, self.posts_word_id) # batch*len*unit response_word_input = tf.nn.embedding_lookup( self.embed, self.responses_word_id) # batch*len*unit # concatenate the graph information computed by the attention mechanism into the input sentence self.encoder_input = tf.concat([post_word_input, graph_embed_input], axis=2) # concatenate all graph information into the output sentence self.decoder_input = tf.concat( [response_word_input, triple_embed_input], axis=2) # the encoder uses GRUCell with num_layers layers encoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) # the decoder uses GRUCell with num_layers layers decoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) # wrap the RNN encoder encoder_output, encoder_state = dynamic_rnn(encoder_cell, self.encoder_input, self.posts_length, dtype=tf.float32, scope="encoder") # get output projection function output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = output_projection_layer( num_units, num_symbols, num_samples) # decoder with tf.variable_scope('decoder'): # get the attention functions attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \ = prepare_attention(encoder_output, 'bahdanau', num_units, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use) # 'luong', num_units) decoder_fn_train = attention_decoder_fn_train( encoder_state, attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init, output_alignments=output_alignments and mem_use, max_length=tf.reduce_max(self.responses_length)) self.decoder_output, _, alignments_ta = dynamic_rnn_decoder( decoder_cell, decoder_fn_train, self.decoder_input, self.responses_length, scope="decoder_rnn") if output_alignments: self.alignments = tf.transpose(alignments_ta.stack(), perm=[1, 0, 2, 3]) self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss( self.decoder_output, 
self.responses_target, self.decoder_mask, self.alignments, triples_embedding, use_triples, one_hot_triples) self.sentence_ppx = tf.identity(self.sentence_ppx, name='ppx_loss') else: self.decoder_loss = sequence_loss(self.decoder_output, self.responses_target, self.decoder_mask) with tf.variable_scope('decoder', reuse=True): # get the attention functions attention_keys, attention_values, attention_score_fn, attention_construct_fn \ = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use) # 'luong', num_units) decoder_fn_inference = attention_decoder_fn_inference( output_fn, encoder_state, attention_keys, attention_values, attention_score_fn, attention_construct_fn, self.embed, GO_ID, EOS_ID, max_length, num_symbols, imem=(entities_word_embedding, tf.reshape( triples_embedding, [encoder_batch_size, -1, 3 * num_trans_units])), selector_fn=selector_fn) self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder( decoder_cell, decoder_fn_inference, scope="decoder_rnn") output_len = tf.shape(self.decoder_distribution)[1] output_ids = tf.transpose( output_ids_ta.gather(tf.range(output_len))) word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols), tf.int64) entity_ids = tf.reshape( tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape( tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1]) entities = tf.reshape( tf.gather(tf.reshape(self.entities, [-1]), entity_ids), [-1, output_len]) words = self.index2symbol.lookup(word_ids) # generate the response sentence for output self.generation = tf.where(output_ids > 0, words, entities) self.generation = tf.identity(self.generation, name='generation') # initialize the training process self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32) self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) self.params = tf.global_variables() # use the Adam optimizer: computationally efficient, smooth gradients, easy to tune opt = tf.train.AdamOptimizer(learning_rate=learning_rate) self.lr = opt._lr gradients = tf.gradients(self.decoder_loss, self.params) clipped_gradients, self.gradient_norm = tf.clip_by_global_norm( gradients, max_gradient_norm) self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step) tf.summary.scalar('decoder_loss', self.decoder_loss) for each in tf.trainable_variables(): tf.summary.histogram(each.name, each) self.merged_summary_op = tf.summary.merge_all() self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0) self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=1000, pad_step_number=True)
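# During inference this model emits one integer stream in which non-negative ids
# are vocabulary words and negative ids are negated positions into the flattened
# per-example entity list; the block above recovers tokens by clipping both ranges,
# adding a per-example base address, and picking per position with
# tf.where(output_ids > 0, words, entities). A numpy sketch of the same decode with
# a toy vocabulary and entity table (illustrative data, not the repo's):
import numpy as np

vocab = np.array(["_PAD", "hello", "world"])
entities = np.array([["Paris", "France"], ["Tokyo", "Japan"]])  # [batch, ents]

output_ids = np.array([[1, -1, 2], [2, 0, -1]])     # negatives index into entities
word_ids = np.clip(output_ids, 0, len(vocab) - 1)
base = (np.arange(2) * entities.shape[1])[:, None]  # per-example base address
entity_ids = np.clip(-output_ids, 0, None) + base
picked_words = vocab[word_ids]
picked_ents = entities.reshape(-1)[entity_ids]
print(np.where(output_ids > 0, picked_words, picked_ents))
# [['hello' 'France' 'world']
#  ['world' 'Tokyo' 'Japan']]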