def resource_constant_model_fn(unused_features, unused_labels, mode):
  """A model_fn that loads a constant from a resource and serves it."""
  assert mode in (model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
                  model_fn.ModeKeys.INFER)
  const = constant_op.constant(-1, dtype=dtypes.int64)
  table = lookup.MutableHashTable(
      dtypes.string, dtypes.int64, const, name='LookupTableModel')
  if mode in (model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL):
    key = constant_op.constant(['key'])
    value = constant_op.constant([42], dtype=dtypes.int64)
    train_op_1 = table.insert(key, value)
    training_state = lookup.MutableHashTable(
        dtypes.string, dtypes.int64, const, name='LookupTableTrainingState')
    training_op_2 = training_state.insert(key, value)
    return const, const, control_flow_ops.group(train_op_1, training_op_2)
  if mode == model_fn.ModeKeys.INFER:
    key = constant_op.constant(['key'])
    prediction = table.lookup(key)
    return prediction, const, control_flow_ops.no_op()
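# A minimal sketch (assuming TF 1.x and tf.contrib.lookup, as in the code
# above) of the MutableHashTable round trip the model_fn relies on: keys
# inserted by the train op become visible to the inference-time lookup on the
# same table, and unseen keys fall back to default_value.
import tensorflow as tf
from tensorflow.contrib import lookup

table = lookup.MutableHashTable(tf.string, tf.int64, default_value=-1)
insert_op = table.insert(tf.constant(['key']), tf.constant([42], tf.int64))
found = table.lookup(tf.constant(['key', 'missing']))
with tf.Session() as sess:
    sess.run(insert_op)
    print(sess.run(found))  # [42 -1]; the unseen key yields default_value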
def testMutableHashTableIsLocal(self):
  with ops.device(
      estimator._get_replica_device_setter(run_config.RunConfig())):
    default_val = constant_op.constant([-1, -1], dtypes.int64)
    table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
    input_string = constant_op.constant(['brain', 'salad', 'tank'])
    output = table.lookup(input_string)
  self.assertDeviceEqual('', table._table_ref.device)
  self.assertDeviceEqual('', output.device)
def testMutableHashTableIsOnPs(self):
  tf_config = {'cluster': {run_config.TaskType.PS: ['fake_ps_0']}}
  with test.mock.patch.dict('os.environ',
                            {'TF_CONFIG': json.dumps(tf_config)}):
    config = run_config.RunConfig()
  with ops.device(estimator._get_replica_device_setter(config)):
    default_val = constant_op.constant([-1, -1], dtypes.int64)
    table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
    input_string = constant_op.constant(['brain', 'salad', 'tank'])
    output = table.lookup(input_string)
  self.assertDeviceEqual('/job:ps/task:0', table._table_ref.device)
  self.assertDeviceEqual('/job:ps/task:0', output.device)
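# The placement asserted above is standard replica-device-setter behavior:
# once a ps job exists, variables and table resources are pinned to /job:ps
# while other ops go to the worker. A sketch using the public API
# (tf.train.replica_device_setter; the tests above go through an
# estimator-internal wrapper around it):
import tensorflow as tf

with tf.device(tf.train.replica_device_setter(ps_tasks=1)):
    v = tf.get_variable('v', shape=[2], dtype=tf.int64)
    doubled = v * 2
print(v.device)        # /job:ps/task:0
print(doubled.device)  # /job:worker (the default worker_device)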
def build(self, inputs, for_deploy):
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_size = 1 if (not for_deploy or self.conf.variants == "score") \
                       else sum(self.conf.beam_splits)
    conf.keep_prob = conf.keep_prob if not for_deploy else 1.0

    self.enc_str_inps = inputs["enc_inps:0"]
    self.dec_str_inps = inputs["dec_inps:0"]
    self.enc_lens = inputs["enc_lens:0"]
    self.dec_lens = inputs["dec_lens:0"]
    #self.down_wgts = inputs["down_wgts:0"]

    with tf.name_scope("TableLookup"):
        # lookup tables
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                                value_dtype=tf.int64,
                                                default_value=UNK_ID,
                                                shared_name="in_table",
                                                name="in_table",
                                                checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                                 value_dtype=tf.string,
                                                 default_value="_UNK",
                                                 shared_name="out_table",
                                                 name="out_table",
                                                 checkpoint=True)
        self.enc_inps = self.in_table.lookup(self.enc_str_inps)
        self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.output_vocab_size, conf.embedding_size])

    with tf.name_scope("Embed") as scope:
        dec_inps = tf.slice(self.dec_inps, [0, 0],
                            [-1, conf.output_max_len + 1])
        with ops.device("/cpu:0"):
            self.emb_inps = embedding_lookup_unique(self.embedding,
                                                    self.enc_inps)
            emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    # output projector (w, b)
    with tf.variable_scope("OutProj"):
        if conf.out_layer_size:
            w = tf.get_variable(
                "proj_w", [conf.out_layer_size, conf.output_vocab_size],
                dtype=dtype)
        elif conf.bidirectional:
            w = tf.get_variable(
                "proj_w", [conf.num_units * 2, conf.output_vocab_size],
                dtype=dtype)
        else:
            w = tf.get_variable(
                "proj_w", [conf.num_units, conf.output_vocab_size],
                dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)

    graphlg.info("Creating dynamic rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(
        conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps,
        self.enc_lens, keep_prob=conf.keep_prob, bidi=conf.bidirectional,
        name_scope="DynRNNEncoder")
    batch_size = tf.shape(self.enc_outs)[0]

    # to modify the output states of all encoder layers for dec init
    final_enc_states = self.enc_states

    with tf.name_scope("DynRNNDecode") as scope:
        with tf.name_scope("ShapeToBeam") as scope:
            beam_memory = tf.reshape(
                tf.tile(self.enc_outs, [1, 1, self.beam_size]),
                [-1, conf.input_max_len, mem_size])
            beam_memory_lens = tf.squeeze(
                tf.reshape(
                    tf.tile(tf.expand_dims(self.enc_lens, 1),
                            [1, self.beam_size]), [-1, 1]), 1)

            def _to_beam(t):
                return tf.reshape(tf.tile(t, [1, self.beam_size]),
                                  [-1, int(t.get_shape()[1])])

            beam_init_states = tf.contrib.framework.nest.map_structure(
                _to_beam, final_enc_states)

        max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2
        cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size,
                        num_layers=conf.num_layers,
                        attn_type=self.conf.attention, memory=beam_memory,
                        mem_lens=beam_memory_lens, max_mem_size=max_mem_size,
                        addmem=self.conf.addmem, keep_prob=conf.keep_prob,
                        dtype=tf.float32, name_scope="AttnCell")
        dec_init_state = DecStateInit(all_enc_states=beam_init_states,
                                      decoder_cell=cell,
                                      batch_size=batch_size * self.beam_size,
                                      init_type=conf.dec_init_type,
                                      use_proj=conf.use_init_proj)

        if not for_deploy:
            hp_train = helper.ScheduledEmbeddingTrainingHelper(
                inputs=emb_dec_inps, sequence_length=self.dec_lens,
                embedding=self.embedding,
                sampling_probability=self.conf.sample_prob, out_proj=(w, b))
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            my_decoder = basic_decoder.BasicDecoder(
                cell=cell, helper=hp_train, initial_state=dec_init_state,
                output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, impute_finished=True,
                maximum_iterations=conf.output_max_len + 1, scope=scope)
        elif self.conf.variants == "score":
            hp_train = helper.ScheduledEmbeddingTrainingHelper(
                inputs=emb_dec_inps, sequence_length=self.dec_lens,
                embedding=self.embedding, sampling_probability=0.0,
                out_proj=(w, b))
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            my_decoder = score_decoder.ScoreDecoder(
                cell=cell, helper=hp_train, out_proj=(w, b),
                initial_state=dec_init_state, output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=True)
        else:
            hp_infer = helper.GreedyEmbeddingHelper(
                embedding=self.embedding,
                start_tokens=tf.ones(shape=[batch_size * self.beam_size],
                                     dtype=tf.int32),
                end_token=EOS_ID, out_proj=(w, b))
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            dec_init_state = beam_decoder.BeamState(
                tf.zeros([batch_size * self.beam_size]), dec_init_state,
                tf.zeros([batch_size * self.beam_size], tf.int32))
            my_decoder = beam_decoder.BeamDecoder(
                cell=cell, helper=hp_infer, out_proj=(w, b),
                initial_state=dec_init_state,
                beam_splits=self.conf.beam_splits,
                max_res_num=self.conf.max_res_num, output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=True)

    if not for_deploy:
        outputs = cell_outs.rnn_output
        # project decoder outputs to logits
        L = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, int(w.shape[0])])
        outputs = tf.matmul(outputs, w) + b
        logits = tf.reshape(outputs, [-1, L, int(w.shape[1])])

        # branch 1 for debugging, doesn't have to be called
        with tf.name_scope("DebugOutputs") as scope:
            self.outputs = tf.argmax(logits, axis=2)
            self.outputs = tf.reshape(self.outputs, [-1, L])
            self.outputs = self.out_table.lookup(
                tf.cast(self.outputs, tf.int64))

        with tf.name_scope("Loss") as scope:
            tars = tf.slice(self.dec_inps, [0, 1], [-1, L])
            # wgts could take a more elaborate form (e.g. partially
            # down-weighting a sequence); here every non-padding label
            # simply gets weight 1.0
            wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1,
                             reverse=True)
            #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
            loss_matrix = loss.sequence_loss(
                logits=logits, targets=tars, weights=wgts,
                average_across_timesteps=False, average_across_batch=False)
            self.loss = see_loss = (tf.reduce_sum(loss_matrix) /
                                    tf.reduce_sum(wgts))

        with tf.name_scope(self.model_kind):
            tf.summary.scalar("loss", see_loss)

        graph_nodes = {
            "loss": self.loss,
            "inputs": {},
            "outputs": {},
            "debug_outputs": self.outputs
        }
    elif self.conf.variants == "score":
        L = tf.shape(cell_outs.logprobs)[1]
        one_hot = tf.one_hot(tf.slice(self.dec_inps, [0, 1], [-1, L]),
                             depth=self.conf.output_vocab_size, axis=-1,
                             on_value=1.0, off_value=0.0)
        outputs = tf.reduce_sum(cell_outs.logprobs * one_hot, 2)
        outputs = tf.reduce_sum(outputs, axis=1)
        graph_nodes = {
            "loss": None,
            "inputs": {
                "enc_inps:0": self.enc_str_inps,
                "enc_lens:0": self.enc_lens,
                "dec_inps:0": self.dec_str_inps,
                "dec_lens:0": self.dec_lens
            },
            "outputs": {"logprobs": outputs},
            "visualize": None
        }
    else:
        L = tf.shape(cell_outs.beam_ends)[1]
        beam_symbols = cell_outs.beam_symbols
        beam_parents = cell_outs.beam_parents
        beam_ends = cell_outs.beam_ends
        beam_end_parents = cell_outs.beam_end_parents
        beam_end_probs = cell_outs.beam_end_probs
        alignments = cell_outs.alignments

        beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L])
        beam_end_parents = tf.reshape(
            tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
        beam_end_probs = tf.reshape(
            tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

        ## Creating tail_ids
        batch_size = tf.Print(batch_size, [batch_size], message="BATCH")
        batch_offset = tf.expand_dims(
            tf.cumsum(
                tf.ones([batch_size, self.beam_size], dtype=tf.int32) *
                self.beam_size, axis=0, exclusive=True), 2)
        offset2 = tf.expand_dims(
            tf.cumsum(
                tf.ones([batch_size, self.beam_size * 2], dtype=tf.int32) *
                self.beam_size, axis=0, exclusive=True), 2)

        out_len = tf.shape(beam_symbols)[1]
        self.beam_symbol_strs = tf.reshape(
            self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
            [batch_size, self.beam_size, -1])
        self.beam_parents = tf.reshape(
            beam_parents, [batch_size, self.beam_size, -1]) - batch_offset
        self.beam_ends = tf.reshape(beam_ends,
                                    [batch_size, self.beam_size * 2, -1])
        self.beam_end_parents = tf.reshape(
            beam_end_parents, [batch_size, self.beam_size * 2, -1]) - offset2
        self.beam_end_probs = tf.reshape(
            beam_end_probs, [batch_size, self.beam_size * 2, -1])
        self.beam_attns = tf.reshape(
            alignments, [batch_size, self.beam_size, out_len, -1])

        graph_nodes = {
            "loss": None,
            "inputs": {
                "enc_inps:0": self.enc_str_inps,
                "enc_lens:0": self.enc_lens
            },
            "outputs": {
                "beam_symbols": self.beam_symbol_strs,
                "beam_parents": self.beam_parents,
                "beam_ends": self.beam_ends,
                "beam_end_parents": self.beam_end_parents,
                "beam_end_probs": self.beam_end_probs,
                "beam_attns": self.beam_attns
            },
            "visualize": {}
        }
    return graph_nodes
def build(self, inputs, for_deploy):
    scope = ""
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_splits = conf.beam_splits
    self.beam_size = 1 if not for_deploy else sum(self.beam_splits)

    self.enc_str_inps = inputs["enc_inps:0"]
    self.dec_str_inps = inputs["dec_inps:0"]
    self.enc_lens = inputs["enc_lens:0"]
    self.dec_lens = inputs["dec_lens:0"]
    self.down_wgts = inputs["down_wgts:0"]

    with tf.name_scope("TableLookup"):
        # Input maps
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                                value_dtype=tf.int64,
                                                default_value=UNK_ID,
                                                shared_name="in_table",
                                                name="in_table",
                                                checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                                 value_dtype=tf.string,
                                                 default_value="_UNK",
                                                 shared_name="out_table",
                                                 name="out_table",
                                                 checkpoint=True)
        # lookup
        self.enc_inps = self.in_table.lookup(self.enc_str_inps)
        self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    graphlg.info("Preparing decoder inps...")
    dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1])

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.output_vocab_size, conf.embedding_size])

    with tf.name_scope("Embed") as scope:
        dec_inps = tf.slice(self.dec_inps, [0, 0],
                            [-1, conf.output_max_len + 1])
        with ops.device("/cpu:0"):
            self.emb_inps = embedding_lookup_unique(self.embedding,
                                                    self.enc_inps)
            emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    graphlg.info("Creating dynamic x rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(
        conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps,
        self.enc_lens, keep_prob=1.0, bidi=conf.bidirectional,
        name_scope="DynRNNEncoder")
    batch_size = tf.shape(self.enc_outs)[0]

    if self.conf.attention:
        init_h = self.enc_states[-1].h
    else:
        mechanism = dynamic_attention_wrapper.LuongAttention(
            num_units=conf.num_units, memory=self.enc_outs,
            max_mem_size=self.conf.input_max_len,
            memory_sequence_length=self.enc_lens)
        init_h = mechanism(self.enc_states[-1].h)

    if isinstance(self.enc_states[-1], LSTMStateTuple):
        enc_state = LSTMStateTuple(self.enc_states[-1].c, init_h)

    hidden_units = int(math.sqrt(mem_size * self.conf.enc_latent_dim))
    z, mu_prior, logvar_prior = PriorNet([enc_state], hidden_units,
                                         self.conf.enc_latent_dim,
                                         stddev=1.0,
                                         prior_type=conf.prior_type)
    KLD = 0.0

    # Different graph for training and inference time
    if not for_deploy:
        # Y inputs for posterior z
        with tf.name_scope("YEncode"):
            y_emb_inps = tf.slice(emb_dec_inps, [0, 1, 0], [-1, -1, -1])
            y_enc_outs, y_enc_states, y_mem_size, y_enc_state_size = DynRNN(
                conf.cell_model, conf.num_units, conf.num_layers, y_emb_inps,
                self.dec_lens, keep_prob=1.0, bidi=False, name_scope="y_enc")
            y_enc_state = y_enc_states[-1]
            z, KLD, l2 = CreateVAE([enc_state, y_enc_state],
                                   self.conf.enc_latent_dim, mu_prior,
                                   logvar_prior)

    # project z + x_thinking_state to decoder state
    raw_dec_states = [z, enc_state]

    # add BOW loss
    #num_hidden_units = int(math.sqrt(conf.output_vocab_size * int(decision_state.shape[1])))
    #bow_l1 = layers_core.Dense(num_hidden_units, use_bias=True, name="bow_hidden", activation=tf.tanh)
    #bow_l2 = layers_core.Dense(conf.output_vocab_size, use_bias=True, name="bow_out", activation=None)
    #bow = bow_l2(bow_l1(decision_state))
    #y_dec_inps = tf.slice(self.dec_inps, [0, 1], [-1, -1])
    #bow_y = tf.reduce_sum(tf.one_hot(y_dec_inps, on_value=1.0, off_value=0.0, axis=-1, depth=conf.output_vocab_size), axis=1)
    #batch_bow_losses = tf.reduce_sum(bow_y * (-1.0) * tf.nn.log_softmax(bow), axis=1)

    max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2

    with tf.name_scope("ShapeToBeam") as scope:
        def _to_beam(t):
            beam_t = tf.reshape(tf.tile(t, [1, self.beam_size]),
                                [-1, int(t.get_shape()[1])])
            return beam_t

        beam_raw_dec_states = tf.contrib.framework.nest.map_structure(
            _to_beam, raw_dec_states)
        beam_memory = tf.reshape(
            tf.tile(self.enc_outs, [1, 1, self.beam_size]),
            [-1, conf.input_max_len, mem_size])
        beam_memory_lens = tf.squeeze(
            tf.reshape(
                tf.tile(tf.expand_dims(self.enc_lens, 1),
                        [1, self.beam_size]), [-1, 1]), 1)

    cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size,
                    num_layers=conf.num_layers,
                    attn_type=self.conf.attention, memory=beam_memory,
                    mem_lens=beam_memory_lens, max_mem_size=max_mem_size,
                    addmem=self.conf.addmem, keep_prob=1.0, dtype=tf.float32,
                    name_scope="AttnCell")

    # Fit decision states to shape of attention decoder cell states
    zero_attn_states = DecStateInit(beam_raw_dec_states, cell,
                                    batch_size * self.beam_size)

    # Output projection
    with tf.variable_scope("OutProj"):
        graphlg.info("Creating out_proj...")
        if conf.out_layer_size:
            w = tf.get_variable(
                "proj_w", [conf.out_layer_size, conf.output_vocab_size],
                dtype=dtype)
        else:
            w = tf.get_variable(
                "proj_w", [mem_size, conf.output_vocab_size], dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)
        self.out_proj = (w, b)

    if not for_deploy:
        inputs = {}
        dec_init_state = zero_attn_states
        hp_train = helper.ScheduledEmbeddingTrainingHelper(
            inputs=emb_dec_inps, sequence_length=self.dec_lens,
            embedding=self.embedding, sampling_probability=0.0,
            out_proj=self.out_proj)
        output_layer = layers_core.Dense(
            self.conf.out_layer_size,
            use_bias=True) if self.conf.out_layer_size else None
        my_decoder = basic_decoder.BasicDecoder(
            cell=cell, helper=hp_train, initial_state=dec_init_state,
            output_layer=output_layer)
        cell_outs, final_state = decoder.dynamic_decode(
            decoder=my_decoder, impute_finished=False,
            maximum_iterations=conf.output_max_len + 1, scope=scope)

        outputs = cell_outs.rnn_output
        L = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, int(self.out_proj[0].shape[0])])
        outputs = tf.matmul(outputs, self.out_proj[0]) + self.out_proj[1]
        logits = tf.reshape(outputs, [-1, L, int(self.out_proj[0].shape[1])])

        # branch 1 for debugging, doesn't have to be called
        #m = tf.shape(self.outputs)[0]
        #self.mask = tf.zeros([m, int(w.shape[1])])
        #for i in [3]:
        #    self.mask = self.mask + tf.one_hot(indices=tf.ones([m], dtype=tf.int32) * i, on_value=100.0, depth=int(w.shape[1]))
        #self.outputs = self.outputs - self.mask
        with tf.name_scope("DebugOutputs") as scope:
            self.outputs = tf.argmax(logits, axis=2)
            self.outputs = tf.reshape(self.outputs, [-1, L])
            self.outputs = self.out_table.lookup(
                tf.cast(self.outputs, tf.int64))

        # branch 2 for loss
        with tf.name_scope("Loss") as scope:
            tars = tf.slice(self.dec_inps, [0, 1], [-1, L])
            wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1,
                             reverse=True)
            #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
            self.loss = loss.sequence_loss(
                logits=logits, targets=tars, weights=wgts,
                average_across_timesteps=False, average_across_batch=False)
            batch_wgt = tf.reduce_sum(self.down_wgts) + 1e-12
            #bow_loss = tf.reduce_sum(batch_bow_losses * self.down_wgts) / batch_wgt
            example_losses = tf.reduce_sum(self.loss, 1)
            see_loss = tf.reduce_sum(
                example_losses / tf.cast(self.dec_lens, tf.float32) *
                self.down_wgts) / batch_wgt
            KLD = tf.reduce_sum(KLD * self.down_wgts) / batch_wgt
            self.loss = tf.reduce_sum(
                (example_losses + self.conf.kld_ratio * KLD) /
                tf.cast(self.dec_lens, tf.float32) *
                self.down_wgts) / batch_wgt

        with tf.name_scope(self.model_kind):
            tf.summary.scalar("loss", see_loss)
            tf.summary.scalar("kld", KLD)
            #tf.summary.scalar("bow", bow_loss)

        graph_nodes = {
            "loss": self.loss,
            "inputs": inputs,
            "debug_outputs": self.outputs,
            "outputs": {},
            "visualize": None
        }
        return graph_nodes
    else:
        hp_infer = helper.GreedyEmbeddingHelper(
            embedding=self.embedding,
            start_tokens=tf.ones(shape=[batch_size * self.beam_size],
                                 dtype=tf.int32),
            end_token=EOS_ID, out_proj=self.out_proj)
        output_layer = layers_core.Dense(
            self.conf.out_layer_size,
            use_bias=True) if self.conf.out_layer_size else None
        dec_init_state = beam_decoder.BeamState(
            tf.zeros([batch_size * self.beam_size]), zero_attn_states,
            tf.zeros([batch_size * self.beam_size], tf.int32))
        my_decoder = beam_decoder.BeamDecoder(
            cell=cell, helper=hp_infer, out_proj=self.out_proj,
            initial_state=dec_init_state, beam_splits=self.beam_splits,
            max_res_num=self.conf.max_res_num, output_layer=output_layer)
        cell_outs, final_state = decoder.dynamic_decode(
            decoder=my_decoder, scope=scope,
            maximum_iterations=self.conf.output_max_len)

        L = tf.shape(cell_outs.beam_ends)[1]
        beam_symbols = cell_outs.beam_symbols
        beam_parents = cell_outs.beam_parents
        beam_ends = cell_outs.beam_ends
        beam_end_parents = cell_outs.beam_end_parents
        beam_end_probs = cell_outs.beam_end_probs
        alignments = cell_outs.alignments

        beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L])
        beam_end_parents = tf.reshape(
            tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
        beam_end_probs = tf.reshape(
            tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

        # Creating tail_ids
        batch_size = tf.Print(batch_size, [batch_size],
                              message="CVAERNN batch")
        #beam_symbols = tf.Print(cell_outs.beam_symbols, [tf.shape(cell_outs.beam_symbols)], message="beam_symbols")
        #beam_parents = tf.Print(cell_outs.beam_parents, [tf.shape(cell_outs.beam_parents)], message="beam_parents")
        #beam_ends = tf.Print(cell_outs.beam_ends, [tf.shape(cell_outs.beam_ends)], message="beam_ends")
        #beam_end_parents = tf.Print(cell_outs.beam_end_parents, [tf.shape(cell_outs.beam_end_parents)], message="beam_end_parents")
        #beam_end_probs = tf.Print(cell_outs.beam_end_probs, [tf.shape(cell_outs.beam_end_probs)], message="beam_end_probs")
        #alignments = tf.Print(cell_outs.alignments, [tf.shape(cell_outs.alignments)], message="beam_attns")

        batch_offset = tf.expand_dims(
            tf.cumsum(
                tf.ones([batch_size, self.beam_size], dtype=tf.int32) *
                self.beam_size, axis=0, exclusive=True), 2)
        offset2 = tf.expand_dims(
            tf.cumsum(
                tf.ones([batch_size, self.beam_size * 2], dtype=tf.int32) *
                self.beam_size, axis=0, exclusive=True), 2)

        out_len = tf.shape(beam_symbols)[1]
        self.beam_symbol_strs = tf.reshape(
            self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
            [batch_size, self.beam_size, -1])
        self.beam_parents = tf.reshape(
            beam_parents, [batch_size, self.beam_size, -1]) - batch_offset
        self.beam_ends = tf.reshape(beam_ends,
                                    [batch_size, self.beam_size * 2, -1])
        self.beam_end_parents = tf.reshape(
            beam_end_parents, [batch_size, self.beam_size * 2, -1]) - offset2
        self.beam_end_probs = tf.reshape(
            beam_end_probs, [batch_size, self.beam_size * 2, -1])
        self.beam_attns = tf.reshape(
            alignments, [batch_size, self.beam_size, out_len, -1])
        #cell_outs.alignments
        #self.outputs = tf.concat([outputs_str, tf.cast(cell_outs.beam_parents, tf.string)], 1)

        #ones = tf.ones([batch_size, self.beam_size], dtype=tf.int32)
        #aux_matrix = tf.cumsum(ones * self.beam_size, axis=0, exclusive=True)
        #tm_beam_parents_reverse = tf.reverse(tf.transpose(cell_outs.beam_parents), axis=[0])
        #beam_probs = final_state[1]
        #def traceback(prev_out, curr_input):
        #    return tf.gather(curr_input, prev_out)
        #
        #tail_ids = tf.reshape(tf.cumsum(ones, axis=1, exclusive=True) + aux_matrix, [-1])
        #tm_symbol_index_reverse = tf.scan(traceback, tm_beam_parents_reverse, initializer=tail_ids)
        ## Create beam index for symbols, and other info
        #tm_symbol_index = tf.concat([tf.expand_dims(tail_ids, 0), tm_symbol_index_reverse], axis=0)
        #tm_symbol_index = tf.reverse(tm_symbol_index, axis=[0])
        #tm_symbol_index = tf.slice(tm_symbol_index, [1, 0], [-1, -1])
        #symbol_index = tf.expand_dims(tf.transpose(tm_symbol_index), axis=2)
        #symbol_index = tf.concat([symbol_index, tf.cumsum(tf.ones_like(symbol_index), exclusive=True, axis=1)], axis=2)
        ## index alignments and output symbols
        #alignments = tf.gather_nd(cell_outs.alignments, symbol_index)
        #symbol_ids = tf.gather_nd(cell_outs.beam_symbols, symbol_index)
        ## outputs and other info
        #self.others = [alignments, beam_probs]
        #self.outputs = self.out_table.lookup(tf.cast(symbol_ids, tf.int64))

        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens
        }
        outputs = {
            "beam_symbols": self.beam_symbol_strs,
            "beam_parents": self.beam_parents,
            "beam_ends": self.beam_ends,
            "beam_end_parents": self.beam_end_parents,
            "beam_end_probs": self.beam_end_probs,
            "beam_attns": self.beam_attns
        }
        graph_nodes = {
            "loss": None,
            "inputs": inputs,
            "outputs": outputs,
            "visualize": {"z": z}
        }
        return graph_nodes
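# PriorNet and CreateVAE are project-local helpers that are not shown in this
# section. As a hedged sketch only (an assumption about what such helpers
# conventionally compute, not this repo's actual code): a diagonal-Gaussian
# latent sampled with the reparameterization trick, plus the closed-form KL
# divergence from N(0, I).
import tensorflow as tf

def toy_vae(state, latent_dim):
    mu = tf.layers.dense(state, latent_dim, name="mu")
    logvar = tf.layers.dense(state, latent_dim, name="logvar")
    eps = tf.random_normal(tf.shape(mu))
    z = mu + tf.exp(0.5 * logvar) * eps  # reparameterized sample
    kld = -0.5 * tf.reduce_sum(
        1.0 + logvar - tf.square(mu) - tf.exp(logvar), axis=1)
    return z, kld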
def build(self, for_deploy, variants=""):
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype

    # Input maps
    self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                            value_dtype=tf.int64,
                                            default_value=UNK_ID,
                                            shared_name="in_table",
                                            name="in_table",
                                            checkpoint=True)
    self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value="_UNK",
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)

    graphlg.info("Creating placeholders...")
    self.enc_str_inps = tf.placeholder(tf.string,
                                       shape=[None, conf.input_max_len],
                                       name="enc_inps")
    self.dec_str_inps = tf.placeholder(tf.string,
                                       shape=[None, conf.output_max_len + 1],
                                       name="dec_inps")
    # lookup
    self.enc_inps = self.in_table.lookup(self.enc_str_inps)
    self.dec_tar_inps = self.in_table.lookup(self.dec_str_inps)
    batch_size = tf.shape(self.enc_inps)[0]

    with tf.variable_scope("encoder"):
        ## Encoding embedding
        with tf.device("/cpu:0"):
            self.embedding = tf.get_variable(
                'embedding', dtype=tf.float32,
                shape=[self.conf.input_vocab_size, self.conf.embedding_size],
                initializer=tf.contrib.layers.xavier_initializer())
        # zero out the embedding of PAD (id 0)
        self.embedding = tf.concat(
            (tf.zeros(shape=[1, self.conf.embedding_size]),
             self.embedding[1:, :]), 0)
        self.pos_embedding = tf.get_variable(
            'positional_embedding', dtype=tf.float32,
            shape=[self.conf.input_max_len, self.conf.embedding_size],
            initializer=tf.contrib.layers.xavier_initializer())

        # Word embedding
        self.enc = tf.nn.embedding_lookup(self.embedding, self.enc_inps)

        ## Positional Encoding
        pos = tf.tile(
            tf.expand_dims(tf.range(tf.shape(self.enc_inps)[1]), 0),
            [tf.shape(self.enc_inps)[0], 1])
        pos_enc = tf.nn.embedding_lookup(self.pos_embedding, pos)
        self.enc = self.enc * (self.conf.embedding_size ** 0.5) + pos_enc

        # dropout
        self.enc = tf.layers.dropout(
            self.enc, rate=self.conf.dropout_rate,
            training=tf.convert_to_tensor(not for_deploy))

        # Attn Blocks
        self.enc = AttnEncode(self.enc, self.conf.num_heads, for_deploy,
                              self.conf.num_blocks, self.conf.hidden_units,
                              self.conf.dropout_rate)

    with tf.variable_scope("decoder"):
        ## Decoding embedding
        with tf.device("/cpu:0"):
            self.dec_embedding = tf.get_variable(
                'dec_embedding', dtype=tf.float32,
                shape=[self.conf.output_vocab_size,
                       self.conf.embedding_size],
                initializer=tf.contrib.layers.xavier_initializer())
        # zero out PAD; note the non-PAD rows come from self.embedding here
        self.dec_embedding = tf.concat(
            (tf.zeros(shape=[1, self.conf.embedding_size]),
             self.embedding[1:, :]), 0)
        self.pos_dec_embedding = tf.get_variable(
            'positional_embedding', dtype=tf.float32,
            shape=[self.conf.output_max_len + 1, self.conf.embedding_size],
            initializer=tf.contrib.layers.xavier_initializer())

        if for_deploy:
            #inps = tf.ones([batch_size, 1], tf.int32)
            dec_inps = tf.zeros([batch_size, conf.output_max_len + 1],
                                tf.int32)
            time = constant_op.constant(0, tf.int32)
        else:
            dec_inps = tf.to_int32(self.dec_tar_inps)
            #inps = tf.Print(inps, [tf.shape(inps)], message="inps", summarize=10000)
            time = constant_op.constant(conf.output_max_len, tf.int32)

        def condition(time, finished, inps, logits, keys):
            finished = tf.equal(
                time, tf.convert_to_tensor(self.conf.output_max_len + 1))
            return tf.logical_not(finished)

        def autoregressive(time, finished, inps, logits, keys):
            # inps carry self.conf.output_max_len tokens with one EOS at the
            # tail; drop that last EOS and prepend GO (1). At training time
            # the result is the original input sequence, and the loop body
            # runs only once.
            inps = tf.concat(
                [tf.ones([batch_size, 1], tf.int32),
                 tf.slice(inps, [0, 0], [-1, self.conf.output_max_len])], 1)
            with tf.device("/cpu:0"):
                dec_emb = tf.nn.embedding_lookup(self.dec_embedding, inps)
            pos = tf.tile(tf.expand_dims(tf.range(tf.shape(inps)[1]), 0),
                          [tf.shape(inps)[0], 1])
            with tf.device("/cpu:0"):
                pos_dec_emb = tf.nn.embedding_lookup(self.pos_dec_embedding,
                                                     pos)
            dec_emb = (dec_emb * (self.conf.embedding_size ** 0.5) +
                       pos_dec_emb)
            # Dropout
            dec_emb = tf.layers.dropout(
                dec_emb, rate=self.conf.dropout_rate,
                training=tf.convert_to_tensor(not for_deploy))

            dec_out = AttnDecode(keys, dec_emb, self.conf.num_heads,
                                 for_deploy, self.conf.num_blocks,
                                 self.conf.hidden_units,
                                 self.conf.dropout_rate)

            # Final linear projection
            logits = tf.layers.dense(dec_out, self.conf.output_vocab_size)
            preds = tf.to_int32(tf.arg_max(logits, dimension=-1))
            time = time + 1
            next_inps = preds
            #time = tf.Print(time, [time], message="t", summarize=1000)
            #preds = tf.Print(preds, [preds], message="preds", summarize=10000)
            return time, finished, next_inps, logits, keys

        finished = constant_op.constant(True, tf.bool)
        logits = tf.zeros([batch_size, self.conf.output_max_len + 1,
                           self.conf.output_vocab_size])
        (time, finished, preds, logits, keys) = tf.while_loop(
            condition, autoregressive,
            loop_vars=[time, finished, dec_inps, logits, self.enc],
            shape_invariants=[
                time.get_shape(),
                finished.get_shape(),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None, None]),
                self.enc.get_shape()
            ])
        self.preds = preds
        self.logits = logits
        #self.preds = tf.Print(self.preds, [tf.shape(self.preds)], message="preds")
        #self.logits = tf.Print(self.logits, [tf.shape(self.logits)], message="logits")

    if not for_deploy:
        self.istarget = tf.to_float(tf.not_equal(self.dec_tar_inps, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(tf.to_int64(self.preds),
                                 self.dec_tar_inps)) *
            self.istarget) / (tf.reduce_sum(self.istarget))
        self.pred_strs = self.out_table.lookup(tf.cast(self.preds, tf.int64))

        # Loss
        # smoothing
        #self.y_smoothed = label_smoothing(tf.one_hot(self.dec_tar_inps, depth=self.conf.output_vocab_size))
        self.y_smoothed = tf.one_hot(self.dec_tar_inps,
                                     depth=self.conf.output_vocab_size,
                                     axis=-1)
        self.loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.y_smoothed)
        self.mean_loss = tf.reduce_sum(
            self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

        # Summary
        tf.summary.scalar('mean_loss', self.mean_loss)
        tf.summary.scalar('acc', self.acc)

        outputs = self.pred_strs
        return self.mean_loss, {}, {"out_strs": outputs}
    else:
        self.pred_strs = self.out_table.lookup(tf.cast(self.preds, tf.int64))
        inputs = {"enc_inps": self.enc_str_inps}
        outputs = self.pred_strs
        return None, inputs, {"out_strs": outputs}
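# A toy version of the autoregressive tf.while_loop contract used above: the
# body feeds its own predictions back in, and shape_invariants relax the
# static-shape check for tensors whose shape changes across iterations. All
# names here are illustrative, not taken from the model above.
import tensorflow as tf

def cond(t, seq):
    return t < 3

def body(t, seq):
    nxt = tf.fill([tf.shape(seq)[0], 1], t)  # stand-in for argmax(logits)
    return t + 1, tf.concat([seq, nxt], axis=1)

t0 = tf.constant(0)
seq0 = tf.zeros([2, 1], tf.int32)
_, seq = tf.while_loop(cond, body, [t0, seq0],
                       shape_invariants=[t0.get_shape(),
                                         tf.TensorShape([None, None])])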
def build(self, inputs, for_deploy):
    scope = ""
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_splits = conf.beam_splits
    self.beam_size = 1 if not for_deploy else sum(self.beam_splits)

    self.enc_str_inps = inputs["enc_inps:0"]
    self.dec_str_inps = inputs["dec_inps:0"]
    self.enc_lens = inputs["enc_lens:0"]
    self.dec_lens = inputs["dec_lens:0"]
    self.down_wgts = inputs["down_wgts:0"]

    with tf.name_scope("TableLookup"):
        # Input maps
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                                value_dtype=tf.int64,
                                                default_value=UNK_ID,
                                                shared_name="in_table",
                                                name="in_table",
                                                checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                                 value_dtype=tf.string,
                                                 default_value="_UNK",
                                                 shared_name="out_table",
                                                 name="out_table",
                                                 checkpoint=True)
        # lookup
        self.enc_inps = self.in_table.lookup(self.enc_str_inps)
        self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    graphlg.info("Preparing decoder inps...")
    dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1])

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.output_vocab_size, conf.embedding_size])

    with tf.name_scope("Embed") as scope:
        dec_inps = tf.slice(self.dec_inps, [0, 0],
                            [-1, conf.output_max_len + 1])
        with ops.device("/cpu:0"):
            self.emb_inps = embedding_lookup_unique(self.embedding,
                                                    self.enc_inps)
            emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    graphlg.info("Creating dynamic x rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(
        conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps,
        self.enc_lens, keep_prob=1.0, bidi=conf.bidirectional,
        name_scope="DynRNNEncoder")
    batch_size = tf.shape(self.enc_outs)[0]

    if self.conf.attention:
        init_h = self.enc_states[-1].h
    else:
        mechanism = dynamic_attention_wrapper.LuongAttention(
            num_units=conf.num_units, memory=self.enc_outs,
            max_mem_size=self.conf.input_max_len,
            memory_sequence_length=self.enc_lens)
        init_h = mechanism(self.enc_states[-1].h)

    if isinstance(self.enc_states[-1], LSTMStateTuple):
        enc_state = LSTMStateTuple(self.enc_states[-1].c, init_h)
        all_emb = tf.concat([enc_state.c, enc_state.h], 1)
    else:
        all_emb = self.enc_states[-1]

    all_emb = tf.Print(all_emb, [tf.shape(all_emb)[0]],
                       message="batch_size")

    # first row is the query; the remaining rows are candidates
    query_emb, can_embs = tf.split(all_emb, [1, -1], 0)
    query_emb_normalized = tf.nn.l2_normalize(query_emb, 1)
    can_embs_normalized = tf.nn.l2_normalize(can_embs, 1)
    cos_dist_embs = tf.reduce_sum(
        query_emb_normalized * can_embs_normalized, 1)

    sum_word_embs = tf.reduce_sum(self.emb_inps, 1)
    query_word_emb, can_word_embs = tf.split(sum_word_embs, [1, -1], 0)
    query_word_emb_normalized = tf.nn.l2_normalize(query_word_emb, 1)
    can_word_embs_normalized = tf.nn.l2_normalize(can_word_embs, 1)
    cos_dist_word_embs = tf.reduce_sum(
        query_word_emb_normalized * can_word_embs_normalized, 1)

    inputs = {"enc_inps:0": self.enc_str_inps, "enc_lens:0": self.enc_lens}
    graph_nodes = {
        "loss": None,
        "inputs": inputs,
        "outputs": {
            "rnn_enc": tf.concat([tf.zeros([1]), cos_dist_embs], 0),
            "sum_emb": tf.concat([tf.zeros([1]), cos_dist_word_embs], 0)
        },
    }
    return graph_nodes
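# Small check of the cosine-similarity pattern above: l2-normalizing both
# sides and summing the elementwise product gives the cosine of the angle
# between the query row and each candidate row (broadcast over rows).
import tensorflow as tf

query = tf.constant([[3.0, 4.0]])
cands = tf.constant([[4.0, 3.0], [3.0, 4.0]])
cos = tf.reduce_sum(
    tf.nn.l2_normalize(query, 1) * tf.nn.l2_normalize(cands, 1), 1)
with tf.Session() as sess:
    print(sess.run(cos))  # [0.96 1.0]: 24/25, and 1.0 for identical vectors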
def build(self):
    conf = self.conf
    dtype = self.dtype
    name = self.name
    job_type = self.job_type

    # All possible inputs
    graphlg.info("Creating inputs and tables...")
    batch_size = None
    self.enc_querys = tf.placeholder(tf.string,
                                     shape=[batch_size, conf.input_max_len],
                                     name="enc_querys")
    self.query_lens = tf.placeholder(tf.int32, shape=[batch_size],
                                     name="query_lens")
    self.enc_posts = tf.placeholder(tf.string,
                                    shape=[batch_size, conf.input_max_len],
                                    name="enc_posts")
    self.post_lens = tf.placeholder(tf.int32, shape=[batch_size],
                                    name="post_lens")
    self.enc_resps = tf.placeholder(tf.string,
                                    shape=[batch_size, conf.input_max_len],
                                    name="enc_resps")
    self.resp_lens = tf.placeholder(tf.int32, shape=[batch_size],
                                    name="resp_lens")
    self.enc_neg_resps = tf.placeholder(
        tf.string, shape=[batch_size, conf.input_max_len],
        name="enc_neg_resp")
    self.neg_resp_lens = tf.placeholder(tf.int32, shape=[batch_size],
                                        name="neg_resp_lens")

    # TODO: the table object, lookup ops, the embedding and its lookup op
    # should be placed on the same device
    with tf.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.input_vocab_size, conf.embedding_size],
            initializer=tf.random_uniform_initializer(-0.08, 0.08))
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                                value_dtype=tf.int64,
                                                default_value=UNK_ID,
                                                shared_name="in_table",
                                                name="in_table",
                                                checkpoint=True)
        self.query_embs = embedding_lookup_unique(
            self.embedding, self.in_table.lookup(self.enc_querys))
        self.post_embs = embedding_lookup_unique(
            self.embedding, self.in_table.lookup(self.enc_posts))
        self.resp_embs = embedding_lookup_unique(
            self.embedding, self.in_table.lookup(self.enc_resps))
        self.neg_resp_embs = embedding_lookup_unique(
            self.embedding, self.in_table.lookup(self.enc_neg_resps))

    # MultiRNNCell
    graphlg.info("Creating multi-layer cells...")

    # Bi-RNN encoder
    graphlg.info("Creating bi-rnn...")
    with variable_scope.variable_scope("q_rnn", dtype=dtype,
                                       reuse=None) as scope:
        cell1 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob)
        cell2 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob)
        q_out, q_out_state = bidirectional_dynamic_rnn(
            cell_fw=cell1, cell_bw=cell2, inputs=self.query_embs,
            sequence_length=self.query_lens, initial_state_fw=None,
            initial_state_bw=None, dtype=dtype, parallel_iterations=16,
            swap_memory=False, time_major=False)
    with variable_scope.variable_scope("p_rnn", dtype=dtype,
                                       reuse=None) as scope:
        cell1 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob)
        cell2 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob)
        p_out, p_out_state = bidirectional_dynamic_rnn(
            cell_fw=cell1, cell_bw=cell2, inputs=self.post_embs,
            sequence_length=self.post_lens, initial_state_fw=None,
            initial_state_bw=None, dtype=dtype, parallel_iterations=16,
            swap_memory=False, time_major=False)
    with variable_scope.variable_scope("r_rnn", dtype=dtype,
                                       reuse=None) as scope:
        cell1 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob)
        cell2 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob)
        r_out, r_out_state = bidirectional_dynamic_rnn(
            cell_fw=cell1, cell_bw=cell2, inputs=self.resp_embs,
            sequence_length=self.resp_lens, initial_state_fw=None,
            initial_state_bw=None, dtype=dtype, parallel_iterations=16,
            swap_memory=False, time_major=False)
    with variable_scope.variable_scope("r_rnn", dtype=dtype,
                                       reuse=True) as scope:
        cell1 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob,
                                   reuse=True)
        cell2 = CreateMultiRNNCell(conf.cell_model, conf.num_units,
                                   conf.num_layers, conf.output_keep_prob,
                                   reuse=True)
        neg_r_out, neg_r_out_state = bidirectional_dynamic_rnn(
            cell_fw=cell1, cell_bw=cell2, inputs=self.neg_resp_embs,
            sequence_length=self.neg_resp_lens, initial_state_fw=None,
            initial_state_bw=None, dtype=dtype, parallel_iterations=16,
            swap_memory=False, time_major=False)

    # keep only the top-layer final h of each direction
    fw, bw = q_out_state
    q_out_state = tf.concat([fw[-1].h, bw[-1].h], axis=1)
    fw, bw = p_out_state
    p_out_state = tf.concat([fw[-1].h, bw[-1].h], axis=1)
    fw, bw = r_out_state
    r_out_state = tf.concat([fw[-1].h, bw[-1].h], axis=1)
    fw, bw = neg_r_out_state
    neg_r_out_state = tf.concat([fw[-1].h, bw[-1].h], axis=1)

    q_out = tf.concat(q_out, axis=2)
    p_out = tf.concat(p_out, axis=2)
    r_out = tf.concat(r_out, axis=2)
    neg_r_out = tf.concat(neg_r_out, axis=2)

    # Out state Cosine dist
    norm_q = tf.sqrt(tf.reduce_sum(tf.square(q_out_state), 1,
                                   keep_dims=True))
    norm_p = tf.sqrt(tf.reduce_sum(tf.square(p_out_state), 1,
                                   keep_dims=True))
    norm_r = tf.sqrt(tf.reduce_sum(tf.square(r_out_state), 1,
                                   keep_dims=True))
    norm_neg_r = tf.sqrt(tf.reduce_sum(tf.square(neg_r_out_state), 1,
                                       keep_dims=True))
    cos_qp = tf.reduce_sum(q_out_state * p_out_state, 1,
                           keep_dims=True) / (norm_q * norm_p)
    cos_qr = tf.reduce_sum(q_out_state * r_out_state, 1,
                           keep_dims=True) / (norm_q * norm_r)
    cos_qnegr = tf.reduce_sum(q_out_state * neg_r_out_state, 1,
                              keep_dims=True) / (norm_q * norm_neg_r)

    # Outputs 2-dim intersection
    graphlg.info("Creating cos dist...")
    qp_sim = tf.expand_dims(tf.matmul(q_out, p_out, transpose_b=True), -1)
    qr_sim = tf.expand_dims(tf.matmul(q_out, r_out, transpose_b=True), -1)
    qnegr_sim = tf.expand_dims(
        tf.matmul(q_out, neg_r_out, transpose_b=True), -1)

    # n-CNN max-pooling
    graphlg.info("Creating interactions...")
    with variable_scope.variable_scope("qp_cnn", dtype=dtype,
                                       reuse=None) as scope:
        qp_map = FeatureMatrix(conf.conv_conf, qp_sim, scope=scope,
                               dtype=dtype)
    with variable_scope.variable_scope("qr_cnn", dtype=dtype,
                                       reuse=None) as scope:
        qr_map = FeatureMatrix(conf.conv_conf, qr_sim, scope=scope,
                               dtype=dtype)
    with variable_scope.variable_scope("qr_cnn", dtype=dtype,
                                       reuse=True) as scope:
        qnegr_map = FeatureMatrix(conf.conv_conf, qnegr_sim, scope=scope,
                                  dtype=dtype)

    # h becomes 1 after max-pooling
    qp_vec = tf.concat([tf.contrib.layers.flatten(qp_map), cos_qp], 1)
    qr_vec = tf.concat([tf.contrib.layers.flatten(qr_map), cos_qr], 1)
    qnegr_vec = tf.concat(
        [tf.contrib.layers.flatten(qnegr_map), cos_qnegr], 1)

    graphlg.info("Creating fully connected...")
    with variable_scope.variable_scope("qp_fc", dtype=dtype,
                                       reuse=None) as scope:
        qp_fc = FC(inputs=qp_vec, h_size=conf.fc_h_size, o_size=1,
                   act=tf.nn.sigmoid)
    with variable_scope.variable_scope("qr_fc", dtype=dtype,
                                       reuse=None) as scope:
        qr_fc = FC(inputs=qr_vec, h_size=conf.fc_h_size, o_size=1, act=relu)
    with variable_scope.variable_scope("qr_fc", dtype=dtype,
                                       reuse=True) as scope:
        qnegr_fc = FC(inputs=qnegr_vec, h_size=conf.fc_h_size, o_size=1,
                      act=relu)

    self.scores = tf.squeeze(qp_fc * qr_fc)
    self.neg_scores = tf.squeeze(qp_fc * qnegr_fc)

    graphlg.info("Creating optimizer and backpropagation...")
    self.global_params = []
    self.trainable_params = tf.trainable_variables()
    self.optimizer_params = []

    if not self.for_deploy:
        with variable_scope.variable_scope(self.model_kind,
                                           dtype=dtype) as scope:
            #self.loss = tf.losses.hinge_loss(self.neg_scores, self.scores)
            self.loss = tf.reduce_mean(
                tf.nn.relu(1 + self.neg_scores - self.scores))
            self.summary = tf.summary.scalar("%s/loss" % name, self.loss)

        graphlg.info("Creating backpropagation graph and optimizers...")
        self.learning_rate = tf.Variable(float(conf.learning_rate),
                                         trainable=False,
                                         name="learning_rate")
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * conf.learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False,
                                       name="global_step")
        self.data_idx = tf.Variable(0, trainable=False, name="data_idx")
        self.data_idx_inc_op = self.data_idx.assign(
            self.data_idx + conf.batch_size)

        self.optimizers = {
            "SGD": tf.train.GradientDescentOptimizer(self.learning_rate),
            "Adadelta": tf.train.AdadeltaOptimizer(self.learning_rate),
            "Adagrad": tf.train.AdagradOptimizer(self.learning_rate),
            "AdagradDA": tf.train.AdagradDAOptimizer(self.learning_rate,
                                                     self.global_step),
            "Moment": tf.train.MomentumOptimizer(self.learning_rate, 0.9),
            "Ftrl": tf.train.FtrlOptimizer(self.learning_rate),
            "RMSProp": tf.train.RMSPropOptimizer(self.learning_rate)
        }
        self.opt = self.optimizers[conf.opt_name]
        tmp = set(tf.global_variables())

        if job_type == "worker":
            self.opt = SyncReplicasOptimizer(self.opt,
                                             conf.replicas_to_aggregate,
                                             conf.total_num_replicas)
            grads_and_vars = self.opt.compute_gradients(loss=self.loss)
            gradients, variables = zip(*grads_and_vars)
        else:
            gradients = tf.gradients(self.loss, tf.trainable_variables())
            variables = tf.trainable_variables()

        clipped_gradients, self.grad_norm = tf.clip_by_global_norm(
            gradients, conf.max_gradient_norm)
        self.update = self.opt.apply_gradients(
            zip(clipped_gradients, variables), self.global_step)

        self.optimizer_params.append(self.learning_rate)
        self.optimizer_params.extend(
            list(set(tf.global_variables()) - tmp))
        self.global_params.extend([self.global_step, self.data_idx])

    self.saver = tf.train.Saver(max_to_keep=conf.max_to_keep)
    self.model_exporter = exporter.Exporter(self.saver)
    inputs = {
        "enc_querys:0": self.enc_querys,
        "query_lens:0": self.query_lens,
        "enc_posts:0": self.enc_posts,
        "post_lens:0": self.post_lens,
        "enc_resps:0": self.enc_resps,
        "resp_lens:0": self.resp_lens
    }
    outputs = {"out": self.scores}
    self.model_exporter.init(
        tf.get_default_graph().as_graph_def(),
        named_graph_signatures={
            "inputs": exporter.generic_signature(inputs),
            "outputs": exporter.generic_signature(outputs)
        })
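# The ranking objective above is a margin-1 hinge: the loss is zero once the
# positive pair outscores the negative pair by at least 1. A tiny check:
import tensorflow as tf

pos = tf.constant([2.0, 0.5])
neg = tf.constant([0.0, 0.7])
hinge = tf.reduce_mean(tf.nn.relu(1.0 + neg - pos))
with tf.Session() as sess:
    print(sess.run(hinge))  # mean(max(0, -1), max(0, 1.2)) = 0.6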
def build(self, for_deploy, variants=""):
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_size = 1 if (not for_deploy or variants == "score") \
                       else sum(self.conf.beam_splits)

    # Input maps
    self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                            value_dtype=tf.int64,
                                            default_value=UNK_ID,
                                            shared_name="in_table",
                                            name="in_table",
                                            checkpoint=True)
    self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value="_UNK",
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)

    graphlg.info("Creating placeholders...")
    self.enc_str_inps = tf.placeholder(tf.string,
                                       shape=(None, conf.input_max_len),
                                       name="enc_inps")
    self.enc_lens = tf.placeholder(tf.int32, shape=[None], name="enc_lens")
    self.dec_str_inps = tf.placeholder(tf.string,
                                       shape=[None, conf.output_max_len + 2],
                                       name="dec_inps")
    self.dec_lens = tf.placeholder(tf.int32, shape=[None], name="dec_lens")
    self.down_wgts = tf.placeholder(tf.float32, shape=[None],
                                    name="down_wgts")

    # lookup
    self.enc_inps = self.in_table.lookup(self.enc_str_inps)
    self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.output_vocab_size, conf.embedding_size])
        self.emb_inps = embedding_lookup_unique(self.embedding,
                                                self.enc_inps)

    graphlg.info("Creating dynamic rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(
        conf.cell_model, conf.num_units, conf.num_layers, self.emb_inps,
        self.enc_lens, keep_prob=1.0, bidi=conf.bidirectional)
    memory = tf.reshape(tf.concat([self.enc_outs] * self.beam_size, 2),
                        [-1, conf.input_max_len, mem_size])
    memory_lens = tf.squeeze(
        tf.reshape(
            tf.concat([tf.expand_dims(self.enc_lens, 1)] * self.beam_size,
                      1), [-1, 1]), 1)
    batch_size = tf.shape(self.enc_outs)[0]

    graphlg.info("Creating out_proj...")
    if conf.out_layer_size:
        w = tf.get_variable(
            "proj_w", [conf.out_layer_size, conf.output_vocab_size],
            dtype=dtype)
    elif conf.bidirectional:
        w = tf.get_variable(
            "proj_w", [conf.num_units * 2, conf.output_vocab_size],
            dtype=dtype)
    else:
        w = tf.get_variable(
            "proj_w", [conf.num_units, conf.output_vocab_size], dtype=dtype)
    b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)
    self.out_proj = (w, b)

    graphlg.info("Preparing decoder inps...")
    dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1])
    with ops.device("/cpu:0"):
        emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    # Attention
    with variable_scope.variable_scope("decoder", dtype=dtype) as scope:
        decoder_cell = CreateMultiRNNCell(conf.cell_model, mem_size,
                                          conf.num_layers,
                                          conf.output_keep_prob)
        max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2
        if conf.attention == "Luo":
            mechanism = dynamic_attention_wrapper.LuongAttention(
                num_units=mem_size, memory=memory,
                max_mem_size=max_mem_size,
                memory_sequence_length=memory_lens)
        elif conf.attention == "Bah":
            mechanism = dynamic_attention_wrapper.BahdanauAttention(
                num_units=mem_size, memory=memory,
                max_mem_size=max_mem_size,
                memory_sequence_length=memory_lens)
        else:
            print("Unknown attention style, must be Luo or Bah")
            exit(0)

        attn_cell = DynamicAttentionWrapper(cell=decoder_cell,
                                            attention_mechanism=mechanism,
                                            attention_size=mem_size,
                                            addmem=self.conf.addmem)

        # Zeros for initial state
        zero_attn_states = attn_cell.zero_state(
            dtype=tf.float32, batch_size=batch_size * self.beam_size)
        init_probs = tf.zeros([batch_size * self.beam_size])
        #init_probs = tf.Print(init_probs, [tf.shape(init_probs)])

        # Encoder states for initial state, with vae
        init_states = []
        KLDs = tf.zeros([batch_size * self.beam_size])
        zs = []
        for i, each in enumerate(self.enc_states):
            if isinstance(each, LSTMStateTuple):
                new_c = tf.reshape(tf.concat([each.c] * self.beam_size, 1),
                                   [-1, mem_size])
                new_h = tf.reshape(tf.concat([each.h] * self.beam_size, 1),
                                   [-1, mem_size])
                #vae_c, KLD_c, l2_c = CreateVAE(new_c, self.conf.enc_latent_dim)
                vae_h, KLD, l2 = CreateVAE(new_h, self.conf.enc_latent_dim,
                                           stddev=self.conf.stddev,
                                           name="vae", reuse=(i != 0))
                #vae_h, KLD, l2 = CreateVAE(each.h, self.conf.enc_latent_dim, stddev=self.conf.stddev, name="vae", reuse=(i != 0))
                zs.append(tf.concat([each.c, vae_h], 1))
                beam_vae_h = tf.reshape(tf.tile(vae_h, [1, self.beam_size]),
                                        [-1, mem_size])
                new_c = tf.reshape(tf.tile(each.c, [1, self.beam_size]),
                                   [-1, mem_size])
                init_states.append(LSTMStateTuple(new_c, vae_h))
                KLDs += KLD
            else:
                zs.append(each)
                state = tf.reshape(tf.concat([each] * self.beam_size, 1),
                                   [-1, mem_size])
                vae_state, KLD, l2 = CreateVAE(state,
                                               self.conf.enc_latent_dim,
                                               name="vae",
                                               stddev=self.conf.stddev,
                                               reuse=(i != 0))
                init_states.append(vae_state)
                KLDs += KLD

        z = tf.concat(zs, 1)
        zero_attn_states = DynamicAttentionWrapperState(
            tuple(init_states), zero_attn_states.attention,
            zero_attn_states.newmem, zero_attn_states.alignments)

    if not for_deploy:
        dec_init_state = zero_attn_states
        hp_train = helper.ScheduledEmbeddingTrainingHelper(
            inputs=emb_dec_inps, sequence_length=self.dec_lens,
            embedding=self.embedding, sampling_probability=0.0,
            out_proj=self.out_proj)
        output_layer = layers_core.Dense(
            self.conf.out_layer_size,
            use_bias=True) if self.conf.out_layer_size else None
        my_decoder = basic_decoder.BasicDecoder(
            cell=attn_cell, helper=hp_train, initial_state=dec_init_state,
            output_layer=output_layer)
        cell_outs, final_state = decoder.dynamic_decode(
            decoder=my_decoder, impute_finished=True,
            maximum_iterations=conf.output_max_len + 1, scope=scope)

        outputs = cell_outs.rnn_output
        L = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, int(self.out_proj[0].shape[0])])
        outputs = tf.matmul(outputs, self.out_proj[0]) + self.out_proj[1]
        logits = tf.reshape(outputs, [-1, L, int(self.out_proj[0].shape[1])])

        # branch 1 for debugging, doesn't have to be called
        #m = tf.shape(self.outputs)[0]
        #self.mask = tf.zeros([m, int(w.shape[1])])
        #for i in [3]:
        #    self.mask = self.mask + tf.one_hot(indices=tf.ones([m], dtype=tf.int32) * i, on_value=100.0, depth=int(w.shape[1]))
        #self.outputs = self.outputs - self.mask
        self.outputs = tf.argmax(logits, axis=2)
        self.outputs = tf.reshape(self.outputs, [-1, L])
        self.outputs = self.out_table.lookup(tf.cast(self.outputs, tf.int64))

        # branch 2 for loss
        tars = tf.slice(self.dec_inps, [0, 1], [-1, L])
        wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1, reverse=True)
        batch_wgt = tf.reduce_sum(self.down_wgts) + 1e-12
        #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
        self.loss = loss.sequence_loss(logits=logits, targets=tars,
                                       weights=wgts,
                                       average_across_timesteps=False,
                                       average_across_batch=False)
        example_losses = tf.reduce_sum(self.loss, 1)
        see_loss = tf.reduce_sum(
            example_losses / tf.cast(self.dec_lens, tf.float32) *
            self.down_wgts) / batch_wgt
        KLD = tf.reduce_sum(KLDs * self.down_wgts) / batch_wgt
        self.loss = tf.reduce_sum(
            example_losses * self.down_wgts) / batch_wgt + KLD

        tf.summary.scalar("loss", see_loss)
        tf.summary.scalar("kld", KLD)

        graph_nodes = {
            "loss": self.loss,
            "inputs": {},
            "outputs": {},
            "debug_outputs": self.outputs
        }
        #saver
        return graph_nodes
    else:
        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens
        }
        if variants == "score":
            dec_init_state = zero_attn_states
            hp_train = helper.ScheduledEmbeddingTrainingHelper(
                inputs=emb_dec_inps, sequence_length=self.dec_lens,
                embedding=self.embedding, sampling_probability=0.0,
                out_proj=self.out_proj)
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            my_decoder = score_decoder.ScoreDecoder(
                cell=attn_cell, helper=hp_train, out_proj=self.out_proj,
                initial_state=dec_init_state, output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=False)

            L = tf.shape(cell_outs.logprobs)[1]
            one_hot = tf.one_hot(tf.slice(self.dec_inps, [0, 1], [-1, L]),
                                 depth=self.conf.output_vocab_size, axis=-1,
                                 on_value=1.0, off_value=0.0)
            outputs = tf.reduce_sum(cell_outs.logprobs * one_hot, 2)
            outputs = tf.reduce_sum(outputs, axis=1)
            graph_nodes = {
                "loss": None,
                "inputs": inputs,
                "outputs": {"logprobs": outputs},
                "visualize": None
            }
            return graph_nodes
        else:
            dec_init_state = beam_decoder.BeamState(
                tf.zeros([batch_size * self.beam_size]), zero_attn_states,
                tf.zeros([batch_size * self.beam_size], tf.int32))
            #dec_init_state = nest.map_structure(lambda x: tf.Print(x, [tf.shape(x)], message=str(x) + "dec_init"), dec_init_state)
            hp_infer = helper.GreedyEmbeddingHelper(
                embedding=self.embedding,
                start_tokens=tf.ones(shape=[batch_size * self.beam_size],
                                     dtype=tf.int32),
                end_token=EOS_ID, out_proj=self.out_proj)
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            my_decoder = beam_decoder.BeamDecoder(
                cell=attn_cell, helper=hp_infer, out_proj=self.out_proj,
                initial_state=dec_init_state,
                beam_splits=self.conf.beam_splits,
                max_res_num=self.conf.max_res_num,
                output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder, scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=True)

            L = tf.shape(cell_outs.beam_ends)[1]
            beam_symbols = cell_outs.beam_symbols
            beam_parents = cell_outs.beam_parents
            beam_ends = cell_outs.beam_ends
            beam_end_parents = cell_outs.beam_end_parents
            beam_end_probs = cell_outs.beam_end_probs
            alignments = cell_outs.alignments

            beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]),
                                   [-1, L])
            beam_end_parents = tf.reshape(
                tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
            beam_end_probs = tf.reshape(
                tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

            ## Creating tail_ids
            batch_size = tf.Print(batch_size, [batch_size],
                                  message="VAERNN batch")
            #beam_symbols = tf.Print(cell_outs.beam_symbols, [tf.shape(cell_outs.beam_symbols)], message="beam_symbols")
            #beam_parents = tf.Print(cell_outs.beam_parents, [tf.shape(cell_outs.beam_parents)], message="beam_parents")
            #beam_ends = tf.Print(cell_outs.beam_ends, [tf.shape(cell_outs.beam_ends)], message="beam_ends")
            #beam_end_parents = tf.Print(cell_outs.beam_end_parents, [tf.shape(cell_outs.beam_end_parents)], message="beam_end_parents")
            #beam_end_probs = tf.Print(cell_outs.beam_end_probs, [tf.shape(cell_outs.beam_end_probs)], message="beam_end_probs")
            #alignments = tf.Print(cell_outs.alignments, [tf.shape(cell_outs.alignments)], message="beam_attns")

            batch_offset = tf.expand_dims(
                tf.cumsum(
                    tf.ones([batch_size, self.beam_size], dtype=tf.int32) *
                    self.beam_size, axis=0, exclusive=True), 2)
            offset2 = tf.expand_dims(
                tf.cumsum(
                    tf.ones([batch_size, self.beam_size * 2],
                            dtype=tf.int32) * self.beam_size,
                    axis=0, exclusive=True), 2)

            out_len = tf.shape(beam_symbols)[1]
            self.beam_symbol_strs = tf.reshape(
                self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
                [batch_size, self.beam_size, -1])
            self.beam_parents = tf.reshape(
                beam_parents,
                [batch_size, self.beam_size, -1]) - batch_offset
            self.beam_ends = tf.reshape(
                beam_ends, [batch_size, self.beam_size * 2, -1])
            self.beam_end_parents = tf.reshape(
                beam_end_parents,
                [batch_size, self.beam_size * 2, -1]) - offset2
            self.beam_end_probs = tf.reshape(
                beam_end_probs, [batch_size, self.beam_size * 2, -1])
            self.beam_attns = tf.reshape(
                alignments, [batch_size, self.beam_size, out_len, -1])

            outputs = {
                "beam_symbols": self.beam_symbol_strs,
                "beam_parents": self.beam_parents,
                "beam_ends": self.beam_ends,
                "beam_end_parents": self.beam_end_parents,
                "beam_end_probs": self.beam_end_probs,
                "beam_attns": self.beam_attns
            }
            graph_nodes = {
                "loss": None,
                "inputs": inputs,
                "outputs": outputs,
                "visualize": {"z": z}
            }
            return graph_nodes
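# Numeric check of the batch_offset/offset2 trick above: beam_parents index
# rows of a flat [batch * beam_size] layout, and subtracting this exclusive
# cumsum converts them back to per-example beam indices in [0, beam_size).
import tensorflow as tf

batch_size, beam_size = 3, 2
offset = tf.cumsum(tf.ones([batch_size, beam_size], tf.int32) * beam_size,
                   axis=0, exclusive=True)
with tf.Session() as sess:
    print(sess.run(offset))
    # [[0 0]
    #  [2 2]
    #  [4 4]]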
def build(self):
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype

    # Input maps
    self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
            default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True)
    self.topic_in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
            default_value=2, shared_name="topic_in_table", name="topic_in_table", checkpoint=True)
    self.out_table = lookup.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string,
            default_value="_UNK", shared_name="out_table", name="out_table", checkpoint=True)

    graphlg.info("Creating placeholders...")
    self.enc_str_inps = tf.placeholder(tf.string, shape=(None, conf.input_max_len), name="enc_inps")
    self.enc_lens = tf.placeholder(tf.int32, shape=[None], name="enc_lens")
    self.enc_str_topics = tf.placeholder(tf.string, shape=(None, None), name="enc_topics")
    self.dec_str_inps = tf.placeholder(tf.string, shape=[None, conf.output_max_len + 2], name="dec_inps")
    self.dec_lens = tf.placeholder(tf.int32, shape=[None], name="dec_lens")

    # table lookup
    self.enc_inps = self.in_table.lookup(self.enc_str_inps)
    self.enc_topics = self.topic_in_table.lookup(self.enc_str_topics)
    self.dec_inps = self.in_table.lookup(self.dec_str_inps)
    batch_size = tf.shape(self.enc_inps)[0]

    with variable_scope.variable_scope(self.model_kind, dtype=dtype) as scope:
        # Create encode graph and get attn states
        graphlg.info("Creating embeddings and doing lookup...")
        t_major_enc_inps = tf.transpose(self.enc_inps)
        with ops.device("/cpu:0"):
            self.embedding = variable_scope.get_variable(
                    "embedding", [conf.input_vocab_size, conf.embedding_size])
            self.emb_enc_inps = embedding_lookup_unique(self.embedding, t_major_enc_inps)
            self.topic_embedding = variable_scope.get_variable(
                    "topic_embedding", [conf.topic_vocab_size, conf.topic_embedding_size],
                    trainable=False)
            self.emb_enc_topics = embedding_lookup_unique(self.topic_embedding, self.enc_topics)

        graphlg.info("Creating out projection weights...")
        if conf.out_layer_size is not None:
            w = tf.get_variable("proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype)
        else:
            w = tf.get_variable("proj_w", [conf.num_units, conf.output_vocab_size], dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)
        self.out_proj = (w, b)

        graphlg.info("Creating encoding dynamic rnn...")
        with variable_scope.variable_scope("encoder", dtype=dtype) as scope:
            if conf.bidirectional:
                cell_fw = CreateMultiRNNCell(conf.cell_model, conf.num_units, conf.num_layers, conf.output_keep_prob)
                cell_bw = CreateMultiRNNCell(conf.cell_model, conf.num_units, conf.num_layers, conf.output_keep_prob)
                self.enc_outs, self.enc_states = bidirectional_dynamic_rnn(
                        cell_fw=cell_fw, cell_bw=cell_bw, inputs=self.emb_enc_inps,
                        sequence_length=self.enc_lens, dtype=dtype,
                        parallel_iterations=16, time_major=True, scope=scope)
                fw_s, bw_s = self.enc_states
                self.enc_states = tuple([tf.concat([f, b], axis=1) for f, b in zip(fw_s, bw_s)])
                self.enc_outs = tf.concat([self.enc_outs[0], self.enc_outs[1]], axis=2)
            else:
                cell = CreateMultiRNNCell(conf.cell_model, conf.num_units, conf.num_layers, conf.output_keep_prob)
                self.enc_outs, self.enc_states = dynamic_rnn(
                        cell=cell, inputs=self.emb_enc_inps, sequence_length=self.enc_lens,
                        parallel_iterations=16, scope=scope, dtype=dtype, time_major=True)
        attn_len = tf.shape(self.enc_outs)[0]

        graphlg.info("Preparing init attention and states for decoder...")
        initial_state = self.enc_states
        attn_states = tf.transpose(self.enc_outs, perm=[1, 0, 2])
        attn_size = self.conf.num_units
        topic_attn_size = self.conf.num_units
        k = tf.get_variable("topic_proj", [1, 1, self.conf.topic_embedding_size, topic_attn_size])
        topic_attn_states = nn_ops.conv2d(tf.expand_dims(self.emb_enc_topics, 2), k, [1, 1, 1, 1], "SAME")
        topic_attn_states = tf.squeeze(topic_attn_states, axis=2)

        graphlg.info("Creating decoder cell...")
        with variable_scope.variable_scope("decoder", dtype=dtype) as scope:
            cell = CreateMultiRNNCell(conf.cell_model, attn_size, conf.num_layers, conf.output_keep_prob)
            # topic
            if not self.for_deploy:
                graphlg.info("Embedding decoder inps, tars and tar weights...")
                t_major_dec_inps = tf.transpose(self.dec_inps)
                t_major_tars = tf.slice(t_major_dec_inps, [1, 0], [conf.output_max_len + 1, -1])
                t_major_dec_inps = tf.slice(t_major_dec_inps, [0, 0], [conf.output_max_len + 1, -1])
                t_major_tar_wgts = tf.cumsum(tf.one_hot(self.dec_lens - 1, conf.output_max_len + 1, axis=0),
                        axis=0, reverse=True)
                with ops.device("/cpu:0"):
                    emb_dec_inps = embedding_lookup_unique(self.embedding, t_major_dec_inps)

                # The helper masks by decoder lengths (dec_lens); the original
                # passed enc_lens here, which mismatches the decoder inputs.
                hp_train = helper.ScheduledEmbeddingTrainingHelper(
                        inputs=emb_dec_inps, sequence_length=self.dec_lens,
                        embedding=self.embedding, sampling_probability=0.0,
                        out_proj=self.out_proj, except_ids=None, time_major=True)
                output_layer = None
                my_decoder = AttnTopicDecoder(cell=cell, helper=hp_train,
                        initial_state=initial_state, attn_states=attn_states, attn_size=attn_size,
                        topic_attn_states=topic_attn_states, topic_attn_size=topic_attn_size,
                        output_layer=output_layer)
                t_major_cell_outs, final_state = decoder.dynamic_decode(
                        decoder=my_decoder, output_time_major=True,
                        maximum_iterations=conf.output_max_len + 1, scope=scope)
                t_major_outs = t_major_cell_outs.rnn_output

                # Branch 1 for debugging; doesn't have to be called
                self.outputs = tf.transpose(t_major_outs, perm=[1, 0, 2])
                L = tf.shape(self.outputs)[1]
                w, b = self.out_proj
                self.outputs = tf.reshape(self.outputs, [-1, int(w.shape[0])])
                self.outputs = tf.matmul(self.outputs, w) + b
                # For masking the except_ids when debugging
                #m = tf.shape(self.outputs)[0]
                #self.mask = tf.zeros([m, int(w.shape[1])])
                #for i in [3]:
                #    self.mask = self.mask + tf.one_hot(indices=tf.ones([m], dtype=tf.int32) * i, on_value=100.0, depth=int(w.shape[1]))
                #self.outputs = self.outputs - self.mask
                self.outputs = tf.argmax(self.outputs, axis=1)
                self.outputs = tf.reshape(self.outputs, [-1, L])
                self.outputs = self.out_table.lookup(tf.cast(self.outputs, tf.int64))

                # Branch 2 for loss
                self.loss = dyn_sequence_loss(self.conf, t_major_outs, self.out_proj,
                        t_major_tars, t_major_tar_wgts)
                self.summary = tf.summary.scalar("%s/loss" % self.name, self.loss)

                # backpropagation
                self.build_backprop(self.loss, conf, dtype)

                # saver; topic_embedding is non-trainable, so it is added here once
                # (the original appended it to need_to_save a second time as well)
                self.trainable_params.extend(tf.trainable_variables() + [self.topic_embedding])
                need_to_save = self.global_params + self.trainable_params + self.optimizer_params \
                        + tf.get_default_graph().get_collection("saveable_objects")
                self.saver = tf.train.Saver(need_to_save, max_to_keep=conf.max_to_keep)
            else:
                hp_infer = helper.GreedyEmbeddingHelper(embedding=self.embedding,
                        start_tokens=tf.ones(shape=[batch_size], dtype=tf.int32),
                        end_token=EOS_ID, out_proj=self.out_proj)
                output_layer = None  #layers_core.Dense(self.conf.outproj_from_size, use_bias=True)
                my_decoder = AttnTopicDecoder(cell=cell, helper=hp_infer,
                        initial_state=initial_state, attn_states=attn_states, attn_size=attn_size,
                        topic_attn_states=topic_attn_states, topic_attn_size=topic_attn_size,
                        output_layer=output_layer)
                cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder,
                        scope=scope, maximum_iterations=40)
                self.outputs = cell_outs.sample_id
                # lookup
                self.outputs = self.out_table.lookup(tf.cast(self.outputs, tf.int64))

                # saver
                self.trainable_params.extend(tf.trainable_variables())
                self.saver = tf.train.Saver(max_to_keep=conf.max_to_keep)

                # Exporter for serving
                self.model_exporter = exporter.Exporter(self.saver)
                inputs = {"enc_inps": self.enc_str_inps, "enc_lens": self.enc_lens}
                outputs = {"out": self.outputs}
                self.model_exporter.init(tf.get_default_graph().as_graph_def(),
                        named_graph_signatures={
                            "inputs": exporter.generic_signature(inputs),
                            "outputs": exporter.generic_signature(outputs)
                        })
                graphlg.info("Graph done")
                graphlg.info("")
        self.dec_states = final_state
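# Standalone sketch: how the one_hot + reversed-cumsum target-weight trick
# used above behaves on toy lengths (the lengths and T below are
# illustrative, not values from the model).
import tensorflow as tf

dec_lens = tf.constant([2, 4])   # toy decoder lengths
T = 5                            # stands in for conf.output_max_len + 1
# one_hot puts a 1.0 at step (len - 1) of each column; a reversed cumsum
# then fills steps 0..len-1 with 1.0 and leaves padding steps at 0.0.
wgts = tf.cumsum(tf.one_hot(dec_lens - 1, T, axis=0), axis=0, reverse=True)
with tf.Session() as sess:
    print(sess.run(wgts))
    # [[1. 1.]
    #  [1. 1.]
    #  [0. 1.]
    #  [0. 1.]
    #  [0. 0.]]
# Each column is a time-major weight vector: 1.0 on real steps, 0.0 on padding.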
def build(self, inputs, for_deploy):
    scope = ""
    conf = self.conf
    dtype = self.dtype
    beam_size = 1 if not for_deploy else sum(conf.beam_splits)

    with tf.name_scope("WordEmbedding"):
        # Input maps
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
                default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string,
                default_value="_UNK", shared_name="out_table", name="out_table", checkpoint=True)
        enc_inps = self.in_table.lookup(inputs["enc_inps:0"])
        dec_inps = self.in_table.lookup(inputs["dec_inps:0"])

    graphlg.info("Creating embeddings and embedding enc_inps.")
    with tf.device("/cpu:0"):
        self.embedding = variable_scope.get_variable("embedding",
                [conf.output_vocab_size, conf.embedding_size])
        emb_inps = embedding_lookup_unique(self.embedding, enc_inps)
        emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)
    emb_dec_next_inps = tf.slice(emb_dec_inps, [0, 0, 0], [-1, conf.output_max_len + 1, -1])

    batch_size = tf.shape(enc_inps)[0]

    # Create encode graph and get attn states
    graphlg.info("Creating dynamic x rnn...")
    enc_outs, enc_states, mem_size, enc_state_size = DynEncode(conf.cell_model, conf.num_units,
            conf.num_layers, emb_inps, inputs["enc_lens:0"], keep_prob=1.0,
            bidi=conf.bidirectional, name_scope="DynEncodeX")

    with tf.variable_scope("AttnEncState") as scope2:
        mechanism = Luong1_2(num_units=conf.num_units, memory=enc_outs,
                max_mem_size=conf.input_max_len, memory_sequence_length=inputs["enc_lens:0"],
                name=scope2.original_name_scope)
        if isinstance(enc_states[-1], LSTMStateTuple):
            #score = tf.expand_dims(tf.nn.softmax(mechanism(enc_states[-1].h)), 1)
            score = tf.expand_dims(mechanism(enc_states[-1].h, ()), 1)
            attention_h = tf.squeeze(tf.matmul(score, enc_outs), 1)
            enc_state = LSTMStateTuple(enc_states[-1].c, attention_h)
        else:
            #score = tf.expand_dims(tf.nn.softmax(mechanism(enc_states[-1])), 1)
            score = tf.expand_dims(mechanism(enc_states[-1], ()), 1)
            enc_state = tf.squeeze(tf.matmul(score, enc_outs), 1)

    hidden_units = int(math.sqrt(mem_size * conf.enc_latent_dim))
    z, mu_prior, logvar_prior = Ptheta([enc_state], hidden_units, conf.enc_latent_dim,
            stddev=1, prior_type=conf.prior_type, name_scope="EncToPtheta")

    KLD = 0.0
    # Y inputs for posterior z when training
    if not for_deploy:
        #with tf.name_scope("variational_distribution") as scope:
        y_emb_inps = tf.slice(emb_dec_inps, [0, 1, 0], [-1, -1, -1])
        y_enc_outs, y_enc_states, y_mem_size, y_enc_state_size = DynEncode(conf.cell_model,
                conf.num_units, conf.num_layers, y_emb_inps, inputs["dec_lens:0"],
                keep_prob=conf.keep_prob, bidi=False, name_scope="DynEncodeY")
        z, KLD, l2 = VAE([enc_state, y_enc_states[-1]], conf.enc_latent_dim,
                mu_prior, logvar_prior, name_scope="VAE")

    # project z + x_thinking_state to decoder state
    with tf.name_scope("GatedZState"):
        if isinstance(enc_state, LSTMStateTuple):
            h_gate = tf.layers.dense(z, int(enc_state.h.get_shape()[1]), use_bias=True,
                    name="z_gate_h", activation=tf.sigmoid)
            c_gate = tf.layers.dense(z, int(enc_state.c.get_shape()[1]), use_bias=True,
                    name="z_gate_c", activation=tf.sigmoid)
            raw_dec_states = tf.concat([c_gate * enc_state.c, h_gate * enc_state.h, z], 1)
            #raw_dec_states = LSTMStateTuple(tf.concat([c_gate * enc_state.c, z], 1), tf.concat([h_gate * enc_state.h, z], 1))
        else:
            gate = tf.layers.dense(z, int(enc_state.get_shape()[1]), use_bias=True,
                    name="z_gate", activation=tf.sigmoid)
            raw_dec_states = tf.concat([gate * enc_state, z], 1)

    # add BOW loss
    #num_hidden_units = int(math.sqrt(conf.output_vocab_size * int(decision_state.shape[1])))
    #bow_l1 = layers_core.Dense(num_hidden_units, use_bias=True, name="bow_hidden", activation=tf.tanh)
    #bow_l2 = layers_core.Dense(conf.output_vocab_size, use_bias=True, name="bow_out", activation=None)
    #bow = bow_l2(bow_l1(decision_state))
    #y_dec_inps = tf.slice(self.dec_inps, [0, 1], [-1, -1])
    #bow_y = tf.reduce_sum(tf.one_hot(y_dec_inps, on_value=1.0, off_value=0.0, axis=-1, depth=conf.output_vocab_size), axis=1)
    #batch_bow_losses = tf.reduce_sum(bow_y * (-1.0) * tf.nn.log_softmax(bow), axis=1)

    max_mem_size = conf.input_max_len + conf.output_max_len + 2

    with tf.name_scope("ShapeToBeam"):
        beam_raw_dec_states = nest.map_structure(lambda x: tile_batch(x, beam_size), raw_dec_states)
        beam_memory = nest.map_structure(lambda x: tile_batch(x, beam_size), enc_outs)
        beam_memory_lens = tf.squeeze(nest.map_structure(lambda x: tile_batch(x, beam_size),
                tf.expand_dims(inputs["enc_lens:0"], 1)), 1)
        beam_z = nest.map_structure(lambda x: tile_batch(x, beam_size), z)
    #def _to_beam(t):
    #    beam_t = tf.reshape(tf.tile(t, [1, beam_size]), [-1, int(t.get_shape()[1])])
    #    return beam_t
    #with tf.name_scope("ShapeToBeam") as scope:
    #    beam_raw_dec_states = tf.contrib.framework.nest.map_structure(_to_beam, raw_dec_states)
    #    beam_memory = tf.reshape(tf.tile(self.enc_outs, [1, 1, beam_size]), [-1, conf.input_max_len, mem_size])
    #    beam_memory_lens = tf.squeeze(tf.reshape(tf.tile(tf.expand_dims(inputs["enc_lens:0"], 1), [1, beam_size]), [-1, 1]), 1)
    #    beam_z = tf.contrib.framework.nest.map_structure(_to_beam, z)

    #cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size, num_layers=conf.num_layers,
    #        attn_type=conf.attention, memory=beam_memory, mem_lens=beam_memory_lens,
    #        max_mem_size=max_mem_size, addmem=conf.addmem, z=beam_z, keep_prob=conf.keep_prob,
    #        dtype=tf.float32)
    #with tf.variable_scope("DynDecode/AttnCell") as dyn_scope:
    decoder_multi_rnn_cells = CreateMultiRNNCell(conf.cell_model, num_units=mem_size,
            num_layers=conf.num_layers, output_keep_prob=conf.keep_prob)
    zero_cell_states = DecCellStateInit(beam_raw_dec_states, decoder_multi_rnn_cells, name="InitCell")
    attn_cell = AttnCellWrapper(cell=decoder_multi_rnn_cells, cell_init_states=zero_cell_states,
            attn_type=conf.attention, attn_size=mem_size, memory=beam_memory,
            mem_lens=beam_memory_lens, max_mem_size=max_mem_size, addmem=conf.addmem,
            z=beam_z, dtype=tf.float32, name="AttnWrapper")

    if self.conf.attention:
        dec_init_state = None
    else:
        dec_init_state = beam_decoder.BeamState(tf.zeros_like(beam_memory_lens, tf.float32),
                zero_cell_states, tf.zeros_like(beam_memory_lens))

    with tf.variable_scope("OutProj"):
        graphlg.info("Creating out_proj...")
        if conf.out_layer_size:
            w = tf.get_variable("proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype)
        else:
            w = tf.get_variable("proj_w", [mem_size, conf.output_vocab_size], dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)
        out_proj = (w, b)

    if not for_deploy:
        hp_train = helper1_2.ScheduledEmbeddingTrainingHelper(inputs=emb_dec_next_inps,
                sequence_length=inputs["dec_lens:0"], embedding=self.embedding,
                sampling_probability=0.0, out_proj=out_proj)
        output_layer = layers_core.Dense(conf.out_layer_size, use_bias=True) if conf.out_layer_size else None
        my_decoder = basic_decoder1_2.BasicDecoder(cell=attn_cell, helper=hp_train,
                initial_state=dec_init_state, output_layer=output_layer)
        cell_outs, final_state, seq_len = decoder1_2.dynamic_decode(decoder=my_decoder,
                impute_finished=True, maximum_iterations=conf.output_max_len + 1)
        #cell_outs = tf.Print(cell_outs, [tf.shape(cell_outs)], message="cell_outs_shape")

        with tf.name_scope("Logits"):
            L = tf.shape(cell_outs.rnn_output)[1]
            rnn_output = tf.reshape(cell_outs.rnn_output, [-1, int(out_proj[0].shape[0])])
            rnn_output = tf.matmul(rnn_output, out_proj[0]) + out_proj[1]
            logits = tf.reshape(rnn_output, [-1, L, int(out_proj[0].shape[1])])

        with tf.name_scope("DebugOutputs") as scope:
            outputs = tf.argmax(logits, axis=2)
            outputs = tf.reshape(outputs, [-1, L])
            outputs = self.out_table.lookup(tf.cast(outputs, tf.int64))

        # branch 2 for loss
        with tf.name_scope("Loss") as scope:
            tars = tf.slice(dec_inps, [0, 1], [-1, L])
            # wgts could take a more complicated form (e.g. partial down-weighting
            # of a sequence); here 1.0 weights are used for all non-padding labels.
            wgts = tf.cumsum(tf.one_hot(inputs["dec_lens:0"], L), axis=1, reverse=True)
            #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
            loss_matrix = loss.sequence_loss(logits=logits, targets=tars, weights=wgts,
                    average_across_timesteps=False, average_across_batch=False)
            #bow_loss = tf.reduce_sum(batch_bow_losses * self.down_wgts) / batch_wgt
            example_total_wgts = tf.reduce_sum(wgts, 1)
            total_wgts = tf.reduce_sum(example_total_wgts)
            example_losses = tf.reduce_sum(loss_matrix, 1)
            see_loss = tf.reduce_sum(example_losses) / total_wgts
            KLD = tf.reduce_sum(KLD * example_total_wgts) / total_wgts
            self.loss = tf.reduce_sum(example_losses + conf.kld_ratio * KLD) / total_wgts

        with tf.name_scope(self.model_kind):
            tf.summary.scalar("loss", see_loss)
            tf.summary.scalar("kld", KLD)
            #tf.summary.scalar("bow", bow_loss)
            for each in tf.trainable_variables():
                tf.summary.histogram(each.name, each)

        graph_nodes = {
            "loss": self.loss,
            "inputs": inputs,
            "debug_outputs": outputs,
            "outputs": {},
            "visualize": None
        }
        return graph_nodes
    else:
        beam_batch_size = tf.shape(beam_memory_lens)[0]
        hp_infer = helper1_2.GreedyEmbeddingHelper(embedding=self.embedding,
                start_tokens=tf.ones([beam_batch_size], dtype=tf.int32),
                end_token=EOS_ID, out_proj=out_proj)
        output_layer = layers_core.Dense(conf.out_layer_size, use_bias=True) if conf.out_layer_size else None
        my_decoder = beam_decoder.BeamDecoder(cell=attn_cell, helper=hp_infer, out_proj=out_proj,
                initial_state=dec_init_state, beam_splits=conf.beam_splits,
                max_res_num=conf.max_res_num, output_layer=output_layer)
        #cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder, scope=scope, maximum_iterations=conf.output_max_len)
        cell_outs, final_state, seq_len = decoder1_2.dynamic_decode(decoder=my_decoder,
                impute_finished=True, maximum_iterations=conf.output_max_len + 1)

        L = tf.shape(cell_outs.beam_ends)[1]
        beam_symbols = cell_outs.beam_symbols
        beam_parents = cell_outs.beam_parents
        beam_ends = cell_outs.beam_ends
        beam_end_parents = cell_outs.beam_end_parents
        beam_end_probs = cell_outs.beam_end_probs
        alignments = cell_outs.alignments

        beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L])
        beam_end_parents = tf.reshape(tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
        beam_end_probs = tf.reshape(tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

        # Creating tail_ids
        # integer division; plain / would yield a float tensor unusable in shapes
        batch_size = beam_batch_size // beam_size
        batch_size = tf.Print(batch_size, [batch_size], message="BATCH")
        #beam_symbols = tf.Print(cell_outs.beam_symbols, [tf.shape(cell_outs.beam_symbols)], message="beam_symbols")
        #beam_parents = tf.Print(cell_outs.beam_parents, [tf.shape(cell_outs.beam_parents)], message="beam_parents")
        #beam_ends = tf.Print(cell_outs.beam_ends, [tf.shape(cell_outs.beam_ends)], message="beam_ends")
        #beam_end_parents = tf.Print(cell_outs.beam_end_parents, [tf.shape(cell_outs.beam_end_parents)], message="beam_end_parents")
        #beam_end_probs = tf.Print(cell_outs.beam_end_probs, [tf.shape(cell_outs.beam_end_probs)], message="beam_end_probs")
        #alignments = tf.Print(cell_outs.alignments, [tf.shape(cell_outs.alignments)], message="beam_attns")

        batch_offset = tf.expand_dims(tf.cumsum(tf.ones([batch_size, beam_size],
                dtype=tf.int32) * beam_size, axis=0, exclusive=True), 2)
        offset2 = tf.expand_dims(tf.cumsum(tf.ones([batch_size, beam_size * 2],
                dtype=tf.int32) * beam_size, axis=0, exclusive=True), 2)
        out_len = tf.shape(beam_symbols)[1]
        self.beam_symbol_strs = tf.reshape(self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
                [batch_size, beam_size, -1])
        self.beam_parents = tf.reshape(beam_parents, [batch_size, beam_size, -1]) - batch_offset
        self.beam_ends = tf.reshape(beam_ends, [batch_size, beam_size * 2, -1])
        self.beam_end_parents = tf.reshape(beam_end_parents, [batch_size, beam_size * 2, -1]) - offset2
        self.beam_end_probs = tf.reshape(beam_end_probs, [batch_size, beam_size * 2, -1])
        self.beam_attns = tf.reshape(alignments, [batch_size, beam_size, out_len, -1])  #cell_outs.alignments
        #self.outputs = tf.concat([outputs_str, tf.cast(cell_outs.beam_parents, tf.string)], 1)
        #ones = tf.ones([batch_size, self.beam_size], dtype=tf.int32)
        #aux_matrix = tf.cumsum(ones * self.beam_size, axis=0, exclusive=True)
        #tm_beam_parents_reverse = tf.reverse(tf.transpose(cell_outs.beam_parents), axis=[0])
        #beam_probs = final_state[1]
        #def traceback(prev_out, curr_input):
        #    return tf.gather(curr_input, prev_out)
        #
        #tail_ids = tf.reshape(tf.cumsum(ones, axis=1, exclusive=True) + aux_matrix, [-1])
        #tm_symbol_index_reverse = tf.scan(traceback, tm_beam_parents_reverse, initializer=tail_ids)
        ## Create beam index for symbols, and other info
        #tm_symbol_index = tf.concat([tf.expand_dims(tail_ids, 0), tm_symbol_index_reverse], axis=0)
        #tm_symbol_index = tf.reverse(tm_symbol_index, axis=[0])
        #tm_symbol_index = tf.slice(tm_symbol_index, [1, 0], [-1, -1])
        #symbol_index = tf.expand_dims(tf.transpose(tm_symbol_index), axis=2)
        #symbol_index = tf.concat([symbol_index, tf.cumsum(tf.ones_like(symbol_index), exclusive=True, axis=1)], axis=2)
        ## index alignments and output symbols
        #alignments = tf.gather_nd(cell_outs.alignments, symbol_index)
        #symbol_ids = tf.gather_nd(cell_outs.beam_symbols, symbol_index)
        ## outputs and other info
        #self.others = [alignments, beam_probs]
        #self.outputs = self.out_table.lookup(tf.cast(symbol_ids, tf.int64))

        outputs = {
            "beam_symbols": self.beam_symbol_strs,
            "beam_parents": self.beam_parents,
            "beam_ends": self.beam_ends,
            "beam_end_parents": self.beam_end_parents,
            "beam_end_probs": self.beam_end_probs,
            "beam_attns": self.beam_attns
        }
        infer_inputs = {}
        infer_inputs["enc_inps:0"] = inputs["enc_inps:0"]
        infer_inputs["enc_lens:0"] = inputs["enc_lens:0"]
        graph_nodes = {
            "loss": None,
            "inputs": infer_inputs,
            "outputs": outputs,
            "visualize": {"z": z}
        }
        return graph_nodes
def build(self):
    # Mirror the other build() variants: these attributes are set on the
    # model class elsewhere; the original body used the bare names without
    # defining them.
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    for_deploy = self.for_deploy

    # All possible inputs
    graphlg.info("Creating inputs and tables...")
    batch_size = None
    self.enc_querys = tf.placeholder(tf.string, shape=[batch_size, conf.input_max_len], name="enc_querys")
    self.query_lens = tf.placeholder(tf.int32, shape=[batch_size], name="query_lens")
    self.enc_posts = tf.placeholder(tf.string, shape=[batch_size, conf.input_max_len], name="enc_posts")
    self.post_lens = tf.placeholder(tf.int32, shape=[batch_size], name="post_lens")
    self.enc_resps = tf.placeholder(tf.string, shape=[batch_size, conf.input_max_len], name="enc_resps")
    self.resp_lens = tf.placeholder(tf.int32, shape=[batch_size], name="resp_lens")
    self.target = tf.placeholder(tf.float32, shape=[batch_size], name="target")

    #TODO table obj, lookup ops, embedding and its lookup op should be placed on the same device
    with tf.device("/cpu:0"):
        self.embedding = variable_scope.get_variable("embedding",
                [conf.input_vocab_size, conf.embedding_size],
                initializer=tf.random_uniform_initializer(-0.08, 0.08))
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
                default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True)
        self.query_embs = embedding_lookup_unique(self.embedding, self.in_table.lookup(self.enc_querys))
        self.post_embs = embedding_lookup_unique(self.embedding, self.in_table.lookup(self.enc_posts))
        self.resp_embs = embedding_lookup_unique(self.embedding, self.in_table.lookup(self.enc_resps))

    # MultiRNNCell
    graphlg.info("Creating multi-layer cells...")

    # Bi-RNN encoder
    graphlg.info("Creating bi-rnn...")
    #q_out = self.query_embs
    with variable_scope.variable_scope("q_rnn", dtype=dtype, reuse=None) as scope:
        cell1 = MultiRNNCell([CreateCell(conf) for _ in range(conf.num_layers)])
        cell2 = MultiRNNCell([CreateCell(conf) for _ in range(conf.num_layers)])
        q_out, q_out_state = bidirectional_dynamic_rnn(cell_fw=cell1, cell_bw=cell2,
                inputs=self.query_embs, sequence_length=self.query_lens,
                initial_state_fw=None, initial_state_bw=None, dtype=dtype,
                parallel_iterations=16, swap_memory=False, time_major=False, scope=scope)
    with variable_scope.variable_scope("p_rnn", dtype=dtype, reuse=None) as scope:
        cell1 = MultiRNNCell([CreateCell(conf) for _ in range(conf.num_layers)])
        cell2 = MultiRNNCell([CreateCell(conf) for _ in range(conf.num_layers)])
        p_out, p_out_state = bidirectional_dynamic_rnn(cell_fw=cell1, cell_bw=cell2,
                inputs=self.post_embs, sequence_length=self.post_lens,
                initial_state_fw=None, initial_state_bw=None, dtype=dtype,
                parallel_iterations=16, swap_memory=False, time_major=False, scope=scope)
    with variable_scope.variable_scope("r_rnn", dtype=dtype, reuse=None) as scope:
        cell1 = MultiRNNCell([CreateCell(conf) for _ in range(conf.num_layers)])
        cell2 = MultiRNNCell([CreateCell(conf) for _ in range(conf.num_layers)])
        r_out, r_out_state = bidirectional_dynamic_rnn(cell_fw=cell1, cell_bw=cell2,
                inputs=self.resp_embs, sequence_length=self.resp_lens,
                initial_state_fw=None, initial_state_bw=None, dtype=dtype,
                parallel_iterations=16, swap_memory=False, time_major=False, scope=scope)

    #q_out_state = tf.concat(q_out_state, axis=1)
    #p_out_state = tf.concat(p_out_state, axis=1)
    #r_out_state = tf.concat(r_out_state, axis=1)
    q_out = tf.concat(q_out, axis=2)
    p_out = tf.concat(p_out, axis=2)
    r_out = tf.concat(r_out, axis=2)

    # Three feature matrices
    graphlg.info("Creating three cnn feature matrices and cos dist...")
    with variable_scope.variable_scope("q_cnn1", dtype=dtype, reuse=None) as scope:
        q_m = FeatureMatrix(conf, q_out, scope=scope, dtype=dtype)
    with variable_scope.variable_scope("p_cnn1", dtype=dtype, reuse=None) as scope:
        p_m = FeatureMatrix(conf, p_out, scope=scope, dtype=dtype)
    with variable_scope.variable_scope("r_cnn1", dtype=dtype, reuse=None) as scope:
        r_m = FeatureMatrix(conf, r_out, scope=scope, dtype=dtype)

    graphlg.info("Creating interactions...")
    # h becomes 1 after max pooling
    q_vec = tf.reshape(q_m, [-1, conf.num_units * 1 * 2 * conf.c1])
    #q_vec = tf.reshape(q_m, [-1, 1 * 1 * conf.c1])
    p_vec = tf.reshape(p_m, [-1, conf.num_units * 1 * 2 * conf.c1])
    #p_vec = tf.reshape(p_m, [-1, 1 * 1 * conf.c1])
    r_vec = tf.reshape(r_m, [-1, conf.num_units * 1 * 2 * conf.c1])
    #r_vec = tf.reshape(r_m, [-1, 1 * 1 * conf.c1])

    norm_q = tf.sqrt(tf.reduce_sum(tf.square(q_vec), 1, keep_dims=True))
    norm_p = tf.sqrt(tf.reduce_sum(tf.square(p_vec), 1, keep_dims=True))
    norm_r = tf.sqrt(tf.reduce_sum(tf.square(r_vec), 1, keep_dims=True))
    cos_q_p = tf.reduce_sum(q_vec * p_vec, 1, keep_dims=True) / (norm_q * norm_p)
    cos_q_r = tf.reduce_sum(q_vec * r_vec, 1, keep_dims=True) / (norm_q * norm_r)

    qpcos_vec = tf.concat([q_vec, p_vec, cos_q_p], axis=1)
    qrcos_vec = tf.concat([q_vec, r_vec, cos_q_r], axis=1)
    #qpcos_vec = tf.concat([q_vec, p_vec], axis=1)
    #qrcos_vec = tf.concat([q_vec, r_vec], axis=1)

    h_size = int(math.sqrt(conf.num_units * 2 * 1 * conf.c1 * 2 + 1))
    qp_fc1 = tf.contrib.layers.fully_connected(inputs=qpcos_vec, num_outputs=h_size,
            activation_fn=relu,
            weights_initializer=tf.random_uniform_initializer(-0.08, 0.08),
            biases_initializer=tf.random_uniform_initializer(-0.08, 0.08))
    qp_fc2 = tf.contrib.layers.fully_connected(inputs=qp_fc1, num_outputs=1,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=tf.random_uniform_initializer(-0.2, 0.2),
            biases_initializer=tf.random_uniform_initializer(-0.4, 0.4))
    qr_fc1 = tf.contrib.layers.fully_connected(inputs=qrcos_vec, num_outputs=h_size,
            activation_fn=relu,
            weights_initializer=tf.random_uniform_initializer(-0.08, 0.08),
            biases_initializer=tf.random_uniform_initializer(-0.08, 0.08))
    qr_fc2 = tf.contrib.layers.fully_connected(inputs=qr_fc1, num_outputs=1,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=tf.random_uniform_initializer(-0.2, 0.2),
            biases_initializer=tf.random_uniform_initializer(-0.4, 0.4))
    self.scores = tf.squeeze(qp_fc2 * qr_fc2)

    graphlg.info("Creating optimizer and backpropagation...")
    self.global_params = []
    self.trainable_params = tf.global_variables()
    self.optimizer_params = []
    if not for_deploy:
        with variable_scope.variable_scope("deepmatch", dtype=dtype) as scope:
            self.loss = tf.reduce_mean(tf.square(self.target - self.scores))
            self.summary = tf.summary.scalar("%s/loss" % name, self.loss)
            self.learning_rate = tf.Variable(float(conf.learning_rate), trainable=False, name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                    self.learning_rate * conf.learning_rate_decay_factor)
            self.global_step = tf.Variable(0, trainable=False, name="global_step")
            self.data_idx = tf.Variable(0, trainable=False, name="data_idx")
            self.data_idx_inc_op = self.data_idx.assign(self.data_idx + conf.batch_size)

            graphlg.info("Creating backpropagation graph and optimizers...")
            self.optimizers = {
                "SGD": tf.train.GradientDescentOptimizer(self.learning_rate),
                "Adadelta": tf.train.AdadeltaOptimizer(self.learning_rate),
                "Adagrad": tf.train.AdagradOptimizer(self.learning_rate),
                "AdagradDA": tf.train.AdagradDAOptimizer(self.learning_rate, self.global_step),
                "Moment": tf.train.MomentumOptimizer(self.learning_rate, 0.9),
                "Ftrl": tf.train.FtrlOptimizer(self.learning_rate),
                "RMSProp": tf.train.RMSPropOptimizer(self.learning_rate)
            }
            self.opt = self.optimizers[conf.opt_name]
            tmp = set(tf.global_variables())
            if job_type == "worker":
                self.opt = SyncReplicasOptimizer(self.opt, conf.replicas_to_aggregate,
                        conf.total_num_replicas)
                grads_and_vars = self.opt.compute_gradients(loss=self.loss)
                gradients, variables = zip(*grads_and_vars)
            else:
                gradients = tf.gradients(self.loss, tf.trainable_variables())
                variables = tf.trainable_variables()
            clipped_gradients, self.grad_norm = tf.clip_by_global_norm(gradients, conf.max_gradient_norm)
            self.update = self.opt.apply_gradients(zip(clipped_gradients, variables), self.global_step)
            self.optimizer_params.append(self.learning_rate)
            self.optimizer_params.extend(list(set(tf.global_variables()) - tmp))
            self.global_params.extend([self.global_step, self.data_idx])
    self.saver = tf.train.Saver(max_to_keep=conf.max_to_keep)
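# The cosine computations above divide by a product of norms that can be
# zero for degenerate (all-zero) feature vectors after pooling. A common
# guard is an epsilon in the denominator; safe_cosine below is a
# hypothetical helper in that spirit, not something the original defines.
import tensorflow as tf

def safe_cosine(a, b, eps=1e-8):
    # a, b: [batch, dim] feature vectors; returns [batch, 1] similarities
    norm_a = tf.sqrt(tf.reduce_sum(tf.square(a), 1, keep_dims=True))
    norm_b = tf.sqrt(tf.reduce_sum(tf.square(b), 1, keep_dims=True))
    return tf.reduce_sum(a * b, 1, keep_dims=True) / (norm_a * norm_b + eps)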
def build(self, for_deploy, variants=""):
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_size = 1 if (not for_deploy or variants == "score") else sum(self.conf.beam_splits)

    graphlg.info("Creating placeholders...")
    self.enc_str_inps = tf.placeholder(tf.string, shape=(None, conf.input_max_len), name="enc_inps")
    self.enc_lens = tf.placeholder(tf.int32, shape=[None], name="enc_lens")
    self.dec_str_inps = tf.placeholder(tf.string, shape=[None, conf.output_max_len + 2], name="dec_inps")
    self.dec_lens = tf.placeholder(tf.int32, shape=[None], name="dec_lens")
    self.down_wgts = tf.placeholder(tf.float32, shape=[None], name="down_wgts")

    with tf.name_scope("TableLookup"):
        # lookup tables
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
                default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string,
                default_value="_UNK", shared_name="out_table", name="out_table", checkpoint=True)
        self.enc_inps = self.in_table.lookup(self.enc_str_inps)
        self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable("embedding",
                [conf.output_vocab_size, conf.embedding_size])

    with tf.name_scope("Embed") as scope:
        dec_inps = tf.slice(self.dec_inps, [0, 0], [-1, conf.output_max_len + 1])
        with ops.device("/cpu:0"):
            self.emb_inps = embedding_lookup_unique(self.embedding, self.enc_inps)
            emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    # output projector (w, b)
    with tf.variable_scope("OutProj"):
        if conf.out_layer_size:
            w = tf.get_variable("proj_w", [conf.out_layer_size, conf.output_vocab_size], dtype=dtype)
        elif conf.bidirectional:
            w = tf.get_variable("proj_w", [conf.num_units * 2, conf.output_vocab_size], dtype=dtype)
        else:
            w = tf.get_variable("proj_w", [conf.num_units, conf.output_vocab_size], dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)

    graphlg.info("Creating dynamic rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(conf.cell_model,
            conf.num_units, conf.num_layers, self.emb_inps, self.enc_lens, keep_prob=1.0,
            bidi=conf.bidirectional, name_scope="DynRNNEncoder")
    batch_size = tf.shape(self.enc_outs)[0]

    # Do vae on the state of the last layer of the encoder
    final_enc_states = []
    KLDs = 0.0
    for each in self.enc_states:
        z, KLD, l2 = CreateVAE([each], self.conf.enc_latent_dim, name_scope="VAE")
        if isinstance(each, LSTMStateTuple):
            final_enc_states.append(LSTMStateTuple(each.c, tf.concat([each.h, z], 1)))
        else:
            # fixed: this branch previously appended to an undefined `final_enc_state`
            final_enc_states.append(tf.concat([z, each], 1))
        KLDs += KLD / self.conf.num_layers

    with tf.name_scope("DynRNNDecode") as scope:
        with tf.name_scope("ShapeToBeam") as scope:
            beam_memory = tf.reshape(tf.tile(self.enc_outs, [1, 1, self.beam_size]),
                    [-1, conf.input_max_len, mem_size])
            beam_memory_lens = tf.squeeze(tf.reshape(tf.tile(tf.expand_dims(self.enc_lens, 1),
                    [1, self.beam_size]), [-1, 1]), 1)

            def _to_beam(t):
                return tf.reshape(tf.tile(t, [1, self.beam_size]), [-1, int(t.get_shape()[1])])

            beam_init_states = tf.contrib.framework.nest.map_structure(_to_beam, final_enc_states)

        max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2
        cell = AttnCell(cell_model=conf.cell_model, num_units=mem_size, num_layers=conf.num_layers,
                attn_type=self.conf.attention, memory=beam_memory, mem_lens=beam_memory_lens,
                max_mem_size=max_mem_size, addmem=self.conf.addmem, keep_prob=conf.keep_prob,
                dtype=tf.float32, name_scope="AttnCell")
        dec_init_state = DecStateInit(all_enc_states=beam_init_states, decoder_cell=cell,
                batch_size=batch_size * self.beam_size, init_type="each2each")

        if not for_deploy:
            hp_train = helper.ScheduledEmbeddingTrainingHelper(inputs=emb_dec_inps,
                    sequence_length=self.dec_lens, embedding=self.embedding,
                    sampling_probability=self.conf.sample_prob, out_proj=(w, b))
            output_layer = layers_core.Dense(self.conf.out_layer_size,
                    use_bias=True) if self.conf.out_layer_size else None
            my_decoder = basic_decoder.BasicDecoder(cell=cell, helper=hp_train,
                    initial_state=dec_init_state, output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder,
                    impute_finished=False, maximum_iterations=conf.output_max_len + 1, scope=scope)
        elif variants == "score":
            # zero_attn_states is assumed to be provided elsewhere for this variant
            dec_init_state = zero_attn_states
            hp_train = helper.ScheduledEmbeddingTrainingHelper(inputs=emb_dec_inps,
                    sequence_length=self.dec_lens, embedding=self.embedding,
                    sampling_probability=0.0, out_proj=(w, b))
            output_layer = layers_core.Dense(self.conf.out_layer_size,
                    use_bias=True) if self.conf.out_layer_size else None
            my_decoder = score_decoder.ScoreDecoder(cell=cell, helper=hp_train, out_proj=(w, b),
                    initial_state=dec_init_state, output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder, scope=scope,
                    maximum_iterations=self.conf.output_max_len, impute_finished=False)
        else:
            hp_infer = helper.GreedyEmbeddingHelper(embedding=self.embedding,
                    start_tokens=tf.ones(shape=[batch_size * self.beam_size], dtype=tf.int32),
                    end_token=EOS_ID, out_proj=(w, b))
            output_layer = layers_core.Dense(self.conf.out_layer_size,
                    use_bias=True) if self.conf.out_layer_size else None
            my_decoder = beam_decoder.BeamDecoder(cell=cell, helper=hp_infer, out_proj=(w, b),
                    initial_state=dec_init_state, beam_splits=self.conf.beam_splits,
                    max_res_num=self.conf.max_res_num, output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(decoder=my_decoder, scope=scope,
                    maximum_iterations=self.conf.output_max_len, impute_finished=True)

    if not for_deploy:
        outputs = cell_outs.rnn_output
        # Decoder outputs projected to logits
        L = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, int(w.shape[0])])
        outputs = tf.matmul(outputs, w) + b
        logits = tf.reshape(outputs, [-1, L, int(w.shape[1])])

        # branch 1 for debugging; doesn't have to be called
        with tf.name_scope("DebugOutputs") as scope:
            self.outputs = tf.argmax(logits, axis=2)
            self.outputs = tf.reshape(self.outputs, [-1, L])
            self.outputs = self.out_table.lookup(tf.cast(self.outputs, tf.int64))

        with tf.name_scope("Loss") as scope:
            tars = tf.slice(self.dec_inps, [0, 1], [-1, L])
            wgts = tf.cumsum(tf.one_hot(self.dec_lens, L), axis=1, reverse=True)
            #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
            self.loss = loss.sequence_loss(logits=logits, targets=tars, weights=wgts,
                    average_across_timesteps=False, average_across_batch=False)
            example_losses = tf.reduce_sum(self.loss, 1)
            batch_wgt = tf.reduce_sum(self.down_wgts)
            see_KLD = tf.reduce_sum(KLDs * self.down_wgts) / batch_wgt
            see_loss = tf.reduce_sum(example_losses / tf.cast(self.dec_lens, tf.float32)
                    * self.down_wgts) / batch_wgt
            # not averaged over length
            self.loss = tf.reduce_sum((example_losses + self.conf.kld_ratio * KLDs)
                    * self.down_wgts) / batch_wgt

        with tf.name_scope(self.model_kind):
            tf.summary.scalar("loss", see_loss)
            tf.summary.scalar("kld", see_KLD)

        graph_nodes = {
            "loss": self.loss,
            "inputs": {},
            "outputs": {},
            "debug_outputs": self.outputs
        }
    elif variants == "score":
        L = tf.shape(cell_outs.logprobs)[1]
        one_hot = tf.one_hot(tf.slice(self.dec_inps, [0, 1], [-1, L]),
                depth=self.conf.output_vocab_size, axis=-1, on_value=1.0, off_value=0.0)
        outputs = tf.reduce_sum(cell_outs.logprobs * one_hot, 2)
        outputs = tf.reduce_sum(outputs, axis=1)
        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens,
            "dec_inps:0": self.dec_str_inps,
            "dec_lens:0": self.dec_lens
        }
        graph_nodes = {
            "loss": None,
            "inputs": inputs,
            "outputs": {"logprobs": outputs},
            "visualize": None
        }
    else:
        L = tf.shape(cell_outs.beam_ends)[1]
        beam_symbols = cell_outs.beam_symbols
        beam_parents = cell_outs.beam_parents
        beam_ends = cell_outs.beam_ends
        beam_end_parents = cell_outs.beam_end_parents
        beam_end_probs = cell_outs.beam_end_probs
        alignments = cell_outs.alignments

        beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L])
        beam_end_parents = tf.reshape(tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
        beam_end_probs = tf.reshape(tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

        ## Creating tail_ids
        batch_size = tf.Print(batch_size, [batch_size], message="VAERNN2 batch")
        batch_offset = tf.expand_dims(tf.cumsum(tf.ones([batch_size, self.beam_size],
                dtype=tf.int32) * self.beam_size, axis=0, exclusive=True), 2)
        offset2 = tf.expand_dims(tf.cumsum(tf.ones([batch_size, self.beam_size * 2],
                dtype=tf.int32) * self.beam_size, axis=0, exclusive=True), 2)
        out_len = tf.shape(beam_symbols)[1]
        self.beam_symbol_strs = tf.reshape(self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
                [batch_size, self.beam_size, -1])
        self.beam_parents = tf.reshape(beam_parents, [batch_size, self.beam_size, -1]) - batch_offset
        self.beam_ends = tf.reshape(beam_ends, [batch_size, self.beam_size * 2, -1])
        self.beam_end_parents = tf.reshape(beam_end_parents,
                [batch_size, self.beam_size * 2, -1]) - offset2
        self.beam_end_probs = tf.reshape(beam_end_probs, [batch_size, self.beam_size * 2, -1])
        self.beam_attns = tf.reshape(alignments, [batch_size, self.beam_size, out_len, -1])

        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens
        }
        outputs = {
            "beam_symbols": self.beam_symbol_strs,
            "beam_parents": self.beam_parents,
            "beam_ends": self.beam_ends,
            "beam_end_parents": self.beam_end_parents,
            "beam_end_probs": self.beam_end_probs,
            "beam_attns": self.beam_attns
        }
        graph_nodes = {
            "loss": None,
            "inputs": inputs,
            "outputs": outputs,
            "visualize": {"z": z}
        }
    return graph_nodes
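# CreateVAE is defined elsewhere in this codebase; below is a minimal
# sketch of the standard Gaussian reparameterization it is assumed to
# perform (the mu/logvar layer names and the l2 term here are guesses,
# not the real API), with AUTO_REUSE so repeated per-layer calls under the
# same scope, as in the loop above, share variables.
import tensorflow as tf

def create_vae_sketch(states, latent_dim, name_scope="VAE"):
    with tf.variable_scope(name_scope, reuse=tf.AUTO_REUSE):
        h = tf.concat(states, 1)
        mu = tf.layers.dense(h, latent_dim, name="mu")
        logvar = tf.layers.dense(h, latent_dim, name="logvar")
        # reparameterization: z = mu + sigma * eps, eps ~ N(0, I)
        z = mu + tf.exp(0.5 * logvar) * tf.random_normal(tf.shape(mu))
        # KL(q(z|x) || N(0, I)), summed over latent dims, per example
        kld = -0.5 * tf.reduce_sum(1.0 + logvar - tf.square(mu) - tf.exp(logvar), 1)
        l2 = tf.nn.l2_loss(mu) + tf.nn.l2_loss(logvar)  # placeholder regularizer
    return z, kld, l2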
def build(self, for_deploy, variants=""):
    conf = self.conf
    name = self.name
    job_type = self.job_type
    dtype = self.dtype
    self.beam_size = 1 if (not for_deploy or variants == "score") else sum(self.conf.beam_splits)

    # Input maps
    self.in_table = lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
            default_value=UNK_ID, shared_name="in_table", name="in_table", checkpoint=True)

    self.enc_str_inps = tf.placeholder(tf.string, shape=(None, conf.input_max_len), name="enc_inps")
    self.enc_lens = tf.placeholder(tf.int32, shape=[None], name="enc_lens")
    self.tags = tf.placeholder(tf.int32, shape=[None, conf.tag_num], name="tags")
    self.down_wgts = tf.placeholder(tf.float32, shape=[None], name="down_wgts")

    # lookup
    self.enc_inps = self.in_table.lookup(self.enc_str_inps)
    #self.enc_inps = tf.Print(self.enc_inps, [self.enc_inps], message="enc_inps", summarize=100000)

    with variable_scope.variable_scope(self.model_kind, dtype=dtype) as scope:
        # Create encode graph and get attn states
        graphlg.info("Creating embeddings and embedding enc_inps.")
        with ops.device("/cpu:0"):
            self.embedding = variable_scope.get_variable("embedding",
                    [conf.output_vocab_size, conf.embedding_size],
                    initializer=tf.random_uniform_initializer(-0.08, 0.08))
            self.emb_enc_inps = embedding_lookup_unique(self.embedding, self.enc_inps)

        graphlg.info("Creating dynamic rnn...")
        if conf.bidirectional:
            with variable_scope.variable_scope("encoder", dtype=dtype) as scope:
                cell_fw = CreateMultiRNNCell(conf.cell_model, conf.num_units, conf.num_layers,
                        conf.output_keep_prob)
                cell_bw = CreateMultiRNNCell(conf.cell_model, conf.num_units, conf.num_layers,
                        conf.output_keep_prob)
                self.enc_outs, self.enc_states = bidirectional_dynamic_rnn(cell_fw=cell_fw,
                        cell_bw=cell_bw, inputs=self.emb_enc_inps, sequence_length=self.enc_lens,
                        dtype=dtype, parallel_iterations=16, scope=scope)
                fw_s, bw_s = self.enc_states
                self.enc_states = []
                for f, b in zip(fw_s, bw_s):
                    if isinstance(f, LSTMStateTuple):
                        self.enc_states.append(LSTMStateTuple(tf.concat([f.c, b.c], axis=1),
                                tf.concat([f.h, b.h], axis=1)))
                    else:
                        self.enc_states.append(tf.concat([f, b], 1))
                self.enc_outs = tf.concat([self.enc_outs[0], self.enc_outs[1]], axis=2)
                mem_size = 2 * conf.num_units
                enc_state_size = 2 * conf.num_units
        else:
            with variable_scope.variable_scope("encoder", dtype=dtype) as scope:
                cell = CreateMultiRNNCell(conf.cell_model, conf.num_units, conf.num_layers,
                        conf.output_keep_prob)
                self.enc_outs, self.enc_states = dynamic_rnn(cell=cell, inputs=self.emb_enc_inps,
                        sequence_length=self.enc_lens, parallel_iterations=16,
                        scope=scope, dtype=dtype)
                mem_size = conf.num_units
                enc_state_size = conf.num_units

        self.enc_outs = tf.expand_dims(self.enc_outs, -1)
        with variable_scope.variable_scope("cnn", dtype=dtype, reuse=None) as scope:
            feature_map = FeatureMatrix(conf.conv_conf, self.enc_outs, scope=scope, dtype=dtype)
        vec = tf.contrib.layers.flatten(feature_map)
        with variable_scope.variable_scope("fc", dtype=dtype, reuse=False) as scope:
            fc_out = FC(inputs=vec, h_size=conf.fc_h_size, o_size=conf.tag_num, act=relu)
        self.outputs = fc_out

    if not for_deploy:
        #self.tags = tf.Print(self.tags, [self.tags], message="tags", summarize=10000)
        loss = tf.losses.softmax_cross_entropy(self.tags, self.outputs)
        see_loss = loss
        tf.summary.scalar("loss", see_loss)
        self.summary_ops = tf.summary.merge_all()
        self.update = self.backprop(loss)
        self.train_outputs_map["loss"] = see_loss
        self.train_outputs_map["update"] = self.update
        self.fo_outputs_map["loss"] = see_loss
        self.debug_outputs_map["loss"] = see_loss
        # fixed: a stray trailing comma previously made this a one-element tuple
        self.debug_outputs_map["outputs"] = self.outputs
        self.debug_outputs_map["update"] = self.update
        #saver
        self.trainable_params.extend(tf.trainable_variables())
        self.saver = tf.train.Saver(max_to_keep=conf.max_to_keep)
    else:
        if variants == "":
            self.infer_outputs_map["tags"] = tf.nn.softmax(self.outputs)
        else:
            pass
        #saver
        self.trainable_params.extend(tf.trainable_variables())
        self.saver = tf.train.Saver(max_to_keep=conf.max_to_keep)

        # Exporter for serving
        self.model_exporter = exporter.Exporter(self.saver)
        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens
        }
        outputs = self.infer_outputs_map
        self.model_exporter.init(tf.get_default_graph().as_graph_def(),
                named_graph_signatures={
                    "inputs": exporter.generic_signature(inputs),
                    "outputs": exporter.generic_signature(outputs)
                })
        graphlg.info("Graph done")
        graphlg.info("")
    return
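# All of the models above only construct their in_table/out_table and assume
# they are populated elsewhere. A minimal standalone sketch of filling such
# tables from a vocab list (the vocab, the UNK_ID value and the session usage
# here are illustrative only):
import tensorflow as tf
from tensorflow.contrib import lookup

UNK_ID = 3  # assumption: matches the UNK_ID constant used throughout
vocab = ["_PAD", "_GO", "_EOS", "_UNK", "hello", "world"]
keys = tf.constant(vocab)
ids = tf.range(len(vocab), dtype=tf.int64)

in_table = lookup.MutableHashTable(tf.string, tf.int64, UNK_ID,
                                   shared_name="in_table", name="in_table")
out_table = lookup.MutableHashTable(tf.int64, tf.string, "_UNK",
                                    shared_name="out_table", name="out_table")
with tf.Session() as sess:
    # insert() returns ops; unseen keys fall back to the default values
    sess.run([in_table.insert(keys, ids), out_table.insert(ids, keys)])
    print(sess.run(in_table.lookup(tf.constant(["hello", "unseen"]))))  # [4 3]
    print(sess.run(out_table.lookup(tf.constant([4, 99], tf.int64))))   # [b'hello' b'_UNK']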