def cs_bahdanau_attention(key, context, hidden_size, depth, projected_align=False):
    """Bahdanau-style attention whose alignment is produced by a csoftmax block.

    Based on the papers:
        https://arxiv.org/abs/1409.0473
        "Neural Machine Translation by Jointly Learning to Align and Translate"
        https://andre-martins.github.io/docs/emnlp2017_final.pdf
        "Learning What's Easy: Fully Differentiable Neural Easy-First Taggers"

    The context goes through a dense layer and then a bidirectional LSTM. The
    BiLSTM is always built, because its outputs (concatenated with the
    projected key) are what the attention block scores; ``projected_align``
    only selects which tensor gets aligned.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality
            [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation. Must be even:
            each LSTM direction receives ``hidden_size // 2`` units.
        depth: Number of csoftmax usages
        projected_align: If true, the BiLSTM output is the tensor that gets
            aligned. If false, the densely projected context is aligned.

    Returns:
        output: Tensor reshaped to [batch, -1, depth * hidden_size] when
            ``projected_align`` is true and to [batch, -1, depth * token_size]
            otherwise.

    Raises:
        ValueError: If ``hidden_size`` is odd.
    """
    if hidden_size % 2:
        raise ValueError("hidden size must be dividable by two")

    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    flat_context = tf.reshape(context, shape=[-1, max_num_tokens, token_size])

    # dense projection of the context: [None, max_num_tokens, token_size]
    projected_context = tf.layers.dense(
        flat_context, token_size,
        kernel_initializer=xav(), name='projected_context')

    # dense projection of the key, repeated once per context token
    projected_key = tf.layers.dense(
        key, hidden_size,
        kernel_initializer=xav(), name='projected_key')
    key_column = tf.reshape(projected_key, shape=[-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    half_size = hidden_size // 2
    fw_cell = tf.nn.rnn_cell.LSTMCell(half_size)
    bw_cell = tf.nn.rnn_cell.LSTMCell(half_size)
    (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=fw_cell,
        cell_bw=bw_cell,
        inputs=projected_context,
        dtype=tf.float32)

    # bilstm_output: [-1, max_num_tokens, hidden_size]
    bilstm_output = tf.concat([fw_out, bw_out], -1)
    scoring_state = tf.concat([tiled_key, fw_out, bw_out], -1)

    # Only the aligned tensor (and therefore the output width) depends on the
    # flag; the attention call itself is shared.
    if projected_align:
        log.info("Using projected attention alignment")
        aligned_source, unit_size = bilstm_output, hidden_size
    else:
        log.info("Using without projected attention alignment")
        aligned_source, unit_size = projected_context, token_size

    aligned = csoftmax_attention.attention_bah_block(
        scoring_state, aligned_source, depth)
    return tf.reshape(aligned, shape=[n_batch, -1, depth * unit_size])
def __graph__():
    """Build the single-step LSTM action classifier and attach it to ``self``.

    ``self``, ``obs_size``, ``nb_hidden`` and ``action_size`` are not
    parameters; they are resolved from the enclosing scope.

    The default graph is reset first. A [1, obs_size] feature row is linearly
    projected to ``nb_hidden`` units and pushed through one LSTM cell step
    whose initial (c, h) state is fed through placeholders. The resulting c
    and h are concatenated and linearly projected to ``action_size`` logits.

    Attached to ``self``: ``loss``, ``prediction``, ``probs``, ``logits``,
    ``state``, ``train_op`` and the placeholders ``features_``,
    ``init_state_c_``, ``init_state_h_`` and ``action_``.
    """
    tf.reset_default_graph()

    # entry points
    features_ = tf.placeholder(tf.float32, [1, obs_size], name='input_features')
    init_state_c_, init_state_h_ = (
        tf.placeholder(tf.float32, [1, nb_hidden]) for _ in range(2)
    )
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')

    # input projection: brings the features to the LSTM's width
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden],
                         initializer=tf.constant_initializer(0.))
    projected_features = tf.matmul(features_, Wi) + bi

    # one LSTM step; the per-step output is not needed, only the new state
    lstm_f = tf.contrib.rnn.LSTMCell(num_units=nb_hidden, state_is_tuple=True)
    _, state = lstm_f(inputs=projected_features,
                      state=(init_state_c_, init_state_h_))

    # output projection: concatenated (c, h) -> action logits
    state_reshaped = tf.concat(axis=1, values=(state.c, state.h))
    Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size], initializer=xav())
    bo = tf.get_variable('bo', [action_size],
                         initializer=tf.constant_initializer(0.))
    logits = tf.matmul(state_reshaped, Wo) + bo

    probs = tf.squeeze(tf.nn.softmax(logits))
    # tf.arg_max and its `dimension` keyword are deprecated aliases
    prediction = tf.argmax(probs, axis=0)

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_)

    # default was 0.1
    train_op = tf.train.AdadeltaOptimizer(0.01).minimize(loss)

    # each output value
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.state = state
    self.train_op = train_op

    # attach placeholders
    self.features_ = features_
    self.init_state_c_ = init_state_c_
    self.init_state_h_ = init_state_h_
    self.action_ = action_
def __graph__():
    """Build the single-step LSTM action classifier with an optional mask.

    ``self``, ``obs_size``, ``nb_hidden`` and ``action_size`` are not
    parameters; they are resolved from the enclosing scope.

    The default graph is reset first. A [1, obs_size] feature row is linearly
    projected to ``nb_hidden`` units and pushed through one LSTM cell step
    whose initial (c, h) state is fed through placeholders. The resulting c
    and h are concatenated and linearly projected to ``action_size`` logits.
    When ``self.is_action_mask`` is truthy, an ``action_mask`` placeholder is
    created and multiplied element-wise into the softmax output; the product
    is not re-normalised.

    Attached to ``self``: ``loss``, ``prediction``, ``probs``, ``logits``,
    ``state``, ``train_op``, the placeholders ``features_``,
    ``init_state_c_``, ``init_state_h_``, ``action_`` and, only when masking
    is enabled, ``action_mask_``.
    """
    tf.reset_default_graph()

    features_ = tf.placeholder(tf.float32, [1, obs_size], name='input_features')
    init_state_c_, init_state_h_ = (
        tf.placeholder(tf.float32, [1, nb_hidden]) for _ in range(2)
    )
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')
    if self.is_action_mask:
        action_mask_ = tf.placeholder(tf.float32, [action_size],
                                      name='action_mask')

    # input projection
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden],
                         initializer=tf.constant_initializer(0.))
    projected_features = tf.matmul(features_, Wi) + bi

    # one LSTM step; the per-step output is not needed, only the new state
    lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)
    _, state = lstm_f(inputs=projected_features,
                      state=(init_state_c_, init_state_h_))

    # join the state tuple (c, h) along the feature axis: [1, 2 * nb_hidden]
    state_reshaped = tf.concat(axis=1, values=(state.c, state.h))

    # output projection - dense
    Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size], initializer=xav())
    bo = tf.get_variable('bo', [action_size],
                         initializer=tf.constant_initializer(0.))
    logits = tf.matmul(state_reshaped, Wo) + bo

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                          labels=action_)
    train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

    # probabilities: a single squeeze already drops every size-1 axis, so the
    # softmax is built once and the mask is applied on top when enabled
    probs = tf.squeeze(tf.nn.softmax(logits))
    if self.is_action_mask:
        probs = tf.multiply(probs, action_mask_)

    # prediction (tf.arg_max and its `dimension` keyword are deprecated)
    prediction = tf.argmax(probs, axis=0)

    # attach symbols to self
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.state = state
    self.train_op = train_op

    # attach placeholders
    self.features_ = features_
    self.init_state_c_ = init_state_c_
    self.init_state_h_ = init_state_h_
    self.action_ = action_
    if self.is_action_mask:
        self.action_mask_ = action_mask_
def bahdanau_attention(key, context, hidden_size, projected_align=False):
    """Additive (Bahdanau et al.) attention over a BiLSTM-encoded context.

    Based on the paper:
        https://arxiv.org/abs/1409.0473
        "Neural Machine Translation by Jointly Learning to Align and Translate"

    The bidirectional LSTM is always built: its outputs, concatenated with
    the projected key, are what the attention scores are computed from.
    ``projected_align`` only chooses which tensor the attention weights are
    applied to.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality
            [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation. Must be even:
            each LSTM direction receives ``hidden_size // 2`` units.
        projected_align: If true, the attention weights are applied to the
            BiLSTM output. If false, they are applied to the raw context.

    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when
            ``projected_align`` is true and to [batch, -1, token_size]
            otherwise.

    Raises:
        ValueError: If ``hidden_size`` is odd.
    """
    if hidden_size % 2:
        raise ValueError("hidden size must be dividable by two")

    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    flat_context = tf.reshape(context, shape=[-1, max_num_tokens, token_size])

    # projected key, repeated once per context token
    projected_key = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    key_column = tf.reshape(projected_key, shape=[-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    half_size = hidden_size // 2
    fw_cell = tf.nn.rnn_cell.LSTMCell(half_size)
    bw_cell = tf.nn.rnn_cell.LSTMCell(half_size)
    (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=fw_cell,
        cell_bw=bw_cell,
        inputs=flat_context,
        dtype=tf.float32)

    # bilstm_output: [-1, max_num_tokens, hidden_size]
    bilstm_output = tf.concat([fw_out, bw_out], -1)
    scoring_input = tf.concat([tiled_key, fw_out, bw_out], -1)

    hidden = tf.layers.dense(scoring_input, hidden_size,
                             use_bias=False, kernel_initializer=xav())
    score = tf.layers.dense(tf.tanh(hidden), units=1,
                            use_bias=False, kernel_initializer=xav())
    # normalise over the token axis
    attn = tf.nn.softmax(score, dim=1)

    if projected_align:
        log.info("Using projected attention alignment")
        values, value_size = bilstm_output, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, value_size = flat_context, token_size

    # [*, value_size, tokens] x [*, tokens, 1] -> weighted sum over tokens
    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, value_size])
def bahdanau_attention(key, context, hidden_size, projected_align=False):
    """Compute Bahdanau et al. (additive) attention of ``key`` over ``context``.

    Based on the paper:
        https://arxiv.org/abs/1409.0473
        "Neural Machine Translation by Jointly Learning to Align and Translate"

    Scores are always computed from a bidirectional LSTM run over the context
    (concatenated with the projected key); ``projected_align`` decides
    whether the resulting weights pool the BiLSTM output or the raw context.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality
            [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation; has to be even
            because it is split between the two LSTM directions.
        projected_align: If true, the BiLSTM output is pooled. If false, the
            raw context is pooled.

    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when
            ``projected_align`` is true and to [batch, -1, token_size]
            otherwise.

    Raises:
        ValueError: If ``hidden_size`` is odd.
    """
    if hidden_size % 2 != 0:
        raise ValueError("hidden size must be dividable by two")

    batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    tokens = tf.reshape(context, shape=[-1, max_num_tokens, token_size])

    # key projection: one [hidden_size] vector per item, tiled over the tokens
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    key_per_token = tf.tile(
        tf.reshape(key_proj, shape=[-1, 1, hidden_size]),
        [1, max_num_tokens, 1])

    units_per_direction = hidden_size // 2
    cell_fw = tf.nn.rnn_cell.LSTMCell(units_per_direction)
    cell_bw = tf.nn.rnn_cell.LSTMCell(units_per_direction)
    rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw, cell_bw=cell_bw, inputs=tokens, dtype=tf.float32)
    forward, backward = rnn_outputs

    # encoded: [-1, max_num_tokens, hidden_size]
    encoded = tf.concat([forward, backward], -1)
    key_and_encoded = tf.concat([key_per_token, forward, backward], -1)

    pre_activation = tf.layers.dense(
        key_and_encoded, hidden_size,
        use_bias=False, kernel_initializer=xav())
    score = tf.layers.dense(
        tf.tanh(pre_activation), units=1,
        use_bias=False, kernel_initializer=xav())
    attn = tf.nn.softmax(score, dim=1)

    if not projected_align:
        log.info("Using without projected attention alignment")
        pooled = tf.matmul(tf.transpose(tokens, [0, 2, 1]), attn)
        return tf.reshape(pooled, shape=[batch, -1, token_size])

    log.info("Using projected attention alignment")
    pooled = tf.matmul(tf.transpose(encoded, [0, 2, 1]), attn)
    return tf.reshape(pooled, shape=[batch, -1, hidden_size])
def light_bahdanau_attention(key, context, hidden_size, projected_align=False):
    """Additive (Bahdanau et al.) attention using dense projections only.

    Based on the paper:
        https://arxiv.org/abs/1409.0473
        "Neural Machine Translation by Jointly Learning to Align and Translate"

    The context is always passed through a dense layer, because the scores
    are computed from that projection concatenated with the projected key.
    ``projected_align`` only chooses which tensor the weights are applied to.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality
            [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation
        projected_align: If true, the attention weights pool the densely
            projected context. If false, they pool the raw context.

    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when
            ``projected_align`` is true and to [batch, -1, token_size]
            otherwise.
    """
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    flat_context = tf.reshape(context, shape=[-1, max_num_tokens, token_size])

    # projected key, repeated once per context token
    projected_key = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    key_column = tf.reshape(projected_key, shape=[-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    # projected_context: [None, max_num_tokens, hidden_size]
    projected_context = tf.layers.dense(
        flat_context, hidden_size, kernel_initializer=xav())

    scoring_input = tf.concat([projected_context, tiled_key], -1)
    hidden = tf.layers.dense(scoring_input, hidden_size,
                             use_bias=False, kernel_initializer=xav())
    score = tf.layers.dense(tf.tanh(hidden), units=1,
                            use_bias=False, kernel_initializer=xav())
    # normalise over the token axis
    attn = tf.nn.softmax(score, dim=1)

    if projected_align:
        log.info("Using projected attention alignment")
        values, value_size = projected_context, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, value_size = flat_context, token_size

    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, value_size])
def light_general_attention(key, context, hidden_size, projected_align=False):
    """Luong et al. attention with the "general" score, dense projections only.

    Based on the paper:
        https://arxiv.org/abs/1508.04025
        "Effective Approaches to Attention-based Neural Machine Translation"

    The score of every token is the product of its densely projected
    representation with the projected key. The dense projection of the
    context is therefore always built; ``projected_align`` only chooses which
    tensor the attention weights are applied to.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality
            [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation
        projected_align: If true, the attention weights pool the densely
            projected context. If false, they pool the raw context.

    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when
            ``projected_align`` is true and to [batch, -1, token_size]
            otherwise.
    """
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    flat_context = tf.reshape(context, shape=[-1, max_num_tokens, token_size])

    # projected key as a column vector: [-1, hidden_size, 1]
    projected_key = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    key_column = tf.reshape(projected_key, shape=[-1, hidden_size, 1])

    # projected context: [-1, max_num_tokens, hidden_size]
    projected_context = tf.layers.dense(
        flat_context, hidden_size, kernel_initializer=xav())

    # one score per token, normalised over the token axis
    score = tf.matmul(projected_context, key_column)
    attn = tf.nn.softmax(score, dim=1)

    if projected_align:
        log.info("Using projected attention alignment")
        values, value_size = projected_context, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, value_size = flat_context, token_size

    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, value_size])
def _build_body(self) -> Tuple[tf.Tensor, tf.Tensor]:
    """Build the network body: dense input projection, LSTM, dense output.

    Reads ``self._features``, ``self.dense_size``, ``self.attention_params``,
    ``self._dropout_keep_prob``, ``self.hidden_size``,
    ``self._utterance_mask``, ``self._initial_state``, ``self._batch_size``
    and ``self.action_size``. When ``self.attention_params`` is truthy, the
    result of ``self._build_attn_body()`` is concatenated to the projected
    features along the last axis.

    Returns:
        A pair ``(logits, state)``: the output of the dense layer named
        ``'logits'`` and the final state returned by ``tf.nn.dynamic_rnn``.
    """
    # input projection
    projected = tf.layers.dense(
        self._features,
        self.dense_size,
        kernel_regularizer=tf.nn.l2_loss,
        kernel_initializer=xav())
    if self.attention_params:
        attention_output = self._build_attn_body()
        projected = tf.concat([projected, attention_output], -1)

    rnn_input = tf_layers.variational_dropout(
        projected, keep_prob=self._dropout_keep_prob)

    # recurrent network unit; per-dialogue lengths come from the mask's row sums
    cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
    mask_sums = tf.reduce_sum(self._utterance_mask, axis=-1)
    sequence_lengths = tf.cast(mask_sums, tf.int32)

    # rnn_output: [batch_size, max_time, hidden_size]
    # final_state: tuple of two [batch_size, hidden_size]
    rnn_output, final_state = tf.nn.dynamic_rnn(
        cell,
        rnn_input,
        time_major=False,
        initial_state=self._initial_state,
        sequence_length=sequence_lengths)
    rnn_output = tf.reshape(
        rnn_output, (self._batch_size, -1, self.hidden_size))
    rnn_output = tf_layers.variational_dropout(
        rnn_output, keep_prob=self._dropout_keep_prob)

    # output projection
    logits = tf.layers.dense(
        rnn_output,
        self.action_size,
        kernel_regularizer=tf.nn.l2_loss,
        kernel_initializer=xav(),
        name='logits')
    return logits, final_state
def __graph__():
    """Build the masked single-step LSTM classifier with TensorBoard summaries.

    ``self``, ``obs_size``, ``nb_hidden``, ``action_size`` and
    ``variable_summarize`` are not parameters; they are resolved from the
    enclosing scope.

    The default graph is reset first. A [1, obs_size] feature row is linearly
    projected to ``nb_hidden`` units and pushed through one LSTM cell step
    whose initial (c, h) state is fed through placeholders. The resulting c
    and h are concatenated and projected to ``action_size`` logits, whose
    softmax is multiplied element-wise by the ``action_mask`` placeholder
    (the product is not re-normalised).

    Attached to ``self``: ``loss``, ``prediction``, ``probs``, ``logits``,
    ``state``, ``train_op``, the five placeholders, and ``merged`` (the
    result of ``tf.summary.merge_all()``).
    """
    tf.reset_default_graph()

    # entry points
    features_ = tf.placeholder(tf.float32, [1, obs_size], name='input_features')
    init_state_c_ = tf.placeholder(tf.float32, [1, nb_hidden])
    init_state_h_ = tf.placeholder(tf.float32, [1, nb_hidden])
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')
    # the action mask is live in this graph: it scales the softmax below
    action_mask_ = tf.placeholder(tf.float32, [action_size], name='action_mask')

    # input projection
    with tf.name_scope("input"):
        with tf.name_scope("weights"):
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
            variable_summarize(Wi)
        with tf.name_scope("biases"):
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))
            variable_summarize(bi)
        # add relu/tanh here if necessary
        with tf.name_scope("projected_features"):
            projected_features = tf.matmul(features_, Wi) + bi
            tf.summary.histogram('histogram', projected_features)

    # one LSTM step; only the new state is used afterwards
    lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)
    initial_state = (init_state_c_, init_state_h_)
    lstm_op, state = lstm_f(inputs=projected_features, state=initial_state)

    # join the state tuple (c, h) along the feature axis: [1, 2 * nb_hidden]
    state_reshaped = tf.concat(axis=1, values=(state.c, state.h))

    # output projection
    with tf.name_scope("outputs"):
        with tf.name_scope("weights"):
            Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                                 initializer=xav())
            variable_summarize(Wo)
        with tf.name_scope("biases"):
            bo = tf.get_variable('bo', [action_size],
                                 initializer=tf.constant_initializer(0.))
            variable_summarize(bo)
        # get logits
        with tf.name_scope("logits"):
            logits = tf.matmul(state_reshaped, Wo) + bo
            tf.summary.histogram('histogram', logits)

    # probabilities: softmax scaled element-wise by the action mask
    action_distribution = tf.squeeze(tf.nn.softmax(logits))
    probs = tf.multiply(action_distribution, action_mask_)

    # prediction
    prediction = tf.argmax(probs, dimension=0)

    # loss
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_, name="loss")
    tf.summary.scalar('loss', tf.squeeze(loss))

    # train op
    train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

    # attach symbols and placeholders to self
    attached = (
        ('loss', loss),
        ('prediction', prediction),
        ('probs', probs),
        ('logits', logits),
        ('state', state),
        ('train_op', train_op),
        ('features_', features_),
        ('init_state_c_', init_state_c_),
        ('init_state_h_', init_state_h_),
        ('action_', action_),
        ('action_mask_', action_mask_),
    )
    for attribute, tensor in attached:
        setattr(self, attribute, tensor)

    self.merged = tf.summary.merge_all()
def __graph__(self):
    """Build the dialogue-level LSTM on top of a VAE utterance encoder.

    A VAE model (class looked up in ``vae`` by ``self.config['vae_model']``)
    is created under the ``'vae'`` variable scope. Its latent codes
    ``self.vrae.z`` are reshaped to one code per dialogue turn, concatenated
    with the context features, the action mask and the previous action,
    linearly projected, and fed to an LSTM run over the turns. Every LSTM
    output is projected to ``self.action_size`` logits.

    Attached to ``self``: ``vrae``, ``hcn_loss``, ``vae_kl_loss``,
    ``vae_bow_loss``, ``vae_overall_loss``, ``loss``, ``lr``, ``prediction``,
    ``probs``, ``logits``, ``sequence_mask_``, ``train_op`` and the
    placeholders ``input_words``, ``input_contexts``, ``action_``,
    ``action_mask_`` and ``prev_action_``.

    Notes:
        * ``self.probs`` keeps its historical meaning: the logits multiplied
          element-wise by the action mask (no softmax is applied).
        * ``self.prediction`` is the argmax over the logits of the *allowed*
          actions only. Taking the argmax of ``logits * mask`` lets a
          masked-out action (score 0) win whenever every allowed action has
          a negative logit.
        * Sequence lengths are derived by counting non-zero entries of
          ``action_``, i.e. action id 0 is treated as padding.
    """
    # NOTE(review): the collapsed source does not show where the `with` body
    # ends; this assumes only the VAE construction is scoped - confirm.
    with tf.variable_scope('vae'):
        self.vrae = getattr(vae, self.config['vae_model'])(
            self.config, self.rev_vocab, self.sess)

    # entry points
    input_words = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.max_sequence_length],
        name='input_words')
    input_contexts = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.feature_vector_size],
        name='input_contexts')
    action_ = tf.placeholder(
        tf.int32, [None, self.max_input_length], name='ground_truth_action')
    prev_action_ = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.action_size],
        name='prev_action')
    action_mask_ = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.action_size],
        name='action_mask')
    action_seq_length = tf.count_nonzero(action_, -1)

    # one latent vector per dialogue turn
    vae_outputs = tf.reshape(
        self.vrae.z,
        shape=[-1, self.max_input_length, self.config['latent_size']])

    # input projection
    Wi_var = tf.get_variable(
        'Wi_var',
        shape=[
            self.feature_vector_size + self.config['latent_size']
            + 2 * self.action_size,
            self.config['dialog_level_embedding_size']
        ],
        dtype=tf.float32,
        initializer=xav())
    bi_var = tf.get_variable(
        'bi_var',
        shape=[self.config['dialog_level_embedding_size']],
        dtype=tf.float32,
        initializer=tf.constant_initializer(0.))

    turn_features_var = tf.concat([vae_outputs, input_contexts], axis=-1)
    all_inputs_var = tf.concat(
        [turn_features_var, action_mask_, prev_action_], axis=-1)

    # add relu/tanh here if necessary
    projected_features_var = \
        tf.tensordot(all_inputs_var, Wi_var, axes=1) + bi_var

    lstm_cell_var = tf.contrib.rnn.BasicLSTMCell(
        self.config['dialog_level_embedding_size'],
        state_is_tuple=True,
        name='dialog_encoder_var')
    # the final LSTM state is not used
    outputs_var, _ = tf.nn.dynamic_rnn(
        lstm_cell_var, projected_features_var, dtype=tf.float32)

    # output projection
    Wo = tf.get_variable(
        'Wo',
        shape=[self.config['dialog_level_embedding_size'], self.action_size],
        dtype=tf.float32,
        initializer=xav())
    bo = tf.get_variable(
        'bo',
        shape=[self.action_size],
        dtype=tf.float32,
        initializer=tf.constant_initializer(0.))
    logits = tf.tensordot(outputs_var, Wo, axes=1) + bo

    # masked logits (no softmax: the cross-entropy loss applies its own)
    probs = tf.multiply(logits, action_mask_)

    # prediction: push masked-out actions to the lowest representable score
    # so they can never beat an allowed action with a negative logit.
    # assumes action_mask_ is a 0/1 mask - TODO confirm
    lowest_score = tf.fill(tf.shape(logits), logits.dtype.min)
    allowed_logits = tf.where(action_mask_ > 0, logits, lowest_score)
    prediction = tf.argmax(allowed_logits, axis=-1)

    sequence_mask = tf.sequence_mask(
        action_seq_length, self.max_input_length, dtype=tf.float32)

    # loss (the VAE NLL term is not part of the objective here)
    self.hcn_loss = tf.contrib.seq2seq.sequence_loss(
        logits=logits,
        targets=action_,
        weights=sequence_mask,
        average_across_batch=False)
    self.vae_kl_loss = tf.reduce_mean(
        tf.reshape(
            self.vrae._kl_loss_fn(self.vrae.z_mean, self.vrae.z_logvar),
            shape=[-1, self.max_input_length]),
        axis=-1)
    self.vae_bow_loss = tf.reduce_mean(
        tf.reshape(
            self.vrae._bow_loss_fn(self.vrae.bow_logits,
                                   self.vrae.bow_targets),
            shape=[-1, self.max_input_length]),
        axis=-1)
    self.vae_overall_loss = self.vae_kl_loss + self.vae_bow_loss
    self.loss = self.hcn_loss + self.vae_overall_loss

    self.lr = tf.train.exponential_decay(
        self.config['learning_rate'],
        self.global_step,
        self.config.get('steps_before_decay', 0),
        self.config.get('learning_rate_decay', 1.0),
        staircase=True)

    # train op: gradients are clipped by global norm before being applied
    optimizer = getattr(tf.train, self.config['optimizer'])(self.lr)
    gradients, variables = zip(*optimizer.compute_gradients(self.loss))
    gradients, _ = tf.clip_by_global_norm(gradients, self.config['clip_norm'])
    train_op = optimizer.apply_gradients(
        zip(gradients, variables), global_step=self.global_step)

    # attach symbols to self
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.sequence_mask_ = sequence_mask
    self.train_op = train_op

    # attach placeholders
    self.input_words = input_words
    self.input_contexts = input_contexts
    self.action_ = action_
    self.action_mask_ = action_mask_
    self.prev_action_ = prev_action_
def __graph__(self):
    """Build the dialogue-level LSTM over averaged word embeddings and BoW.

    Each turn's word ids are embedded (rows looked up for id 0 are zeroed),
    averaged over the token axis, concatenated with the bag-of-words vector,
    the context features, the previous action and the action mask, linearly
    projected, and fed to an LSTM run over the turns. Every LSTM output is
    projected to ``self.action_size`` logits. An auxiliary head predicts the
    bag-of-words vector from the projected features.

    Attached to ``self``: ``lr``, ``loss``, ``prediction``, ``probs``,
    ``logits``, ``train_op`` and the placeholders ``input_words_``,
    ``context_features_``, ``bow_features_``, ``action_``, ``prev_action_``
    and ``action_mask_``.

    Notes:
        * The gradients handed to the optimizer are the *clipped* ones. The
          clipped tensors used to be computed and then discarded, so
          ``clip_norm`` had no effect.
        * ``self.probs`` keeps its historical meaning: the logits multiplied
          element-wise by the action mask (no softmax is applied).
        * ``self.prediction`` is the argmax over the logits of the *allowed*
          actions only. Taking the argmax of ``logits * mask`` lets a
          masked-out action (score 0) win whenever every allowed action has
          a negative logit.
        * When ``self.trainable_vars`` is non-empty, only variables whose
          name is listed there are updated.
    """
    # entry points
    input_words_ = tf.placeholder(
        tf.int32,
        [None, self.max_input_length, self.max_sequence_length],
        name='input_words')
    bow_features_ = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.config['vocabulary_size']],
        name='bow_features')
    context_features_ = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.feature_vector_size],
        name='input_features')
    action_ = tf.placeholder(
        tf.int32, [None, self.max_input_length], name='ground_truth_action')
    prev_action_ = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.action_size],
        name='prev_action')
    action_mask_ = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.action_size],
        name='action_mask')
    # action id 0 is treated as padding when deriving the dialogue lengths
    action_seq_length = tf.count_nonzero(action_, -1)

    embedding_matrix = tf.get_variable(
        'emb',
        initializer=tf.constant(get_w2v_model(self.vocab)),
        trainable=True)
    lookup_result = tf.nn.embedding_lookup(embedding_matrix, input_words_)
    # a [vocab, 1] column that is 0 for word id 0 and 1 elsewhere; looking it
    # up with the same ids zeroes the embeddings of id-0 tokens
    masked_emb = tf.concat(
        [tf.zeros([1, 1]),
         tf.ones([embedding_matrix.get_shape()[0] - 1, 1])],
        axis=0)
    mask_lookup_result = tf.nn.embedding_lookup(masked_emb, input_words_)
    lookup_result = tf.multiply(lookup_result, mask_lookup_result)
    utterance_embeddings = tf.reduce_mean(lookup_result, axis=2)

    all_input = tf.concat(
        [utterance_embeddings, bow_features_, context_features_,
         prev_action_, action_mask_],
        axis=-1)

    # input projection
    Wi = tf.get_variable(
        'Wi',
        shape=[
            self.feature_vector_size + self.config['w2v_embedding_size']
            + self.config['vocabulary_size'] + 2 * self.action_size,
            self.nb_hidden
        ],
        dtype=tf.float32,
        initializer=xav())
    bi = tf.get_variable(
        'bi',
        shape=[self.nb_hidden],
        dtype=tf.float32,
        initializer=tf.constant_initializer(0.))

    # add relu/tanh here if necessary
    projected_features = tf.tensordot(all_input, Wi, axes=1) + bi

    # auxiliary bag-of-words head
    Wbow = tf.get_variable(
        'Wbow',
        shape=[self.nb_hidden, len(self.vocab)],
        dtype=tf.float32,
        initializer=xav())
    bbow = tf.get_variable(
        'bbow',
        shape=[len(self.vocab)],
        dtype=tf.float32,
        initializer=tf.constant_initializer(0.))
    bow_logits = tf.tensordot(projected_features, Wbow, axes=1) + bbow

    lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.nb_hidden,
                                             state_is_tuple=True)
    # the final LSTM state is not used
    outputs, _ = tf.nn.dynamic_rnn(
        lstm_cell, projected_features, dtype=tf.float32)

    # output projection
    Wo = tf.get_variable(
        'Wo',
        shape=[self.nb_hidden, self.action_size],
        dtype=tf.float32,
        initializer=xav())
    bo = tf.get_variable(
        'bo',
        shape=[self.action_size],
        dtype=tf.float32,
        initializer=tf.constant_initializer(0.))

    # get logits
    logits = tf.tensordot(outputs, Wo, axes=1) + bo

    # masked logits (no softmax: the cross-entropy loss applies its own)
    probs = tf.multiply(logits, action_mask_)

    # prediction: push masked-out actions to the lowest representable score
    # so they can never beat an allowed action with a negative logit.
    # assumes action_mask_ is a 0/1 mask - TODO confirm
    lowest_score = tf.fill(tf.shape(logits), logits.dtype.min)
    allowed_logits = tf.where(action_mask_ > 0, logits, lowest_score)
    prediction = tf.argmax(allowed_logits, axis=-1)

    sequence_mask = tf.sequence_mask(
        action_seq_length, self.max_input_length, dtype=tf.float32)

    # loss; variables whose name starts with 'b' are excluded from L2
    l2_loss = tf.reduce_sum(
        [tf.nn.l2_loss(v)
         for v in tf.trainable_variables()
         if v.name[0] != 'b']) * self.config['l2_coef']
    hcn_loss = tf.contrib.seq2seq.sequence_loss(
        logits=logits,
        targets=action_,
        weights=sequence_mask,
        average_across_batch=False)
    bow_loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=bow_logits,
                                                labels=bow_features_),
        axis=-1)
    # NOTE(review): hcn_loss is presumably per-dialogue while bow_loss keeps
    # the turn axis; confirm that the broadcast in this sum is intended.
    loss = hcn_loss + l2_loss + bow_loss

    # train op
    self.lr = tf.train.exponential_decay(
        self.config['learning_rate'],
        self.global_step,
        self.config.get('steps_before_decay', 0),
        self.config.get('learning_rate_decay', 1.0),
        staircase=True)
    optimizer = getattr(tf.train, self.config['optimizer'])(self.lr)
    gradients, variables = zip(*optimizer.compute_gradients(loss))

    if len(self.trainable_vars):
        selected = [(gradient, variable)
                    for gradient, variable in zip(gradients, variables)
                    if variable.name in self.trainable_vars]
        gradients_filtered = [gradient for gradient, _ in selected]
        variables_filtered = [variable for _, variable in selected]
    else:
        gradients_filtered, variables_filtered = gradients, variables

    # apply the CLIPPED gradients, not the raw ones
    clipped_gradients, _ = tf.clip_by_global_norm(
        gradients_filtered, self.config['clip_norm'])
    train_op = optimizer.apply_gradients(
        zip(clipped_gradients, variables_filtered),
        global_step=self.global_step)

    # attach symbols to self
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.train_op = train_op

    # attach placeholders
    self.input_words_ = input_words_
    self.context_features_ = context_features_
    self.bow_features_ = bow_features_
    self.action_ = action_
    self.prev_action_ = prev_action_
    self.action_mask_ = action_mask_
def __graph__():
    """Build the single-step two-layer GRU action classifier.

    ``self``, ``obs_size``, ``nb_hidden`` and ``action_size`` are not
    parameters; they are resolved from the enclosing scope.

    The default graph is reset first. A [1, obs_size] feature row is linearly
    projected to ``nb_hidden`` units and pushed through one step of a stack
    of two GRU cells whose initial states are fed through placeholders. The
    two resulting layer states are concatenated and linearly projected to
    ``action_size`` logits. When ``self.is_action_mask`` is truthy, an
    ``action_mask`` placeholder is created and multiplied element-wise into
    the softmax output; the product is not re-normalised.

    Attached to ``self``: ``loss``, ``prediction``, ``probs``, ``logits``,
    ``state``, ``train_op``, the placeholders ``features_``,
    ``init_state_f_``, ``init_state_s_``, ``action_`` and, only when masking
    is enabled, ``action_mask_``.

    Note:
        Each layer of the stack is its own ``GRUCell`` instance. Passing the
        same cell object twice (``[cell] * 2``) makes both layers use a
        single set of weights, or fails outright on some TensorFlow
        releases. Checkpoints saved from the shared-cell graph hold only one
        set of GRU variables and will not restore into this graph.
    """
    tf.reset_default_graph()

    # entry points
    features_ = tf.placeholder(tf.float32, [1, obs_size], name='input_features')
    init_state_f_, init_state_s_ = (
        tf.placeholder(tf.float32, [1, nb_hidden]) for _ in range(2)
    )
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')
    if self.is_action_mask:
        action_mask_ = tf.placeholder(tf.float32, [action_size],
                                      name='action_mask')

    # input projection: brings the features to the GRU's width
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden],
                         initializer=tf.constant_initializer(0.))
    projected_features = tf.matmul(features_, Wi) + bi

    # two independent GRU layers (one cell object per layer)
    gru_layers = [tf.contrib.rnn.GRUCell(num_units=nb_hidden)
                  for _ in range(2)]
    stacked_gru = tf.contrib.rnn.MultiRNNCell(gru_layers)
    # the per-step output is not needed, only the new per-layer states
    _, state = stacked_gru(projected_features,
                           state=(init_state_f_, init_state_s_))

    # output projection: concatenated layer states -> action logits
    state_reshaped = tf.concat(axis=1, values=(state[0], state[1]))
    Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size], initializer=xav())
    bo = tf.get_variable('bo', [action_size],
                         initializer=tf.constant_initializer(0.))
    logits = tf.matmul(state_reshaped, Wo) + bo

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_)
    train_op = tf.train.AdamOptimizer(0.1).minimize(loss)

    # probabilities: a single squeeze already drops every size-1 axis, so the
    # softmax is built once and the mask is applied on top when enabled
    probs = tf.squeeze(tf.nn.softmax(logits))
    if self.is_action_mask:
        probs = tf.multiply(probs, action_mask_)

    # prediction (tf.arg_max and its `dimension` keyword are deprecated)
    prediction = tf.argmax(probs, axis=0)

    # each output value
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.state = state
    self.train_op = train_op

    # attach placeholders
    self.features_ = features_
    self.init_state_f_ = init_state_f_
    self.init_state_s_ = init_state_s_
    self.action_ = action_
    if self.is_action_mask:
        self.action_mask_ = action_mask_
def __graph__():
    """Build the single-step GRU action classifier with an optional mask.

    ``self``, ``obs_size``, ``nb_hidden`` and ``action_size`` are not
    parameters; they are resolved from the enclosing scope.

    The default graph is reset first. A [1, obs_size] feature row is linearly
    projected to ``nb_hidden`` units and pushed through one GRU cell step
    whose initial state is fed through a placeholder. The new state is
    linearly projected to ``action_size`` logits. When
    ``self.is_action_mask`` is truthy, an ``action_mask`` placeholder is
    created and multiplied element-wise into the softmax output; the product
    is not re-normalised.

    Attached to ``self``: ``loss``, ``prediction``, ``probs``, ``logits``,
    ``state``, ``train_op``, the placeholders ``features_``,
    ``init_state_h_``, ``action_`` and, only when masking is enabled,
    ``action_mask_``.
    """
    tf.reset_default_graph()

    # entry points
    features_ = tf.placeholder(tf.float32, [1, obs_size], name='input_features')
    init_state_h_ = tf.placeholder(tf.float32, [1, nb_hidden])
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')
    if self.is_action_mask:
        action_mask_ = tf.placeholder(tf.float32, [action_size],
                                      name='action_mask')

    # input projection: brings the features to the GRU's width
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden],
                         initializer=tf.constant_initializer(0.))
    projected_features = tf.matmul(features_, Wi) + bi

    # one GRU step; the per-step output is not needed, only the new state
    gru_f = tf.contrib.rnn.GRUCell(num_units=nb_hidden)
    _, state = gru_f(inputs=projected_features, state=init_state_h_)

    # output projection: a GRU has a single state tensor, used as is
    Wo = tf.get_variable('Wo', [nb_hidden, action_size], initializer=xav())
    bo = tf.get_variable('bo', [action_size],
                         initializer=tf.constant_initializer(0.))
    logits = tf.matmul(state, Wo) + bo

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_)
    train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

    # probabilities: a single squeeze already drops every size-1 axis, so the
    # softmax is built once and the mask is applied on top when enabled
    probs = tf.squeeze(tf.nn.softmax(logits))
    if self.is_action_mask:
        probs = tf.multiply(probs, action_mask_)

    # prediction (tf.arg_max and its `dimension` keyword are deprecated)
    prediction = tf.argmax(probs, axis=0)

    # each output value
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.state = state
    self.train_op = train_op

    # attach placeholders
    self.features_ = features_
    self.init_state_h_ = init_state_h_
    self.action_ = action_
    if self.is_action_mask:
        self.action_mask_ = action_mask_
def __graph__():
    """Create the single-step LSTM action classifier graph on ``self``.

    ``self``, ``obs_size``, ``nb_hidden`` and ``action_size`` are free
    variables resolved from the surrounding scope. The default graph is
    reset, the network is built, and its tensors, train op and placeholders
    are stored as attributes of ``self``.
    """
    tf.reset_default_graph()

    zeros = tf.constant_initializer(0.)

    # Entry points. The original notes mention sizes 365 / 128 / 7 for
    # obs_size / nb_hidden / action_size -- presumably one configuration.
    features_ = tf.placeholder(tf.float32, [1, obs_size],
                               name='input_features')
    init_state_c_ = tf.placeholder(tf.float32, [1, nb_hidden])
    init_state_h_ = tf.placeholder(tf.float32, [1, nb_hidden])
    # ground-truth label
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')
    # binary vector that is multiplied with the post-softmax distribution
    action_mask_ = tf.placeholder(tf.float32, [action_size],
                                  name='action_mask')

    # Input projection (add relu/tanh here if necessary).
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden], initializer=zeros)
    lstm_input = tf.matmul(features_, Wi) + bi

    # With state_is_tuple=True the cell accepts and returns the state as a
    # (c, h) 2-tuple; with False the two are concatenated along the column
    # axis, a form the original author notes is about to be deprecated.
    lstm_cell = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)
    _, state = lstm_cell(inputs=lstm_input,
                         state=(init_state_c_, init_state_h_))

    # Join c and h side by side: two (1, nb_hidden) -> (1, 2 * nb_hidden).
    joint_state = tf.concat(axis=1, values=(state.c, state.h))

    # Output projection.
    Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                         initializer=xav())
    bo = tf.get_variable('bo', [action_size], initializer=zeros)
    logits = tf.matmul(joint_state, Wo) + bo

    # Probabilities: softmax distribution multiplied element-wise by the
    # 0/1 action mask.
    softmax_out = tf.squeeze(tf.nn.softmax(logits))
    probs = tf.multiply(softmax_out, action_mask_)

    # Prediction: the action id with the highest (masked) probability.
    prediction = tf.arg_max(probs, dimension=0)

    # Loss: the sparse_ variant takes labels as a vector of class ids of
    # length batch_size; here that is a single action id.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_)

    # Train op.
    train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

    # Attach symbols to self.
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.state = state
    self.train_op = train_op

    # Attach placeholders.
    self.features_ = features_
    self.init_state_c_ = init_state_c_
    self.init_state_h_ = init_state_h_
    self.action_ = action_
    self.action_mask_ = action_mask_
def _build_body(self):
    """Build the network body: dense -> optional attention -> LSTM -> logits.

    Returns:
        A ``(logits, state)`` pair: the output of the final dense layer named
        ``'logits'`` and the final state returned by ``tf.nn.dynamic_rnn``.

    Raises:
        ValueError: if ``self.attn`` is truthy and ``self.attn.type`` is not
            one of the supported attention mechanism names.
    """
    # input projection
    net = tf.layers.dense(self._features, self.dense_size,
                          kernel_regularizer=tf.nn.l2_loss,
                          kernel_initializer=xav())

    if self.attn:
        supported = ('general', 'bahdanau', 'cs_general', 'cs_bahdanau',
                     'light_general', 'light_bahdanau')
        # only the constrained-softmax variants receive a depth argument
        with_depth = ('cs_general', 'cs_bahdanau')

        scope_name = "attention_mechanism/{}".format(self.attn.type)
        with tf.variable_scope(scope_name):
            if self.attn.type not in supported:
                raise ValueError("wrong value for attention mechanism type")
            # every supported type maps to `am.<type>_attention`
            attend = getattr(am, "{}_attention".format(self.attn.type))
            options = {'hidden_size': self.attn.hidden_size}
            if self.attn.type in with_depth:
                options['depth'] = self.attn.depth
            options['projected_align'] = self.attn.projected_align
            attended = attend(self._key, self._emb_context, **options)
        net = tf.concat([net, attended], -1)

    net = tf_layers.variational_dropout(net,
                                        keep_prob=self._dropout_keep_prob)

    # recurrent network unit
    cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
    # per-example sequence length = sum of the utterance mask over the last axis
    lengths = tf.to_int32(tf.reduce_sum(self._utterance_mask, axis=-1))
    rnn_out, final_state = tf.nn.dynamic_rnn(
        cell, net,
        time_major=False,
        initial_state=self._initial_state,
        sequence_length=lengths)
    rnn_out = tf.reshape(rnn_out,
                         (self._batch_size, -1, self.hidden_size))
    rnn_out = tf_layers.variational_dropout(
        rnn_out, keep_prob=self._dropout_keep_prob)

    # output projection
    logits = tf.layers.dense(rnn_out, self.action_size,
                             kernel_regularizer=tf.nn.l2_loss,
                             kernel_initializer=xav(),
                             name='logits')
    return logits, final_state
def __graph__():
    """Build the LSTM classifier with user- and system-side attention memories.

    ``self``, ``obs_size``, ``nb_hidden`` and ``action_size`` are free
    variables resolved from the surrounding scope. The default graph is
    reset, the network is built, and its tensors, train op and placeholders
    are stored as attributes of ``self``.

    The LSTM state, the attended user memory and the attended system memory
    are summed element-wise before the output projection.
    NOTE(review): that sum, together with the ``[300, action_size]`` output
    weights, appears to require ``2 * nb_hidden == 300`` -- confirm against
    the caller's configuration.
    """
    tf.reset_default_graph()

    zeros = tf.constant_initializer(0.)

    # Fed as the keep_prob argument of every tf.nn.dropout call below.
    self.dropout = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")

    # ---- entry points -----------------------------------------------------
    features_ = tf.placeholder(tf.float32, [1, obs_size],
                               name='input_features')
    init_state_c_ = tf.placeholder(tf.float32, [1, nb_hidden])
    init_state_h_ = tf.placeholder(tf.float32, [1, nb_hidden])
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')

    # ---- input projection (add relu/tanh here if necessary) ---------------
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden], initializer=zeros)
    lstm_input = tf.matmul(features_, Wi) + bi

    lstm_cell = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)
    _, state = lstm_cell(inputs=lstm_input,
                         state=(init_state_c_, init_state_h_))

    # Join c and h side by side: (1, 2 * nb_hidden), then apply dropout.
    joint_state = tf.concat(axis=1, values=(state.c, state.h))
    state_reshaped = tf.nn.dropout(joint_state, self.dropout)

    # ---- user utterance memory ---------------------------------------------
    prev_hidden_states = tf.placeholder(tf.float32, [None, nb_hidden * 2],
                                        name='prev_hidden_states')
    W_user = tf.get_variable('W_user', [nb_hidden * 2, nb_hidden * 2],
                             initializer=xav())
    # (N, 2h) x (2h, 2h) x (2h, 1) => (N, 1)
    projected_memory = tf.matmul(prev_hidden_states, W_user)
    user_attention_score = tf.matmul(projected_memory,
                                     tf.transpose(state_reshaped))
    # softmax over the N memory slots: (1, N)
    user_attention_weights = tf.nn.softmax(
        tf.transpose(user_attention_score))
    user_encodings = prev_hidden_states
    # (1, N) x (N, 2h) => (1, 2h)
    user_context = tf.matmul(user_attention_weights, user_encodings)
    user_weighted_sum = tf.nn.dropout(user_context, self.dropout)

    # ---- action attention variables -----------------------------------------
    action_projection = tf.placeholder(tf.float32, [300, action_size],
                                       name='action_projection')
    action_one_hot = tf.placeholder(tf.float32, [action_size],
                                    name='action_one_hot')
    one_hot_column = tf.expand_dims(action_one_hot, 1)
    # (300, 1): the current memory value
    action_encoding = tf.matmul(action_projection, one_hot_column)
    action_encoding = tf.nn.dropout(action_encoding, self.dropout)
    # (1, 300)
    action_encoding = tf.transpose(action_encoding)

    W_action = tf.get_variable('W_action', [300, nb_hidden * 2],
                               initializer=xav())
    # Original author's note: the scores below concern the *previous*
    # actions -- do not be misled by the variable names.
    state_column = tf.transpose(state_reshaped)  # (2h, 1)

    # previous system memory values
    prev_action_encodings = tf.placeholder(tf.float32, [None, 300],
                                           name='prev_actions')
    # (N, 300) x (300, 2h) x (2h, 1) => (N, 1)
    projected_actions = tf.matmul(
        tf.matmul(prev_action_encodings, W_action), state_column)
    # softmax over the N previous actions: (1, N)
    action_weights = tf.nn.softmax(tf.transpose(projected_actions))
    action_encodings = prev_action_encodings
    # (1, N) x (N, 300) => (1, 300)
    system_context = tf.matmul(action_weights, action_encodings)
    system_weighted_sum = tf.nn.dropout(system_context, self.dropout)

    # ---- feature fusion -------------------------------------------------------
    # The author planned three variants: 1. sum (active), 2. average
    # (tf.reduce_mean over the same list), 3. pooling (tf.reduce_max over
    # the same list). A further alternative kept in the original notes is
    # concatenating the three vectors along axis 1, applying dropout, and
    # projecting with Wo of shape [300 + 256 + 256, action_size].
    fused = [state_reshaped, user_weighted_sum, system_weighted_sum]
    sum_features = tf.reduce_sum(fused, 0)

    # ---- output projection ------------------------------------------------------
    Wo = tf.get_variable('Wo', [300, action_size], initializer=xav())
    bo = tf.get_variable('bo', [action_size], initializer=zeros)
    logits = tf.matmul(sum_features, Wo) + bo

    probs = tf.squeeze(tf.nn.softmax(logits))

    # prediction
    prediction = tf.arg_max(probs, dimension=0)

    # loss
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_)

    # learning rate: 0.25, multiplied by 0.8 every 200000 steps (staircase)
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(
        0.25, global_step, 200000, 0.8, staircase=True)

    # train op
    optimizer = tf.train.AdadeltaOptimizer(learning_rate)
    train_op = optimizer.minimize(loss, global_step=global_step)

    # ---- attach symbols to self ------------------------------------------------
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.state = state
    self.train_op = train_op

    # placeholders
    self.features_ = features_
    self.init_state_c_ = init_state_c_
    self.init_state_h_ = init_state_h_
    self.action_ = action_

    # user attention values
    self.prev_hidden_states = prev_hidden_states
    self.user_encodings = user_encodings

    # attention placeholders
    self.action_projection = action_projection
    self.action_one_hot = action_one_hot
    self.prev_action_encodings = prev_action_encodings

    # attention values
    self.action_encoding = action_encoding
    self.action_encodings = action_encodings
    self.projected_actions = projected_actions
    self.user_attention_weights = user_attention_weights
    self.action_weights = action_weights
def _build_body(self):
    """Construct dense projection, optional attention, LSTM and logits layer.

    Returns:
        ``(logits, state)``: the output of the dense layer named ``'logits'``
        applied to the LSTM outputs, and the final state returned by
        ``tf.nn.dynamic_rnn``.

    Raises:
        ValueError: if ``self.attn`` is truthy and ``self.attn.type`` names an
            unsupported attention mechanism.
    """
    # input projection
    hidden = tf.layers.dense(self._features, self.dense_size,
                             kernel_regularizer=tf.nn.l2_loss,
                             kernel_initializer=xav())

    if self.attn:
        # mechanisms called with (key, context, hidden_size, projected_align)
        plain_kinds = ('general', 'bahdanau',
                       'light_general', 'light_bahdanau')
        # constrained-softmax mechanisms, which additionally take `depth`
        cs_kinds = ('cs_general', 'cs_bahdanau')

        with tf.variable_scope(
                "attention_mechanism/{}".format(self.attn.type)):
            if self.attn.type in plain_kinds:
                attention_fn = getattr(
                    am, "{}_attention".format(self.attn.type))
                context_vec = attention_fn(
                    self._key, self._emb_context,
                    hidden_size=self.attn.hidden_size,
                    projected_align=self.attn.projected_align)
            elif self.attn.type in cs_kinds:
                attention_fn = getattr(
                    am, "{}_attention".format(self.attn.type))
                context_vec = attention_fn(
                    self._key, self._emb_context,
                    hidden_size=self.attn.hidden_size,
                    depth=self.attn.depth,
                    projected_align=self.attn.projected_align)
            else:
                raise ValueError("wrong value for attention mechanism type")
        hidden = tf.concat([hidden, context_vec], -1)

    hidden = tf_layers.variational_dropout(
        hidden, keep_prob=self._dropout_keep_prob)

    # recurrent network unit
    mask_sum = tf.reduce_sum(self._utterance_mask, axis=-1)
    seq_lengths = tf.to_int32(mask_sum)
    rnn_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
    rnn_outputs, rnn_state = tf.nn.dynamic_rnn(
        rnn_cell, hidden,
        initial_state=self._initial_state,
        sequence_length=seq_lengths)

    # output projection
    action_logits = tf.layers.dense(rnn_outputs, self.action_size,
                                    kernel_regularizer=tf.nn.l2_loss,
                                    kernel_initializer=xav(),
                                    name='logits')
    return action_logits, rnn_state
def __graph__(self):
    """Build the dialog-level LSTM classifier on top of a turn autoencoder.

    Each turn's autoencoder encoding is concatenated with the fed context
    features, linearly projected, and run through an LSTM over the turns of
    a dialog; every LSTM output is projected to ``self.action_size`` logits.
    The training loss is the mean of the sequence cross-entropy plus the
    autoencoder loss.

    Side effects:
        Sets ``hcn_loss``, ``ae_loss``, ``lr``, ``train_op``, ``ae``,
        ``loss``, ``prediction``, ``probs``, ``logits``, ``sequence_mask_``
        and the feed placeholders on ``self``.

    Notes:
        ``self.probs`` remains the element-wise product of the raw logits and
        the action mask (no softmax; the loss applies it internally).
        ``self.prediction`` is computed from logits whose masked entries are
        pushed to the float32 minimum, so a masked action can never win the
        argmax. Taking the argmax of ``logits * mask`` would let a masked
        action (value 0) beat valid actions whose logits are all negative.
    """
    with tf.variable_scope('ae'):
        ae = ae_ood.RNNAutoencoder(self.config, self.rev_vocab)

    # ---- entry points -----------------------------------------------------
    input_contexts = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.feature_vector_size],
        name='input_contexts')
    action_ = tf.placeholder(tf.int32, [None, self.max_input_length],
                             name='ground_truth_action')
    action_mask_ = tf.placeholder(
        tf.float32,
        [None, self.max_input_length, self.action_size],
        name='action_mask')

    # Dialog length = number of non-zero labels along the turn axis.
    # NOTE(review): this presumably relies on label 0 being used only for
    # padding -- confirm against the data pipeline.
    action_seq_length = tf.count_nonzero(action_, -1)

    # ---- turn features ------------------------------------------------------
    ae_turn_encodings = tf.concat(ae.enc_state, axis=-1)
    # NOTE(review): the reshape uses config['embedding_size'] * 2 while Wi
    # below uses nb_hidden * 2; the two agree only when those sizes are
    # equal -- confirm.
    turn_features = tf.concat(
        [
            tf.reshape(
                ae_turn_encodings,
                shape=[-1, self.max_input_length,
                       self.config['embedding_size'] * 2]),
            input_contexts,
        ],
        axis=-1)

    # ---- input projection (add relu/tanh here if necessary) ----------------
    Wi = tf.get_variable(
        'Wi',
        shape=[self.feature_vector_size + self.nb_hidden * 2,
               self.nb_hidden],
        dtype=tf.float32,
        initializer=xav())
    bi = tf.get_variable('bi', shape=[self.nb_hidden], dtype=tf.float32,
                         initializer=tf.constant_initializer(0.))
    projected_features = tf.tensordot(turn_features, Wi, axes=1) + bi

    lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.nb_hidden,
                                             state_is_tuple=True,
                                             name='dialog_encoder')
    outputs, _ = tf.nn.dynamic_rnn(lstm_cell, projected_features,
                                   dtype=tf.float32)

    # ---- output projection ----------------------------------------------------
    Wo = tf.get_variable('Wo', shape=[self.nb_hidden, self.action_size],
                         dtype=tf.float32, initializer=xav())
    bo = tf.get_variable('bo', shape=[self.action_size], dtype=tf.float32,
                         initializer=tf.constant_initializer(0.))
    logits = tf.tensordot(outputs, Wo, axes=1) + bo

    # Masked scores exposed to callers: logits multiplied element-wise by
    # the action mask. No softmax here; the cross-entropy handles it.
    probs = tf.multiply(logits, action_mask_)

    # Prediction: exclude masked actions instead of scoring them as 0.
    excluded = tf.fill(tf.shape(logits), tf.float32.min)
    selectable_logits = tf.where(tf.greater(action_mask_, 0.),
                                 logits, excluded)
    prediction = tf.argmax(selectable_logits, axis=-1)

    # Weight 1.0 for real turns, 0.0 for padding turns.
    sequence_mask = tf.sequence_mask(action_seq_length,
                                     self.max_input_length,
                                     dtype=tf.float32)

    # ---- loss --------------------------------------------------------------------
    self.hcn_loss = tf.contrib.seq2seq.sequence_loss(
        logits=logits,
        targets=action_,
        weights=sequence_mask,
        average_across_batch=True)
    self.ae_loss = ae.loss_op
    loss = tf.reduce_mean(self.hcn_loss + self.ae_loss)

    # ---- optimisation --------------------------------------------------------------
    self.lr = tf.train.exponential_decay(
        self.config['learning_rate'],
        self.global_step,
        self.config.get('steps_before_decay', 0),
        self.config.get('learning_rate_decay', 1.0),
        staircase=True)
    # the optimizer class is looked up on tf.train by its configured name
    optimizer = getattr(tf.train, self.config['optimizer'])(self.lr)
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    # Gradient clipping (tf.clip_by_global_norm with config['clip_norm'])
    # is intentionally not applied here.
    self.train_op = optimizer.apply_gradients(
        zip(gradients, variables), global_step=self.global_step)

    # ---- attach symbols to self ------------------------------------------------------
    self.ae = ae
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.sequence_mask_ = sequence_mask

    # ---- attach placeholders -----------------------------------------------------------
    self.input_contexts = input_contexts
    self.action_ = action_
    self.action_mask_ = action_mask_
def __graph__():
    """Build the single-step bidirectional-GRU action classifier on ``self``.

    ``self``, ``obs_size``, ``nb_hidden`` and ``action_size`` are free
    variables resolved from the surrounding scope. The default graph is
    reset, the network is built, and its tensors, train op and placeholders
    are stored as attributes of ``self``.

    NOTE(review): ``init_state_c_`` / ``init_state_h_`` are created and
    attached to ``self`` but are not passed to the RNN, which is called with
    only ``dtype=tf.float32``; no recurrent state is attached to ``self``
    either. Confirm whether that is intended.
    """
    tf.reset_default_graph()

    zero_init = tf.constant_initializer(0.)

    # ---- entry points -----------------------------------------------------
    features_ = tf.placeholder(tf.float32, [1, obs_size],
                               name='input_features')
    init_state_c_ = tf.placeholder(tf.float32, [1, nb_hidden])
    init_state_h_ = tf.placeholder(tf.float32, [1, nb_hidden])
    action_ = tf.placeholder(tf.int32, name='ground_truth_action')
    action_mask_ = tf.placeholder(tf.float32, [action_size],
                                  name='action_mask')

    # ---- input projection (trick to match the input dimension) ------------
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden], initializer=zero_init)
    rnn_input = tf.matmul(features_, Wi) + bi

    # ---- bidirectional GRU over a one-element sequence ---------------------
    forward_cell = tf.contrib.rnn.GRUCell(num_units=nb_hidden)
    backward_cell = tf.contrib.rnn.GRUCell(num_units=nb_hidden)
    rnn_result = tf.nn.static_bidirectional_rnn(
        forward_cell, backward_cell,
        inputs=[rnn_input],
        dtype=tf.float32)
    _, final_fw, final_bw = rnn_result

    # forward and backward final states side by side
    joint_state = tf.concat(axis=1, values=(final_fw, final_bw))

    # ---- output projection ---------------------------------------------------
    Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                         initializer=xav())
    bo = tf.get_variable('bo', [action_size], initializer=zero_init)
    logits = tf.matmul(joint_state, Wo) + bo

    # ---- training -------------------------------------------------------------
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_)
    train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

    # ---- probabilities and prediction -------------------------------------------
    squeezed_softmax = tf.squeeze(tf.nn.softmax(logits))
    if self.is_action_mask:
        probs = tf.multiply(squeezed_softmax, action_mask_)
    else:
        probs = tf.squeeze(squeezed_softmax)
    prediction = tf.arg_max(probs, dimension=0)

    # ---- each output value --------------------------------------------------------
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.train_op = train_op

    # ---- attach placeholders ---------------------------------------------------------
    self.features_ = features_
    self.init_state_c_ = init_state_c_
    self.init_state_h_ = init_state_h_
    self.action_ = action_
    if self.is_action_mask:
        self.action_mask_ = action_mask_
def __graph__():
    """Build the LSTM classifier with attention over system-action memories.

    ``self``, ``obs_size``, ``nb_hidden`` and ``num_class`` are free
    variables resolved from the surrounding scope. The default graph is
    reset, the network is built, and its tensors, train op and placeholders
    are stored as attributes of ``self``.

    The concatenated LSTM state (c, h) scores the current and all previous
    system encodings; their softmax-weighted sum is concatenated with the
    state and projected to ``num_class`` logits.

    The widths of ``W_system`` and ``Wo`` are derived from ``nb_hidden``
    instead of being hard-coded as 256 and 556. Those literals are only
    correct for ``nb_hidden == 128``, where the derived shapes are identical.
    """
    tf.reset_default_graph()

    system_dim = 300            # width of one system-feature vector
    state_dim = 2 * nb_hidden   # width of the concatenated (c, h) state

    # ---- entry points -----------------------------------------------------
    features_ = tf.placeholder(tf.float32, [1, obs_size],
                               name='input_features')
    init_state_c_, init_state_h_ = (
        tf.placeholder(tf.float32, [1, nb_hidden]) for _ in range(2))
    system_features = tf.placeholder(tf.float32, [system_dim],
                                     name='system_features')
    ground_label = tf.placeholder(tf.int32, name='ground_truth_action')

    # ---- input projection (add relu/tanh here if necessary) ----------------
    Wi = tf.get_variable('Wi', [obs_size, nb_hidden], initializer=xav())
    bi = tf.get_variable('bi', [nb_hidden],
                         initializer=tf.constant_initializer(0.))
    projected_features = tf.matmul(features_, Wi) + bi

    lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)
    _, state = lstm_f(inputs=projected_features,
                      state=(init_state_c_, init_state_h_))

    # Join c and h side by side: (1, state_dim).
    state_reshaped = tf.concat(axis=1, values=(state.c, state.h))
    # (state_dim, 1)
    transposed_hidden_state = tf.transpose(state_reshaped)

    # ---- attention over system memory ----------------------------------------
    # (1, system_dim): the current system memory
    system_encoding = tf.expand_dims(system_features, 0)
    W_system = tf.get_variable('W_system', [system_dim, state_dim],
                               initializer=xav())
    # (1, system_dim) x (system_dim, state_dim) x (state_dim, 1) => (1, 1)
    current_system_attention_score = tf.matmul(
        tf.matmul(system_encoding, W_system), transposed_hidden_state)

    # previous system memory values
    prev_system_encodings = tf.placeholder(tf.float32, [None, system_dim])
    # (N, 1)
    prev_system_attention_scores = tf.matmul(
        tf.matmul(prev_system_encodings, W_system),
        transposed_hidden_state)

    # (N + 1, 1): previous scores followed by the current one
    system_attention_scores = tf.concat(
        [prev_system_attention_scores, current_system_attention_score], 0)
    # softmax over the N + 1 memory slots: (1, N + 1)
    system_attention_weights = tf.nn.softmax(
        tf.transpose(system_attention_scores))

    # (N + 1, system_dim), in the same order as the scores
    system_encodings = tf.concat(
        [prev_system_encodings, system_encoding], 0)
    # (1, N + 1) x (N + 1, system_dim) => (1, system_dim)
    weighted_system_encodings = tf.matmul(system_attention_weights,
                                          system_encodings)

    # (1, state_dim + system_dim)
    concatenated_features = tf.concat(
        [state_reshaped, weighted_system_encodings], 1)

    # ---- output projection ------------------------------------------------------
    Wo = tf.get_variable('Wo', [state_dim + system_dim, num_class],
                         initializer=xav())
    bo = tf.get_variable('bo', [num_class],
                         initializer=tf.constant_initializer(0.))
    logits = tf.matmul(concatenated_features, Wo) + bo

    probs = tf.squeeze(tf.nn.softmax(logits))

    # prediction
    prediction = tf.arg_max(probs, dimension=0)

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=ground_label)
    train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

    # ---- attach symbols to self ----------------------------------------------------
    self.loss = loss
    self.prediction = prediction
    self.probs = probs
    self.logits = logits
    self.state = state
    self.train_op = train_op

    # ---- attach placeholders ---------------------------------------------------------
    self.features_ = features_
    self.system_features = system_features
    self.init_state_c_ = init_state_c_
    self.init_state_h_ = init_state_h_
    self.ground_label = ground_label
    self.prev_system_encodings = prev_system_encodings
    self.system_encodings = system_encodings