コード例 #1
0
def cs_bahdanau_attention(key, context, hidden_size, depth, projected_align=False):
    """ Bahdanau-style attention combined with the constrained softmax block. Based on the papers:
        https://arxiv.org/abs/1409.0473 "Neural Machine Translation by Jointly Learning to Align and Translate"
        https://andre-martins.github.io/docs/emnlp2017_final.pdf "Learning What's Easy: Fully Differentiable Neural Easy-First Taggers"

    The context is flattened to rank 3, sent through a dense layer and then
    through a bidirectional LSTM. The tiled key projection concatenated with
    both LSTM outputs is passed to ``csoftmax_attention.attention_bah_block``
    together with the tensor that has to be aligned.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation. Must be even,
            because it is split equally between the forward and backward LSTM cells.
        depth: Number of csoftmax usages
        projected_align: Selects the tensor handed to the attention block for alignment.
        If true, the bidirectional LSTM output (last dimension hidden_size) is aligned.
        If false, the dense-projected context (last dimension token_size) is aligned.
    Returns:
        output: Tensor reshaped to [batch, -1, depth * hidden_size] when projected_align
        is true, and to [batch, -1, depth * token_size] otherwise.
    Raises:
        ValueError: If hidden_size is not divisible by two.
    """
    if hidden_size % 2:
        raise ValueError("hidden size must be dividable by two")
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]

    # Collapse the two leading dimensions: [-1, max_num_tokens, token_size]
    flat_context = tf.reshape(context, [-1, max_num_tokens, token_size])
    # context_proj: [-1, max_num_tokens, token_size]
    context_proj = tf.layers.dense(flat_context, token_size,
                                   kernel_initializer=xav(),
                                   name='projected_context')

    # key_proj: [None, None, hidden_size]
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav(),
                               name='projected_key')
    # Repeat the key once per token so it can be concatenated token-wise.
    key_column = tf.reshape(key_proj, [-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    half_units = hidden_size // 2
    fw_cell = tf.nn.rnn_cell.LSTMCell(half_units)
    bw_cell = tf.nn.rnn_cell.LSTMCell(half_units)
    rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                     cell_bw=bw_cell,
                                                     inputs=context_proj,
                                                     dtype=tf.float32)
    fw_out, bw_out = rnn_outputs

    # encoded: [-1, max_num_tokens, hidden_size]
    encoded = tf.concat([fw_out, bw_out], -1)
    attn_input = tf.concat([tiled_key, fw_out, bw_out], -1)

    # Only the aligned tensor and its width depend on the flag.
    if projected_align:
        log.info("Using projected attention alignment")
        align_source, out_width = encoded, hidden_size
    else:
        log.info("Using without projected attention alignment")
        align_source, out_width = context_proj, token_size

    aligned = csoftmax_attention.attention_bah_block(
        attn_input, align_source, depth)
    return tf.reshape(aligned, shape=[n_batch, -1, depth * out_width])
コード例 #2
0
ファイル: lstm_model.py プロジェクト: ehwa009/dialogue_system
        def __graph__():
            """Build the single-step LSTM action classifier graph.

            Resets the default graph, creates the placeholders, an input
            projection, one LSTM cell step, an output projection over the
            concatenated (c, h) state, the loss and the train op, and then
            attaches every symbol and placeholder to ``self``.

            Reads ``obs_size``, ``nb_hidden``, ``action_size`` and ``self``
            from the enclosing scope.
            """
            tf.reset_default_graph()

            # entry points
            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')
            init_state_c_, init_state_h_ = (tf.placeholder(
                tf.float32, [1, nb_hidden]) for _ in range(2))
            action_ = tf.placeholder(tf.int32, name='ground_truth_action')

            # input projection - trick to match the input dimension to the
            # LSTM size: [1, obs_size] -> [1, nb_hidden]
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))

            projected_features = tf.matmul(features_, Wi) + bi

            # one LSTM step; only the new state is used downstream
            lstm_f = tf.contrib.rnn.LSTMCell(num_units=nb_hidden,
                                             state_is_tuple=True)
            _, state = lstm_f(inputs=projected_features,
                              state=(init_state_c_, init_state_h_))

            # output projection - trick to match the output dimension:
            # concatenate c and h ([1, nb_hidden] each) into [1, 2 * nb_hidden]
            state_reshaped = tf.concat(axis=1, values=(state.c, state.h))

            Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                                 initializer=xav())
            bo = tf.get_variable('bo', [action_size],
                                 initializer=tf.constant_initializer(0.))

            logits = tf.matmul(state_reshaped, Wo) + bo

            # probabilities over actions, with the batch dimension squeezed out
            probs = tf.squeeze(tf.nn.softmax(logits))

            # tf.arg_max is a deprecated alias; tf.argmax with `axis` is the
            # supported spelling and returns the same index.
            prediction = tf.argmax(probs, axis=0)

            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=action_)

            # default was 0.1
            train_op = tf.train.AdadeltaOptimizer(0.01).minimize(loss)

            # each output values
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholder
            self.features_ = features_
            self.init_state_c_ = init_state_c_
            self.init_state_h_ = init_state_h_
            self.action_ = action_
コード例 #3
0
ファイル: lstm.py プロジェクト: ehwa009/dialogue_system
        def __graph__():
            """Build the single-step LSTM action classifier graph with an optional action mask.

            Resets the default graph, creates the placeholders, an input
            projection, one LSTM cell step, an output projection over the
            concatenated (c, h) state, the loss and the train op, and then
            attaches every symbol and placeholder to ``self``. When
            ``self.is_action_mask`` is set, the probabilities are multiplied
            element-wise by the ``action_mask`` placeholder.

            Reads ``obs_size``, ``nb_hidden``, ``action_size`` and ``self``
            from the enclosing scope.
            """
            tf.reset_default_graph()

            features_ = tf.placeholder(tf.float32, [1, obs_size], name='input_features')
            init_state_c_, init_state_h_ = (tf.placeholder(tf.float32, [1, nb_hidden]) for _ in range(2))
            action_ = tf.placeholder(tf.int32, name='ground_truth_action')
            if self.is_action_mask:
                action_mask_ = tf.placeholder(tf.float32, [action_size], name='action_mask')

            # input projection: [1, obs_size] -> [1, nb_hidden]
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))

            projected_features = tf.matmul(features_, Wi) + bi

            # one LSTM step; only the new state is used downstream
            lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)
            _, state = lstm_f(inputs=projected_features, state=(init_state_c_, init_state_h_))

            # concatenate the LSTM state tuple: c and h ([1, nb_hidden] each)
            # -> [1, 2 * nb_hidden]
            state_reshaped = tf.concat(axis=1, values=(state.c, state.h))

            # output projection - dense
            Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                                 initializer=xav())
            bo = tf.get_variable('bo', [action_size],
                                 initializer=tf.constant_initializer(0.))

            logits = tf.matmul(state_reshaped, Wo) + bo
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=action_)
            train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

            # probabilities; a single squeeze already removes every size-1
            # dimension, so the second squeeze of the original was a no-op
            probs = tf.squeeze(tf.nn.softmax(logits))
            if self.is_action_mask:
                # element-wise multiply with the action mask (the result is
                # not renormalized)
                probs = tf.multiply(probs, action_mask_)

            # prediction; tf.arg_max is a deprecated alias of tf.argmax
            prediction = tf.argmax(probs, axis=0)

            # attach symbols to self
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholders
            self.features_ = features_
            self.init_state_c_ = init_state_c_
            self.init_state_h_ = init_state_h_
            self.action_ = action_
            if self.is_action_mask:
                self.action_mask_ = action_mask_
コード例 #4
0
def bahdanau_attention(key, context, hidden_size, projected_align=False):
    """ Additive (Bahdanau et al.) attention over a context encoded by a bidirectional LSTM. Based on the paper:
        https://arxiv.org/abs/1409.0473 "Neural Machine Translation by Jointly Learning to Align and Translate"

    The attention scores are always computed from the tiled key projection
    concatenated with the bidirectional LSTM outputs; ``projected_align`` only
    selects which tensor is averaged with the resulting weights.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation. Must be even,
            because it is split equally between the forward and backward LSTM cells.
        projected_align: Selects the tensor that is weighted by the attention.
        If true, the bidirectional LSTM output (last dimension hidden_size) is weighted.
        If false, the flattened input context (last dimension token_size) is weighted.
    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when projected_align is true,
        and to [batch, -1, token_size] otherwise.
    Raises:
        ValueError: If hidden_size is not divisible by two.
    """
    if hidden_size % 2:
        raise ValueError("hidden size must be dividable by two")
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    # Collapse the two leading dimensions: [-1, max_num_tokens, token_size]
    flat_context = tf.reshape(context, [-1, max_num_tokens, token_size])

    # key_proj: [None, None, hidden_size]
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    # Repeat the key once per token so it can be concatenated token-wise.
    key_column = tf.reshape(key_proj, [-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    half_units = hidden_size // 2
    fw_cell = tf.nn.rnn_cell.LSTMCell(half_units)
    bw_cell = tf.nn.rnn_cell.LSTMCell(half_units)
    rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                     cell_bw=bw_cell,
                                                     inputs=flat_context,
                                                     dtype=tf.float32)
    fw_out, bw_out = rnn_outputs

    # encoded: [-1, max_num_tokens, hidden_size]
    encoded = tf.concat([fw_out, bw_out], -1)
    scoring_input = tf.concat([tiled_key, fw_out, bw_out], -1)
    hidden = tf.layers.dense(scoring_input, hidden_size, use_bias=False,
                             kernel_initializer=xav())
    # score: one unnormalized value per token, [-1, max_num_tokens, 1]
    score = tf.layers.dense(tf.tanh(hidden), units=1, use_bias=False,
                            kernel_initializer=xav())

    # Normalize over the token axis.
    attn = tf.nn.softmax(score, dim=1)

    # Only the weighted tensor and its width depend on the flag.
    if projected_align:
        log.info("Using projected attention alignment")
        values, out_width = encoded, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, out_width = flat_context, token_size

    # [-1, width, tokens] x [-1, tokens, 1] -> attention-weighted sum per feature
    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, out_width])
コード例 #5
0
def bahdanau_attention(key, context, hidden_size, projected_align=False):
    """ Additive (Bahdanau et al.) attention over a context encoded by a bidirectional LSTM. Based on the paper:
        https://arxiv.org/abs/1409.0473 "Neural Machine Translation by Jointly Learning to Align and Translate"

    The attention scores are always computed from the tiled key projection
    concatenated with the bidirectional LSTM outputs; ``projected_align`` only
    selects which tensor is averaged with the resulting weights.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation. Must be even,
            because it is split equally between the forward and backward LSTM cells.
        projected_align: Selects the tensor that is weighted by the attention.
        If true, the bidirectional LSTM output (last dimension hidden_size) is weighted.
        If false, the flattened input context (last dimension token_size) is weighted.
    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when projected_align is true,
        and to [batch, -1, token_size] otherwise.
    Raises:
        ValueError: If hidden_size is not divisible by two.
    """
    if hidden_size % 2:
        raise ValueError("hidden size must be dividable by two")
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    # Collapse the two leading dimensions: [-1, max_num_tokens, token_size]
    flat_context = tf.reshape(context, [-1, max_num_tokens, token_size])

    # key_proj: [None, None, hidden_size]
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    # Repeat the key once per token so it can be concatenated token-wise.
    key_column = tf.reshape(key_proj, [-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    half_units = hidden_size // 2
    fw_cell = tf.nn.rnn_cell.LSTMCell(half_units)
    bw_cell = tf.nn.rnn_cell.LSTMCell(half_units)
    rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                     cell_bw=bw_cell,
                                                     inputs=flat_context,
                                                     dtype=tf.float32)
    fw_out, bw_out = rnn_outputs

    # encoded: [-1, max_num_tokens, hidden_size]
    encoded = tf.concat([fw_out, bw_out], -1)
    scoring_input = tf.concat([tiled_key, fw_out, bw_out], -1)
    hidden = tf.layers.dense(scoring_input, hidden_size, use_bias=False,
                             kernel_initializer=xav())
    # score: one unnormalized value per token, [-1, max_num_tokens, 1]
    score = tf.layers.dense(tf.tanh(hidden), units=1, use_bias=False,
                            kernel_initializer=xav())

    # Normalize over the token axis.
    attn = tf.nn.softmax(score, dim=1)

    # Only the weighted tensor and its width depend on the flag.
    if projected_align:
        log.info("Using projected attention alignment")
        values, out_width = encoded, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, out_width = flat_context, token_size

    # [-1, width, tokens] x [-1, tokens, 1] -> attention-weighted sum per feature
    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, out_width])
コード例 #6
0
def light_bahdanau_attention(key, context, hidden_size, projected_align=False):
    """ Additive (Bahdanau et al.) attention that uses a dense layer instead of a recurrent encoder. Based on the paper:
        https://arxiv.org/abs/1409.0473 "Neural Machine Translation by Jointly Learning to Align and Translate"

    The attention scores are always computed from the dense-projected context
    concatenated with the tiled key projection; ``projected_align`` only
    selects which tensor is averaged with the resulting weights.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation
        projected_align: Selects the tensor that is weighted by the attention.
        If true, the dense-projected context (last dimension hidden_size) is weighted.
        If false, the flattened input context (last dimension token_size) is weighted.
    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when projected_align is true,
        and to [batch, -1, token_size] otherwise.
    """
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    # Collapse the two leading dimensions: [-1, max_num_tokens, token_size]
    flat_context = tf.reshape(context, [-1, max_num_tokens, token_size])

    # key_proj: [None, None, hidden_size]
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    # Repeat the key once per token so it can be concatenated token-wise.
    key_column = tf.reshape(key_proj, [-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    # context_proj: [-1, max_num_tokens, hidden_size]
    context_proj = tf.layers.dense(flat_context, hidden_size,
                                   kernel_initializer=xav())
    scoring_input = tf.concat([context_proj, tiled_key], -1)

    hidden = tf.layers.dense(scoring_input, hidden_size, use_bias=False,
                             kernel_initializer=xav())
    # score: one unnormalized value per token, [-1, max_num_tokens, 1]
    score = tf.layers.dense(tf.tanh(hidden), units=1, use_bias=False,
                            kernel_initializer=xav())

    # Normalize over the token axis.
    attn = tf.nn.softmax(score, dim=1)

    # Only the weighted tensor and its width depend on the flag.
    if projected_align:
        log.info("Using projected attention alignment")
        values, out_width = context_proj, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, out_width = flat_context, token_size

    # [-1, width, tokens] x [-1, tokens, 1] -> attention-weighted sum per feature
    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, out_width])
コード例 #7
0
def light_bahdanau_attention(key, context, hidden_size, projected_align=False):
    """ Additive (Bahdanau et al.) attention that uses a dense layer instead of a recurrent encoder. Based on the paper:
        https://arxiv.org/abs/1409.0473 "Neural Machine Translation by Jointly Learning to Align and Translate"

    The attention scores are always computed from the dense-projected context
    concatenated with the tiled key projection; ``projected_align`` only
    selects which tensor is averaged with the resulting weights.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation
        projected_align: Selects the tensor that is weighted by the attention.
        If true, the dense-projected context (last dimension hidden_size) is weighted.
        If false, the flattened input context (last dimension token_size) is weighted.
    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when projected_align is true,
        and to [batch, -1, token_size] otherwise.
    """
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    # Collapse the two leading dimensions: [-1, max_num_tokens, token_size]
    flat_context = tf.reshape(context, [-1, max_num_tokens, token_size])

    # key_proj: [None, None, hidden_size]
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    # Repeat the key once per token so it can be concatenated token-wise.
    key_column = tf.reshape(key_proj, [-1, 1, hidden_size])
    tiled_key = tf.tile(key_column, [1, max_num_tokens, 1])

    # context_proj: [-1, max_num_tokens, hidden_size]
    context_proj = tf.layers.dense(flat_context, hidden_size,
                                   kernel_initializer=xav())
    scoring_input = tf.concat([context_proj, tiled_key], -1)

    hidden = tf.layers.dense(scoring_input, hidden_size, use_bias=False,
                             kernel_initializer=xav())
    # score: one unnormalized value per token, [-1, max_num_tokens, 1]
    score = tf.layers.dense(tf.tanh(hidden), units=1, use_bias=False,
                            kernel_initializer=xav())

    # Normalize over the token axis.
    attn = tf.nn.softmax(score, dim=1)

    # Only the weighted tensor and its width depend on the flag.
    if projected_align:
        log.info("Using projected attention alignment")
        values, out_width = context_proj, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, out_width = flat_context, token_size

    # [-1, width, tokens] x [-1, tokens, 1] -> attention-weighted sum per feature
    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, out_width])
コード例 #8
0
def light_general_attention(key, context, hidden_size, projected_align=False):
    """ Multiplicative (Luong et al., "general" score) attention built from dense projections. Based on the paper:
        https://arxiv.org/abs/1508.04025 "Effective Approaches to Attention-based Neural Machine Translation"

    The score of each token is the product of its dense-projected context
    vector with the dense-projected key; ``projected_align`` only selects
    which tensor is averaged with the resulting weights.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation
        projected_align: Selects the tensor that is weighted by the attention.
        If true, the dense-projected context (last dimension hidden_size) is weighted.
        If false, the flattened input context (last dimension token_size) is weighted.
    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when projected_align is true,
        and to [batch, -1, token_size] otherwise.
    """
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    # Collapse the two leading dimensions: [-1, max_num_tokens, token_size]
    flat_context = tf.reshape(context, [-1, max_num_tokens, token_size])

    # key_proj: [None, None, hidden_size]
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    # Column-vector form of the key so it can be right-multiplied: [-1, hidden_size, 1]
    key_column = tf.reshape(key_proj, [-1, hidden_size, 1])

    # context_proj: [-1, max_num_tokens, hidden_size]
    context_proj = tf.layers.dense(flat_context, hidden_size,
                                   kernel_initializer=xav())

    # score: [-1, max_num_tokens, 1], normalized over the token axis
    score = tf.matmul(context_proj, key_column)
    attn = tf.nn.softmax(score, dim=1)

    # Only the weighted tensor and its width depend on the flag.
    if projected_align:
        log.info("Using projected attention alignment")
        values, out_width = context_proj, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, out_width = flat_context, token_size

    # [-1, width, tokens] x [-1, tokens, 1] -> attention-weighted sum per feature
    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, out_width])
コード例 #9
0
def light_general_attention(key, context, hidden_size, projected_align=False):
    """ Multiplicative (Luong et al., "general" score) attention built from dense projections. Based on the paper:
        https://arxiv.org/abs/1508.04025 "Effective Approaches to Attention-based Neural Machine Translation"

    The score of each token is the product of its dense-projected context
    vector with the dense-projected key; ``projected_align`` only selects
    which tensor is averaged with the resulting weights.

    Args:
        key: A tensorflow tensor with dimensionality [None, None, key_size]
        context: A tensorflow tensor with dimensionality [None, None, max_num_tokens, token_size]
        hidden_size: Number of units in hidden representation
        projected_align: Selects the tensor that is weighted by the attention.
        If true, the dense-projected context (last dimension hidden_size) is weighted.
        If false, the flattened input context (last dimension token_size) is weighted.
    Returns:
        output: Tensor reshaped to [batch, -1, hidden_size] when projected_align is true,
        and to [batch, -1, token_size] otherwise.
    """
    n_batch = tf.shape(context)[0]
    max_num_tokens, token_size = context.get_shape().as_list()[-2:]
    # Collapse the two leading dimensions: [-1, max_num_tokens, token_size]
    flat_context = tf.reshape(context, [-1, max_num_tokens, token_size])

    # key_proj: [None, None, hidden_size]
    key_proj = tf.layers.dense(key, hidden_size, kernel_initializer=xav())
    # Column-vector form of the key so it can be right-multiplied: [-1, hidden_size, 1]
    key_column = tf.reshape(key_proj, [-1, hidden_size, 1])

    # context_proj: [-1, max_num_tokens, hidden_size]
    context_proj = tf.layers.dense(flat_context, hidden_size,
                                   kernel_initializer=xav())

    # score: [-1, max_num_tokens, 1], normalized over the token axis
    score = tf.matmul(context_proj, key_column)
    attn = tf.nn.softmax(score, dim=1)

    # Only the weighted tensor and its width depend on the flag.
    if projected_align:
        log.info("Using projected attention alignment")
        values, out_width = context_proj, hidden_size
    else:
        log.info("Using without projected attention alignment")
        values, out_width = flat_context, token_size

    # [-1, width, tokens] x [-1, tokens, 1] -> attention-weighted sum per feature
    weighted = tf.matmul(tf.transpose(values, [0, 2, 1]), attn)
    return tf.reshape(weighted, shape=[n_batch, -1, out_width])
コード例 #10
0
    def _build_body(self) -> Tuple[tf.Tensor, tf.Tensor]:
        """Build the network body: input projection, optional attention, LSTM, output projection.

        Reads ``self._features``, ``self.dense_size``, ``self.attention_params``,
        ``self._dropout_keep_prob``, ``self.hidden_size``, ``self._utterance_mask``,
        ``self._initial_state``, ``self._batch_size`` and ``self.action_size``.

        Returns:
            A pair of the action logits produced by the final dense layer
            (named 'logits') and the final state returned by ``tf.nn.dynamic_rnn``.
        """
        # input projection
        rnn_input = tf.layers.dense(self._features,
                                    self.dense_size,
                                    kernel_regularizer=tf.nn.l2_loss,
                                    kernel_initializer=xav())

        # optionally append the attention output along the feature axis
        if self.attention_params:
            rnn_input = tf.concat([rnn_input, self._build_attn_body()], -1)

        rnn_input = tf_layers.variational_dropout(
            rnn_input, keep_prob=self._dropout_keep_prob)

        # recurrent network unit
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
        # per-dialogue sequence lengths: sum of the utterance mask over its last axis
        mask_sum = tf.reduce_sum(self._utterance_mask, axis=-1)
        seq_lengths = tf.cast(mask_sum, tf.int32)

        # rnn_output: [batch_size, max_time, hidden_size]
        # final_state: tuple of two [batch_size, hidden_size]
        rnn_output, final_state = tf.nn.dynamic_rnn(
            cell,
            rnn_input,
            time_major=False,
            initial_state=self._initial_state,
            sequence_length=seq_lengths)

        rnn_output = tf.reshape(rnn_output,
                                (self._batch_size, -1, self.hidden_size))
        rnn_output = tf_layers.variational_dropout(
            rnn_output, keep_prob=self._dropout_keep_prob)

        # output projection
        logits = tf.layers.dense(rnn_output,
                                 self.action_size,
                                 kernel_regularizer=tf.nn.l2_loss,
                                 kernel_initializer=xav(),
                                 name='logits')
        return logits, final_state
コード例 #11
0
        def __graph__():
            """Build the single-step LSTM action classifier graph with TensorBoard summaries.

            Resets the default graph, creates the placeholders, an input
            projection, one LSTM cell step, an output projection over the
            concatenated (c, h) state, action-masked probabilities, the loss
            and the train op, and then attaches every symbol, placeholder and
            the merged summary op to ``self``.

            Reads ``obs_size``, ``nb_hidden``, ``action_size``,
            ``variable_summarize`` and ``self`` from the enclosing scope.
            """
            tf.reset_default_graph()

            # entry points
            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')
            init_state_c_, init_state_h_ = (tf.placeholder(
                tf.float32, [1, nb_hidden]) for _ in range(2))
            action_ = tf.placeholder(tf.int32, name='ground_truth_action')
            action_mask_ = tf.placeholder(tf.float32, [action_size],
                                          name='action_mask')

            # input projection: [1, obs_size] -> [1, nb_hidden]
            with tf.name_scope("input"):
                with tf.name_scope("weights"):
                    Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                         initializer=xav())
                    variable_summarize(Wi)
                with tf.name_scope("biases"):
                    bi = tf.get_variable(
                        'bi', [nb_hidden],
                        initializer=tf.constant_initializer(0.))
                    variable_summarize(bi)

                # add relu/tanh here if necessary
                with tf.name_scope("projected_features"):
                    projected_features = tf.matmul(features_, Wi) + bi
                    tf.summary.histogram('histogram', projected_features)

            lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)

            # one LSTM step; only the new state is used downstream
            _, state = lstm_f(inputs=projected_features,
                              state=(init_state_c_, init_state_h_))

            # concatenate the LSTM state tuple: c and h ([1, nb_hidden] each)
            # -> [1, 2 * nb_hidden]
            state_reshaped = tf.concat(axis=1, values=(state.c, state.h))

            # output projection
            with tf.name_scope("outputs"):
                with tf.name_scope("weights"):
                    Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                                         initializer=xav())
                    variable_summarize(Wo)
                with tf.name_scope("biases"):
                    bo = tf.get_variable(
                        'bo', [action_size],
                        initializer=tf.constant_initializer(0.))
                    variable_summarize(bo)
                # get logits
                with tf.name_scope("logits"):
                    logits = tf.matmul(state_reshaped, Wo) + bo
                    tf.summary.histogram('histogram', logits)

            # probabilities: element-wise multiply with the action mask
            # (the result is not renormalized)
            probs = tf.multiply(tf.squeeze(tf.nn.softmax(logits)),
                                action_mask_)

            # prediction; `axis` replaces the deprecated `dimension` keyword
            prediction = tf.argmax(probs, axis=0)

            # loss
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=action_, name="loss")

            tf.summary.scalar('loss', tf.squeeze(loss))

            # train op
            train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

            # attach symbols to self
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholders
            self.features_ = features_
            self.init_state_c_ = init_state_c_
            self.init_state_h_ = init_state_h_
            self.action_ = action_
            self.action_mask_ = action_mask_
            # merge every summary registered above into a single op
            self.merged = tf.summary.merge_all()
コード例 #12
0
    def __graph__(self):
        with tf.variable_scope('vae'):
            self.vrae = getattr(vae, self.config['vae_model'])(self.config,
                                                               self.rev_vocab,
                                                               self.sess)
        # entry points
        input_words = tf.placeholder(
            tf.float32,
            [None, self.max_input_length, self.max_sequence_length],
            name='input_words')
        input_contexts = tf.placeholder(
            tf.float32,
            [None, self.max_input_length, self.feature_vector_size],
            name='input_contexts')
        action_ = tf.placeholder(tf.int32, [None, self.max_input_length],
                                 name='ground_truth_action')
        prev_action_ = tf.placeholder(
            tf.float32, [None, self.max_input_length, self.action_size],
            name='prev_action')
        action_mask_ = tf.placeholder(
            tf.float32, [None, self.max_input_length, self.action_size],
            name='action_mask')
        action_seq_length = tf.count_nonzero(action_, -1)

        vae_outputs = tf.reshape(
            self.vrae.z,
            shape=[-1, self.max_input_length, self.config['latent_size']])

        # input projection
        Wi_var = tf.get_variable(
            'Wi_var',
            shape=[
                self.feature_vector_size + self.config['latent_size'] +
                2 * self.action_size,
                self.config['dialog_level_embedding_size']
            ],
            dtype=tf.float32,
            initializer=xav())
        bi_var = tf.get_variable(
            'bi_var',
            shape=[self.config['dialog_level_embedding_size']],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.))

        turn_features_var = tf.concat([vae_outputs, input_contexts], axis=-1)
        all_inputs_var = tf.concat(
            [turn_features_var, action_mask_, prev_action_], axis=-1)
        # add relu/tanh here if necessary
        projected_features_var = tf.tensordot(all_inputs_var, Wi_var,
                                              axes=1) + bi_var

        lstm_cell_var = tf.contrib.rnn.BasicLSTMCell(
            self.config['dialog_level_embedding_size'],
            state_is_tuple=True,
            name='dialog_encoder_var')
        outputs_var, states_var = tf.nn.dynamic_rnn(lstm_cell_var,
                                                    projected_features_var,
                                                    dtype=tf.float32)

        # output projection
        Wo = tf.get_variable('Wo',
                             shape=[
                                 self.config['dialog_level_embedding_size'],
                                 self.action_size
                             ],
                             dtype=tf.float32,
                             initializer=xav())
        bo = tf.get_variable('bo',
                             shape=[self.action_size],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.))
        logits = tf.tensordot(outputs_var, Wo, axes=1) + bo
        # probabilities
        #  normalization : elemwise multiply with action mask
        # not doing softmax because it's taken care of in the cross-entropy!
        probs = tf.multiply(logits, action_mask_)

        # prediction
        prediction = tf.argmax(probs, axis=-1)

        mask_fn = lambda l: tf.sequence_mask(
            l, self.max_input_length, dtype=tf.float32)
        sequence_mask = mask_fn(action_seq_length)
        # sequence_mask = tf.placeholder(tf.float32, [None, self.max_input_length], name='sequence_mask')
        # loss
        self.hcn_loss = tf.contrib.seq2seq.sequence_loss(
            logits=logits,
            targets=action_,
            weights=sequence_mask,
            average_across_batch=False)
        self.vae_kl_loss = tf.reduce_mean(tf.reshape(
            self.vrae._kl_loss_fn(self.vrae.z_mean, self.vrae.z_logvar),
            shape=[-1, self.max_input_length]),
                                          axis=-1)
        # self.vae_nll_loss = tf.reduce_mean(tf.reshape(self.vrae._nll_loss_fn(), shape=[-1, self.max_input_length]), axis=-1)
        self.vae_bow_loss = tf.reduce_mean(tf.reshape(
            self.vrae._bow_loss_fn(self.vrae.bow_logits,
                                   self.vrae.bow_targets),
            shape=[-1, self.max_input_length]),
                                           axis=-1)
        self.vae_overall_loss = self.vae_kl_loss + self.vae_bow_loss  # + self.vae_nll_loss
        self.loss = self.hcn_loss + self.vae_overall_loss

        self.lr = tf.train.exponential_decay(
            self.config['learning_rate'],
            self.global_step,
            self.config.get('steps_before_decay', 0),
            self.config.get('learning_rate_decay', 1.0),
            staircase=True)
        # train op
        optimizer = getattr(tf.train, self.config['optimizer'])(self.lr)
        gradients, variables = zip(*optimizer.compute_gradients(self.loss))
        gradients, _ = tf.clip_by_global_norm(gradients,
                                              self.config['clip_norm'])
        train_op = optimizer.apply_gradients(zip(gradients, variables),
                                             global_step=self.global_step)

        # attach symbols to self
        self.prediction = prediction
        self.probs = probs
        self.logits = logits
        self.sequence_mask_ = sequence_mask
        self.train_op = train_op

        # attach placeholders
        self.input_words = input_words
        self.input_contexts = input_contexts
        self.action_ = action_
        self.action_mask_ = action_mask_
        self.prev_action_ = prev_action_
コード例 #13
0
    def __graph__(self):
        """Build the dialog-level training graph and attach its tensors to ``self``.

        The graph embeds every utterance as the mean of its word embeddings
        (rows looked up with token id 0 are zeroed out first), concatenates
        that with bag-of-words features, context features, the previous action
        and the action mask, projects the result to ``self.nb_hidden`` units
        and feeds it through an LSTM over the dialog turns.  Two heads are
        trained jointly: action logits (sequence cross-entropy) and
        bag-of-words logits (sigmoid cross-entropy), plus an L2 penalty.

        Reads ``self.max_input_length``, ``self.max_sequence_length``,
        ``self.feature_vector_size``, ``self.action_size``, ``self.nb_hidden``,
        ``self.vocab``, ``self.config``, ``self.global_step`` and
        ``self.trainable_vars``.

        Sets ``self.lr``, ``self.loss``, ``self.prediction``, ``self.probs``,
        ``self.logits``, ``self.train_op`` and the placeholders
        ``self.input_words_``, ``self.context_features_``,
        ``self.bow_features_``, ``self.action_``, ``self.prev_action_`` and
        ``self.action_mask_``.
        """
        # entry points
        input_words_ = tf.placeholder(
            tf.int32,
            [None, self.max_input_length, self.max_sequence_length],
            name='input_words')
        bow_features_ = tf.placeholder(
            tf.float32,
            [None, self.max_input_length, self.config['vocabulary_size']],
            name='bow_features')
        context_features_ = tf.placeholder(
            tf.float32,
            [None, self.max_input_length, self.feature_vector_size],
            name='input_features')
        action_ = tf.placeholder(tf.int32, [None, self.max_input_length],
                                 name='ground_truth_action')
        prev_action_ = tf.placeholder(
            tf.float32, [None, self.max_input_length, self.action_size],
            name='prev_action')
        action_mask_ = tf.placeholder(
            tf.float32, [None, self.max_input_length, self.action_size],
            name='action_mask')
        # Number of non-zero action ids per dialog, used below as the sequence
        # length.  NOTE(review): this presumes action id 0 is padding and that
        # padding only appears at the end of a dialog - confirm with the
        # data pipeline.
        action_seq_length = tf.count_nonzero(action_, -1)

        embedding_matrix = tf.get_variable(
            'emb',
            initializer=tf.constant(get_w2v_model(self.vocab)),
            trainable=True)
        lookup_result = tf.nn.embedding_lookup(embedding_matrix, input_words_)
        # Column vector that is 0 for token id 0 and 1 for every other id, so
        # the multiplication below zeroes the embeddings of id-0 tokens.
        masked_emb = tf.concat(
            [tf.zeros([1, 1]),
             tf.ones([embedding_matrix.get_shape()[0] - 1, 1])],
            axis=0)
        mask_lookup_result = tf.nn.embedding_lookup(masked_emb, input_words_)
        lookup_result = tf.multiply(lookup_result, mask_lookup_result)
        # Average over the token axis (the zeroed rows still count in the mean).
        utterance_embeddings = tf.reduce_mean(lookup_result, axis=2)

        all_input = tf.concat(
            [utterance_embeddings, bow_features_, context_features_,
             prev_action_, action_mask_],
            axis=-1)
        # input projection
        Wi = tf.get_variable(
            'Wi',
            shape=[
                self.feature_vector_size + self.config['w2v_embedding_size'] +
                self.config['vocabulary_size'] + 2 * self.action_size,
                self.nb_hidden
            ],
            dtype=tf.float32,
            initializer=xav())
        bi = tf.get_variable('bi',
                             shape=[self.nb_hidden],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.))

        # add relu/tanh here if necessary
        projected_features = tf.tensordot(all_input, Wi, axes=1) + bi

        # auxiliary bag-of-words head on top of the projected turn features
        Wbow = tf.get_variable('Wbow',
                               shape=[self.nb_hidden, len(self.vocab)],
                               dtype=tf.float32,
                               initializer=xav())
        bbow = tf.get_variable('bbow',
                               shape=[len(self.vocab)],
                               dtype=tf.float32,
                               initializer=tf.constant_initializer(0.))
        bow_logits = tf.tensordot(projected_features, Wbow, axes=1) + bbow

        lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.nb_hidden,
                                                 state_is_tuple=True)
        outputs, states = tf.nn.dynamic_rnn(lstm_cell, projected_features,
                                            dtype=tf.float32)

        # output projection
        Wo = tf.get_variable('Wo',
                             shape=[self.nb_hidden, self.action_size],
                             dtype=tf.float32,
                             initializer=xav())
        bo = tf.get_variable('bo',
                             shape=[self.action_size],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.))
        # get logits
        logits = tf.tensordot(outputs, Wo, axes=1) + bo
        # "probabilities": logits multiplied element-wise with the action mask.
        # No softmax here because the cross-entropy loss applies it itself.
        probs = tf.multiply(logits, action_mask_)

        # prediction
        prediction = tf.argmax(probs, axis=-1)

        # Per-turn weights for the action loss, derived from the dialog length.
        sequence_mask = tf.sequence_mask(action_seq_length,
                                         self.max_input_length,
                                         dtype=tf.float32)

        # loss
        # L2 penalty over every trainable variable whose name does not start
        # with 'b' (this skips 'bi', 'bbow' and 'bo').
        l2_loss = tf.reduce_sum([tf.nn.l2_loss(v)
                                 for v in tf.trainable_variables()
                                 if v.name[0] != 'b']) * self.config['l2_coef']
        hcn_loss = tf.contrib.seq2seq.sequence_loss(logits=logits,
                                                    targets=action_,
                                                    weights=sequence_mask,
                                                    average_across_batch=False)
        bow_loss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=bow_logits,
                                                    labels=bow_features_),
            axis=-1)

        # NOTE(review): hcn_loss and bow_loss appear to have different ranks
        # (per-dialog vs. per-turn); the sum relies on broadcasting - confirm
        # this is intended for the batch sizes used in training.
        loss = hcn_loss + l2_loss + bow_loss

        # train op
        self.lr = tf.train.exponential_decay(
            self.config['learning_rate'],
            self.global_step,
            self.config.get('steps_before_decay', 0),
            self.config.get('learning_rate_decay', 1.0),
            staircase=True)
        optimizer = getattr(tf.train, self.config['optimizer'])(self.lr)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        # When self.trainable_vars is non-empty, only variables whose name is
        # listed there are updated; otherwise every variable is trained.
        gradients_filtered, variables_filtered = [], []
        if len(self.trainable_vars):
            for gradient, variable in zip(gradients, variables):
                if variable.name in self.trainable_vars:
                    gradients_filtered.append(gradient)
                    variables_filtered.append(variable)
        else:
            gradients_filtered, variables_filtered = gradients, variables
        # Apply the *clipped* gradients.  Previously the clipped values were
        # computed but the unclipped ones were passed to apply_gradients, so
        # config['clip_norm'] had no effect.
        clipped_gradients, _ = tf.clip_by_global_norm(gradients_filtered,
                                                      self.config['clip_norm'])
        train_op = optimizer.apply_gradients(
            zip(clipped_gradients, variables_filtered),
            global_step=self.global_step)

        # attach symbols to self
        self.loss = loss
        self.prediction = prediction
        self.probs = probs
        self.logits = logits
        self.train_op = train_op

        # attach placeholders
        self.input_words_ = input_words_
        self.context_features_ = context_features_
        self.bow_features_ = bow_features_
        self.action_ = action_
        self.prev_action_ = prev_action_
        self.action_mask_ = action_mask_
コード例 #14
0
        def __graph__():
            """Build the single-step two-layer GRU policy graph and attach it to ``self``.

            Resets the default TensorFlow graph, then creates placeholders for
            one feature vector of size ``obs_size``, the two GRU layer states
            (each ``[1, nb_hidden]``), the ground-truth action and, when
            ``self.is_action_mask`` is set, an action mask of size
            ``action_size``.  The features are linearly projected to
            ``nb_hidden``, run through one step of a two-layer GRU, and the
            concatenated layer states are projected to action logits.

            Sets ``self.loss``, ``self.prediction``, ``self.probs``,
            ``self.logits``, ``self.state``, ``self.train_op`` and the
            placeholders ``self.features_``, ``self.init_state_f_``,
            ``self.init_state_s_``, ``self.action_`` (and ``self.action_mask_``
            when masking is enabled).
            """
            tf.reset_default_graph()

            # entry points
            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')
            init_state_f_, init_state_s_ = (tf.placeholder(
                tf.float32, [1, nb_hidden]) for _ in range(2))
            action_ = tf.placeholder(tf.int32, name='ground_truth_action')
            if self.is_action_mask:
                action_mask_ = tf.placeholder(tf.float32, [action_size],
                                              name='action_mask')

            # input projection - a trick to match the input dimension
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))

            projected_features = tf.matmul(features_, Wi) + bi

            # Each layer gets its own cell object.  Passing the same GRUCell
            # instance twice ([cell] * 2) makes both layers share one set of
            # weights, or raises, depending on the TF 1.x release.
            # NOTE(review): this changes the set of variables, so checkpoints
            # saved with the shared-cell graph will not restore as-is.
            gru_layers = [tf.contrib.rnn.GRUCell(num_units=nb_hidden)
                          for _ in range(2)]
            stacked_gru = tf.contrib.rnn.MultiRNNCell(gru_layers)

            # Only the new per-layer states are used; the step output is not.
            _, state = stacked_gru(projected_features,
                                   state=(init_state_f_, init_state_s_))

            # output projection - a trick to match the output dimension:
            # concatenate both layer states into one [1, 2 * nb_hidden] row
            state_reshaped = tf.concat(axis=1, values=(state[0], state[1]))

            Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                                 initializer=xav())
            bo = tf.get_variable('bo', [action_size],
                                 initializer=tf.constant_initializer(0.))

            logits = tf.matmul(state_reshaped, Wo) + bo
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=action_)
            train_op = tf.train.AdamOptimizer(0.1).minimize(loss)

            # probabilities, multiplied element-wise with the action mask
            # when masking is enabled
            probs = tf.squeeze(tf.nn.softmax(logits))
            if self.is_action_mask:
                probs = tf.multiply(probs, action_mask_)

            # prediction (tf.argmax replaces the deprecated tf.arg_max)
            prediction = tf.argmax(probs, axis=0)

            # each output values
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholder
            self.features_ = features_
            self.init_state_f_ = init_state_f_
            self.init_state_s_ = init_state_s_
            self.action_ = action_
            if self.is_action_mask:
                self.action_mask_ = action_mask_
コード例 #15
0
        def __graph__():
            """Build the single-step GRU policy graph and attach it to ``self``.

            Resets the default TensorFlow graph, then creates placeholders for
            one feature vector of size ``obs_size``, the GRU state
            (``[1, nb_hidden]``), the ground-truth action and, when
            ``self.is_action_mask`` is set, an action mask of size
            ``action_size``.  The features are linearly projected to
            ``nb_hidden``, run through one GRU step, and the new state is
            projected to action logits.

            Sets ``self.loss``, ``self.prediction``, ``self.probs``,
            ``self.logits``, ``self.state``, ``self.train_op`` and the
            placeholders ``self.features_``, ``self.init_state_h_``,
            ``self.action_`` (and ``self.action_mask_`` when masking is
            enabled).
            """
            tf.reset_default_graph()

            # entry points
            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')
            init_state_h_ = tf.placeholder(tf.float32, [1, nb_hidden])
            action_ = tf.placeholder(tf.int32, name='ground_truth_action')
            if self.is_action_mask:
                action_mask_ = tf.placeholder(tf.float32, [action_size],
                                              name='action_mask')

            # input projection - a trick to match the input dimension
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))

            projected_features = tf.matmul(features_, Wi) + bi

            gru_f = tf.contrib.rnn.GRUCell(num_units=nb_hidden)
            # Only the new state is used below; the step output is discarded.
            _, state = gru_f(inputs=projected_features,
                             state=init_state_h_)

            # output projection - a trick to match the output dimension
            Wo = tf.get_variable('Wo', [nb_hidden, action_size],
                                 initializer=xav())
            bo = tf.get_variable('bo', [action_size],
                                 initializer=tf.constant_initializer(0.))

            logits = tf.matmul(state, Wo) + bo
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=action_)
            train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

            # probabilities
            #  normalization : elemwise multiply with action mask
            probs = tf.squeeze(tf.nn.softmax(logits))
            if self.is_action_mask:
                probs = tf.multiply(probs, action_mask_)

            # prediction (tf.argmax replaces the deprecated tf.arg_max)
            prediction = tf.argmax(probs, axis=0)

            # each output values
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholder
            self.features_ = features_
            self.init_state_h_ = init_state_h_
            self.action_ = action_
            if self.is_action_mask:
                self.action_mask_ = action_mask_
コード例 #16
0
        def __graph__():
            """Build the single-step LSTM policy graph and attach it to ``self``.

            Resets the default TensorFlow graph, then creates placeholders for
            one feature vector of size ``obs_size``, the LSTM cell and hidden
            states (each ``[1, nb_hidden]``), the ground-truth action and an
            action mask of size ``action_size``.  The features are linearly
            projected to ``nb_hidden``, run through one LSTM step, and the
            concatenated ``(c, h)`` state is projected to action logits.

            Sets ``self.loss``, ``self.prediction``, ``self.probs``,
            ``self.logits``, ``self.state``, ``self.train_op`` and the
            placeholders ``self.features_``, ``self.init_state_c_``,
            ``self.init_state_h_``, ``self.action_`` and ``self.action_mask_``.

            The concrete sizes in the trailing comments (365, 128, 256, 7) are
            example values carried over from the original annotations.
            """
            tf.reset_default_graph()

            # entry points
            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')  # e.g. 365
            init_state_c_, init_state_h_ = (tf.placeholder(
                tf.float32, [1, nb_hidden]) for _ in range(2))  # e.g. 128
            action_ = tf.placeholder(tf.int32,
                                     name='ground_truth_action')  # label
            # binary mask, one entry per action (e.g. 7); multiplied with the
            # softmax distribution below
            action_mask_ = tf.placeholder(
                tf.float32, [action_size],
                name='action_mask')

            # input projection
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())  # e.g. [365, 128]
            bi = tf.get_variable(
                'bi', [nb_hidden],
                initializer=tf.constant_initializer(0.))  # e.g. 128

            # add relu/tanh here if necessary
            projected_features = tf.matmul(features_, Wi) + bi  # e.g. 128

            # With state_is_tuple=True the cell accepts and returns its state
            # as a 2-tuple (c_state, m_state); with False the two are
            # concatenated along the column axis, a form that is being
            # deprecated.
            lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden,
                                             state_is_tuple=True)  # e.g. 128

            # Only the new state is used below; the step output is discarded.
            _, state = lstm_f(inputs=projected_features,
                              state=(init_state_c_, init_state_h_))

            # reshape LSTM's state tuple (2,128) -> (1,256)  (joint h and c)
            state_reshaped = tf.concat(axis=1,
                                       values=(state.c, state.h))  # e.g. 256

            # output projection
            Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                                 initializer=xav())  # e.g. [256, 7]
            bo = tf.get_variable(
                'bo', [action_size],
                initializer=tf.constant_initializer(0.))  # e.g. 7

            # get logits
            logits = tf.matmul(state_reshaped, Wo) + bo  # e.g. 7
            # probabilities
            #  normalization : elemwise multiply of the softmax distribution
            #  with the binary action mask
            probs = tf.multiply(tf.squeeze(tf.nn.softmax(logits)),
                                action_mask_)

            # prediction: the action id with the highest masked probability
            # (tf.argmax replaces the deprecated tf.arg_max)
            prediction = tf.argmax(probs, axis=0)

            # loss: with the sparse_ variant, labels is a 1-D vector of length
            # batch_size; here that length is 1 ([action_id]), the target class
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=action_)

            # train op
            train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

            # attach symbols to self
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholders
            self.features_ = features_
            self.init_state_c_ = init_state_c_
            self.init_state_h_ = init_state_h_
            self.action_ = action_
            self.action_mask_ = action_mask_
コード例 #17
0
ファイル: network.py プロジェクト: zhuchen1990/DeepPavlov
    def _build_body(self):
        """Assemble the network body and return ``(logits, final_state)``.

        Pipeline: dense projection of ``self._features`` -> optional attention
        output concatenated on the last axis -> variational dropout -> LSTM
        over the utterances -> reshape to
        ``(self._batch_size, -1, self.hidden_size)`` -> variational dropout ->
        dense projection to ``self.action_size`` logits.

        Raises:
            ValueError: if ``self.attn`` is set but ``self.attn.type`` is not
                one of the supported attention mechanism names.
        """
        # input projection
        projected = tf.layers.dense(self._features, self.dense_size,
                                    kernel_regularizer=tf.nn.l2_loss,
                                    kernel_initializer=xav())

        if self.attn:
            attn_type = self.attn.type
            # The "cs_" mechanisms take an extra `depth` argument.
            with_depth = ('cs_general', 'cs_bahdanau')
            without_depth = ('general', 'bahdanau',
                             'light_general', 'light_bahdanau')
            with tf.variable_scope(
                    "attention_mechanism/{}".format(attn_type)):
                if attn_type in without_depth:
                    # Every mechanism lives in `am` as "<type>_attention".
                    attention_fn = getattr(
                        am, '{}_attention'.format(attn_type))
                    attended = attention_fn(
                        self._key,
                        self._emb_context,
                        hidden_size=self.attn.hidden_size,
                        projected_align=self.attn.projected_align)
                elif attn_type in with_depth:
                    attention_fn = getattr(
                        am, '{}_attention'.format(attn_type))
                    attended = attention_fn(
                        self._key,
                        self._emb_context,
                        hidden_size=self.attn.hidden_size,
                        depth=self.attn.depth,
                        projected_align=self.attn.projected_align)
                else:
                    raise ValueError("wrong value for attention mechanism type")
            projected = tf.concat([projected, attended], -1)

        rnn_input = tf_layers.variational_dropout(
            projected, keep_prob=self._dropout_keep_prob)

        # recurrent network unit
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
        # per-dialog lengths: sum of the utterance mask over its last axis
        lengths = tf.to_int32(tf.reduce_sum(self._utterance_mask, axis=-1))
        rnn_output, final_state = tf.nn.dynamic_rnn(
            cell,
            rnn_input,
            time_major=False,
            initial_state=self._initial_state,
            sequence_length=lengths)
        rnn_output = tf.reshape(
            rnn_output, (self._batch_size, -1, self.hidden_size))
        rnn_output = tf_layers.variational_dropout(
            rnn_output, keep_prob=self._dropout_keep_prob)

        # output projection
        logits = tf.layers.dense(rnn_output, self.action_size,
                                 kernel_regularizer=tf.nn.l2_loss,
                                 kernel_initializer=xav(), name='logits')
        return logits, final_state
コード例 #18
0
        def __graph__():
            """Build the single-step LSTM graph with user and action attention.

            Resets the default TensorFlow graph and creates:

            * placeholders for one feature vector (``[1, obs_size]``), the LSTM
              ``c``/``h`` states (``[1, nb_hidden]`` each), the ground-truth
              action, a scalar ``self.dropout`` value, a memory of previous
              hidden states (``[None, 2 * nb_hidden]``), a memory of previous
              action encodings (``[None, 300]``), an action projection matrix
              (``[300, action_size]``) and a one-hot action vector;
            * one LSTM step over the projected features;
            * softmax attention of the current state over the previous hidden
              states and over the previous action encodings;
            * logits computed from the element-wise sum of the current state
              and the two attention read-outs.

            NOTE(review): the three summed tensors are stacked before the
            reduction and ``Wo`` has 300 rows, so the graph only builds when
            ``2 * nb_hidden == 300``.

            Sets ``self.dropout``, ``self.loss``, ``self.prediction``,
            ``self.probs``, ``self.logits``, ``self.state``, ``self.train_op``,
            the input placeholders and the attention tensors listed at the end
            of the function.
            """
            tf.reset_default_graph()
            # Passed as the second positional argument of tf.nn.dropout below.
            # NOTE(review): in TF 1.x that argument is keep_prob, so despite
            # its name this is presumably a keep probability - confirm.
            self.dropout = tf.placeholder(dtype=tf.float32,
                                          shape=[],
                                          name="dropout")

            # entry points
            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')
            init_state_c_, init_state_h_ = (tf.placeholder(
                tf.float32, [1, nb_hidden]) for _ in range(2))
            action_ = tf.placeholder(tf.int32, name='ground_truth_action')

            # input projection
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))

            # add relu/tanh here if necessary
            projected_features = tf.matmul(features_, Wi) + bi

            lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)

            # Only the new state is used below; the step output is discarded.
            _, state = lstm_f(inputs=projected_features,
                              state=(init_state_c_, init_state_h_))

            # join the LSTM state tuple (c, h) into one [1, 2 * nb_hidden] row
            state_reshaped = tf.concat(axis=1, values=(state.c, state.h))
            state_reshaped = tf.nn.dropout(state_reshaped, self.dropout)

            # define user utterance memory
            prev_hidden_states = tf.placeholder(tf.float32,
                                                [None, nb_hidden * 2],
                                                name='prev_hidden_states')
            W_user = tf.get_variable('W_user', [nb_hidden * 2, nb_hidden * 2],
                                     initializer=xav())

            # with H2 = 2 * nb_hidden:
            # (None, H2) x (H2, H2) x (H2, 1) => (None, 1)
            user_attention_score = tf.matmul(
                tf.matmul(prev_hidden_states, W_user),
                tf.transpose(state_reshaped))
            # (1, None): softmax over the previous utterances
            user_attention_weights = tf.nn.softmax(
                tf.transpose(user_attention_score))
            # (None, H2)
            user_encodings = prev_hidden_states
            # (1, None) x (None, H2) => (1, H2)
            user_weighted_sum = tf.matmul(user_attention_weights,
                                          user_encodings)
            user_weighted_sum = tf.nn.dropout(user_weighted_sum, self.dropout)

            # define action attention variables
            action_projection = tf.placeholder(tf.float32, [300, action_size],
                                               name='action_projection')
            action_one_hot = tf.placeholder(tf.float32, [action_size],
                                            name='action_one_hot')
            expanded_action_one_hot = tf.expand_dims(action_one_hot, 1)

            # action_encoding => (300 x 1); this is the current memory value
            action_encoding = tf.matmul(action_projection,
                                        expanded_action_one_hot)
            action_encoding = tf.nn.dropout(action_encoding, self.dropout)
            # (1 x 300)
            action_encoding = tf.transpose(action_encoding)

            W_action = tf.get_variable('W_action', [300, nb_hidden * 2],
                                       initializer=xav())

            # Current hidden state as a column vector, (2 * nb_hidden) x 1.
            # Original author's note: the scores below are about the previous
            # actions - do not be misled by the variable names.
            transposed_hidden_state = tf.transpose(state_reshaped)

            # these are the previous system memory values
            prev_action_encodings = tf.placeholder(tf.float32, [None, 300],
                                                   name='prev_actions')
            # output : [None, 1]
            prev_projected_actions = tf.matmul(
                tf.matmul(prev_action_encodings, W_action),
                transposed_hidden_state)

            # shape : [number of prev_utter, 1]
            projected_actions = prev_projected_actions

            # shape : [1, number of prev_utter]
            transposed_projected_actions = tf.transpose(projected_actions)

            # output shape : [1, number of prev_utter]
            # Get action weights (probability distribution of each action encodings)
            action_weights = tf.nn.softmax(transposed_projected_actions)

            action_encodings = prev_action_encodings
            # output shape : (1, 300)
            system_weighted_sum = tf.matmul(action_weights, action_encodings)
            system_weighted_sum = tf.nn.dropout(system_weighted_sum,
                                                self.dropout)

            # Three variants to experiment with for the part below
            # (1. sum, 2. average, 3. pooling); the sum is the active one.
            sum_features = tf.reduce_sum(
                [state_reshaped, user_weighted_sum, system_weighted_sum], 0)
            # 2. avg_features = tf.reduce_mean([state_reshaped, user_weighted_sum, system_weighted_sum], 0)
            # 3. pooled_features = tf.reduce_max([state_reshaped, user_weighted_sum, system_weighted_sum], 0)

            # output projection
            Wo = tf.get_variable('Wo', [300, action_size], initializer=xav())
            bo = tf.get_variable('bo', [action_size],
                                 initializer=tf.constant_initializer(0.))
            # get logits
            logits = tf.matmul(sum_features, Wo) + bo

            # Alternative kept for reference: concatenate the LSTM features
            # with the two attention read-outs instead of summing them.
            # concatenated_features = tf.concat([state_reshaped, user_weighted_sum, system_weighted_sum], 1)
            # concatenated_features = tf.nn.dropout(concatenated_features, self.dropout)

            # # output projection
            # Wo = tf.get_variable('Wo', [300 + 256 + 256, action_size],
            #         initializer=xav())
            # bo = tf.get_variable('bo', [action_size],
            #         initializer=tf.constant_initializer(0.))

            # # get logits
            # logits = tf.matmul(concatenated_features, Wo) + bo

            probs = tf.squeeze(tf.nn.softmax(logits))

            # prediction (tf.argmax replaces the deprecated tf.arg_max)
            prediction = tf.argmax(probs, axis=0)

            # loss
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=action_)

            global_step = tf.Variable(0, trainable=False)
            learning_rate = tf.train.exponential_decay(0.25,
                                                       global_step,
                                                       200000,
                                                       0.8,
                                                       staircase=True)
            # train op
            train_op = tf.train.AdadeltaOptimizer(learning_rate).minimize(
                loss, global_step=global_step)

            # attach symbols to self
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholders
            self.features_ = features_
            self.init_state_c_ = init_state_c_
            self.init_state_h_ = init_state_h_
            self.action_ = action_

            # user attention values
            self.prev_hidden_states = prev_hidden_states
            self.user_encodings = user_encodings

            # attention placeholders
            self.action_projection = action_projection
            self.action_one_hot = action_one_hot
            self.prev_action_encodings = prev_action_encodings

            # attention values
            self.action_encoding = action_encoding
            self.action_encodings = action_encodings
            self.projected_actions = projected_actions

            self.user_attention_weights = user_attention_weights
            self.action_weights = action_weights
コード例 #19
0
ファイル: network.py プロジェクト: wangzhenya/DeepPavlov
    def _build_body(self):
        """Assemble the network body and return ``(logits, final_state)``.

        Steps: dense projection of ``self._features``; if ``self.attn`` is
        set, the output of the selected attention mechanism is concatenated on
        the last axis; variational dropout; an LSTM over the utterances
        (lengths taken from ``self._utterance_mask``); dense projection of the
        LSTM outputs to ``self.action_size`` logits.

        Raises:
            ValueError: if ``self.attn`` is set but ``self.attn.type`` names
                an unsupported attention mechanism.
        """
        # input projection
        dense_out = tf.layers.dense(self._features, self.dense_size,
                                    kernel_regularizer=tf.nn.l2_loss,
                                    kernel_initializer=xav())

        if self.attn:
            kind = self.attn.type
            scope_name = "attention_mechanism/{}".format(kind)
            with tf.variable_scope(scope_name):
                # Two families: the "cs_" mechanisms also receive `depth`.
                if kind in ('cs_general', 'cs_bahdanau'):
                    mechanism = getattr(am, kind + '_attention')
                    attn_out = mechanism(
                        self._key,
                        self._emb_context,
                        hidden_size=self.attn.hidden_size,
                        depth=self.attn.depth,
                        projected_align=self.attn.projected_align)
                elif kind in ('general', 'bahdanau',
                              'light_general', 'light_bahdanau'):
                    mechanism = getattr(am, kind + '_attention')
                    attn_out = mechanism(
                        self._key,
                        self._emb_context,
                        hidden_size=self.attn.hidden_size,
                        projected_align=self.attn.projected_align)
                else:
                    raise ValueError("wrong value for attention mechanism type")
            dense_out = tf.concat([dense_out, attn_out], -1)

        dropped = tf_layers.variational_dropout(
            dense_out, keep_prob=self._dropout_keep_prob)

        # recurrent network unit
        lstm = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
        # per-dialog lengths: sum of the utterance mask over its last axis
        seq_lengths = tf.to_int32(
            tf.reduce_sum(self._utterance_mask, axis=-1))
        lstm_out, lstm_state = tf.nn.dynamic_rnn(
            lstm,
            dropped,
            initial_state=self._initial_state,
            sequence_length=seq_lengths)

        # output projection
        action_logits = tf.layers.dense(lstm_out, self.action_size,
                                        kernel_regularizer=tf.nn.l2_loss,
                                        kernel_initializer=xav(),
                                        name='logits')
        return action_logits, lstm_state
コード例 #20
0
    def __graph__(self):
        """Build the dialog-level action classifier graph and attach it to self.

        The graph consists of:
          * an ``ae_ood.RNNAutoencoder`` (created under the ``'ae'`` variable
            scope) whose encoder state is used as a per-turn encoding;
          * a linear input projection of the turn encoding concatenated with
            the ``input_contexts`` features;
          * a ``BasicLSTMCell`` run over the turns with ``tf.nn.dynamic_rnn``;
          * a linear output projection producing per-turn action logits.

        Attributes set on ``self``: ``hcn_loss``, ``ae_loss``, ``loss``,
        ``lr``, ``train_op``, ``ae``, ``prediction``, ``probs``, ``logits``,
        ``sequence_mask_`` and the placeholders ``input_contexts``,
        ``action_`` and ``action_mask_``.

        ``self.probs`` holds the softmax of the logits multiplied by the
        action mask, and ``self.prediction`` is its argmax over the action
        axis.
        """
        with tf.variable_scope('ae'):
            ae = ae_ood.RNNAutoencoder(self.config, self.rev_vocab)

        # placeholders
        input_contexts = tf.placeholder(
            tf.float32,
            [None, self.max_input_length, self.feature_vector_size],
            name='input_contexts')
        action_ = tf.placeholder(tf.int32, [None, self.max_input_length],
                                 name='ground_truth_action')
        action_mask_ = tf.placeholder(
            tf.float32, [None, self.max_input_length, self.action_size],
            name='action_mask')
        # Sequence length = number of non-zero action ids, i.e. action id 0
        # is counted as padding here.
        action_seq_length = tf.count_nonzero(action_, -1)

        # Per-turn features: autoencoder state regrouped by dialog, followed
        # by the externally supplied context features.
        ae_turn_encodings = tf.concat(ae.enc_state, axis=-1)
        dialog_turn_encodings = tf.reshape(
            ae_turn_encodings,
            shape=[-1, self.max_input_length,
                   self.config['embedding_size'] * 2])
        turn_features = tf.concat([dialog_turn_encodings, input_contexts],
                                  axis=-1)

        # input projection
        # NOTE(review): Wi expects nb_hidden * 2 encoding columns while the
        # reshape above uses config['embedding_size'] * 2 -- this presumably
        # relies on both being equal; confirm against the configuration.
        Wi = tf.get_variable('Wi',
                             shape=[
                                 self.feature_vector_size + self.nb_hidden * 2,
                                 self.nb_hidden
                             ],
                             dtype=tf.float32,
                             initializer=xav())
        bi = tf.get_variable('bi',
                             shape=[self.nb_hidden],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.))

        # add relu/tanh here if necessary
        projected_features = tf.tensordot(turn_features, Wi, axes=1) + bi

        lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.nb_hidden,
                                                 state_is_tuple=True,
                                                 name='dialog_encoder')
        outputs, states = tf.nn.dynamic_rnn(lstm_cell,
                                            projected_features,
                                            dtype=tf.float32)

        # output projection
        Wo = tf.get_variable('Wo',
                             shape=[self.nb_hidden, self.action_size],
                             dtype=tf.float32,
                             initializer=xav())
        bo = tf.get_variable('bo',
                             shape=[self.action_size],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.))
        logits = tf.tensordot(outputs, Wo, axes=1) + bo

        # Masked action distribution. The softmax is applied before masking
        # so that every score is non-negative: multiplying raw logits by the
        # mask would leave masked-out actions at exactly 0, which outranks
        # the valid actions whenever all of their logits are negative.
        # The loss below still consumes the raw logits.
        probs = tf.multiply(tf.nn.softmax(logits), action_mask_)

        # prediction
        prediction = tf.argmax(probs, axis=-1)

        # Weight 1.0 for the first action_seq_length turns, 0.0 afterwards.
        sequence_mask = tf.sequence_mask(action_seq_length,
                                         self.max_input_length,
                                         dtype=tf.float32)
        # loss
        self.hcn_loss = tf.contrib.seq2seq.sequence_loss(
            logits=logits,
            targets=action_,
            weights=sequence_mask,
            average_across_batch=True)

        self.ae_loss = ae.loss_op
        loss = tf.reduce_mean(self.hcn_loss + self.ae_loss)

        # NOTE(review): decay_steps falls back to 0 when
        # 'steps_before_decay' is missing from the config, while
        # exponential_decay divides the global step by it -- confirm that
        # configs enabling decay always provide a positive value.
        self.lr = tf.train.exponential_decay(
            self.config['learning_rate'],
            self.global_step,
            self.config.get('steps_before_decay', 0),
            self.config.get('learning_rate_decay', 1.0),
            staircase=True)
        optimizer = getattr(tf.train, self.config['optimizer'])(self.lr)
        # Gradients are kept separate from their variables so that clipping
        # can be inserted here; no clipping is applied at the moment.
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        self.train_op = optimizer.apply_gradients(zip(gradients, variables),
                                                  global_step=self.global_step)

        # attach symbols to self
        self.ae = ae
        self.loss = loss
        self.prediction = prediction
        self.probs = probs
        self.logits = logits
        self.sequence_mask_ = sequence_mask

        # attach placeholders
        self.input_contexts = input_contexts
        self.action_ = action_
        self.action_mask_ = action_mask_
コード例 #21
0
        def __graph__():
            """Build the single-step bidirectional GRU action classifier.

            Resets the default TensorFlow graph, then creates:
              * placeholders for one observation vector ``[1, obs_size]``,
                the ground-truth action and the action mask;
              * a linear input projection to ``nb_hidden`` units;
              * a forward and a backward ``GRUCell`` run for a single step
                with ``tf.nn.static_bidirectional_rnn``;
              * a linear output projection of the concatenated final states
                to ``action_size`` logits.

            Attaches ``loss``, ``prediction``, ``probs``, ``logits``,
            ``train_op`` and the placeholders to ``self``. ``action_mask_``
            is attached only when ``self.is_action_mask`` is set.
            """
            tf.reset_default_graph()

            # entry points
            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')
            # NOTE: these state placeholders are not connected to the GRU
            # graph below; they are still created and exposed on self.
            init_state_c_, init_state_h_ = (tf.placeholder(
                tf.float32, [1, nb_hidden]) for _ in range(2))

            action_ = tf.placeholder(tf.int32, name='ground_truth_action')
            action_mask_ = tf.placeholder(tf.float32, [action_size],
                                          name='action_mask')

            # input projection - a trick to match the input dimension
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))

            projected_features = tf.matmul(features_, Wi) + bi

            cell_fw = tf.contrib.rnn.GRUCell(num_units=nb_hidden)
            cell_bw = tf.contrib.rnn.GRUCell(num_units=nb_hidden)

            # Single-element input list: the RNN is unrolled for one step.
            # Only the final states are used; the per-step outputs are not.
            _, output_state_fw, output_state_bw = tf.nn.static_bidirectional_rnn(
                cell_fw,
                cell_bw,
                inputs=[projected_features],
                dtype=tf.float32)

            # forward and backward final states side by side
            state_reshaped = tf.concat(axis=1,
                                       values=(output_state_fw,
                                               output_state_bw))

            # output projection
            Wo = tf.get_variable('Wo', [2 * nb_hidden, action_size],
                                 initializer=xav())
            bo = tf.get_variable('bo', [action_size],
                                 initializer=tf.constant_initializer(0.))

            logits = tf.matmul(state_reshaped, Wo) + bo
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=action_)
            train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

            # Action distribution with the size-1 batch axis squeezed away,
            # optionally restricted by the action mask.
            probs = tf.squeeze(tf.nn.softmax(logits))
            if self.is_action_mask:
                probs = tf.multiply(probs, action_mask_)

            # tf.argmax replaces the deprecated tf.arg_max(..., dimension=0)
            prediction = tf.argmax(probs, axis=0)

            # each output values
            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.train_op = train_op

            # attach placeholder
            self.features_ = features_
            self.init_state_c_ = init_state_c_
            self.init_state_h_ = init_state_h_
            self.action_ = action_
            if self.is_action_mask:
                self.action_mask_ = action_mask_
コード例 #22
0
        def __graph__():
            """Build the single-step LSTM classifier with system-memory attention.

            Resets the default TensorFlow graph, then creates:
              * placeholders for one observation vector ``[1, obs_size]``,
                the LSTM cell/hidden input states, the current system
                feature vector, the previous system encodings and the
                ground-truth label;
              * a linear input projection followed by one ``LSTMCell`` step;
              * bilinear attention scores between each system encoding
                (previous ones plus the current one) and the concatenated
                LSTM state, normalised with a softmax;
              * a linear output projection of the LSTM state concatenated
                with the attention-weighted system encoding to
                ``num_class`` logits.

            Attaches ``loss``, ``prediction``, ``probs``, ``logits``,
            ``state``, ``train_op``, the placeholders and
            ``system_encodings`` to ``self``.
            """
            tf.reset_default_graph()

            # Width of one system feature vector, and of the concatenated
            # (c, h) LSTM state. Deriving the latter from nb_hidden keeps the
            # variable shapes valid for any hidden size (256 for 128 units).
            system_feature_size = 300
            state_size = 2 * nb_hidden

            features_ = tf.placeholder(tf.float32, [1, obs_size],
                                       name='input_features')
            init_state_c_, init_state_h_ = (tf.placeholder(
                tf.float32, [1, nb_hidden]) for _ in range(2))
            system_features = tf.placeholder(tf.float32,
                                             [system_feature_size],
                                             name='system_features')
            ground_label = tf.placeholder(tf.int32, name='ground_truth_action')

            # input projection
            Wi = tf.get_variable('Wi', [obs_size, nb_hidden],
                                 initializer=xav())
            bi = tf.get_variable('bi', [nb_hidden],
                                 initializer=tf.constant_initializer(0.))

            # add relu/tanh here if necessary
            projected_features = tf.matmul(features_, Wi) + bi

            lstm_f = tf.contrib.rnn.LSTMCell(nb_hidden, state_is_tuple=True)

            lstm_op, state = lstm_f(inputs=projected_features,
                                    state=(init_state_c_, init_state_h_))

            # concatenate the LSTM state tuple:
            # two [1, nb_hidden] tensors -> [1, state_size]
            state_reshaped = tf.concat(axis=1, values=(state.c, state.h))

            # [state_size, 1]
            transposed_hidden_state = tf.transpose(state_reshaped)

            # output: [1, system_feature_size] => current system memory
            system_encoding = tf.expand_dims(system_features, 0)
            W_system = tf.get_variable('W_system',
                                       [system_feature_size, state_size],
                                       initializer=xav())

            current_system_attention_score = tf.matmul(
                tf.matmul(system_encoding, W_system), transposed_hidden_state)

            # previous system memory values
            prev_system_encodings = tf.placeholder(
                tf.float32, [None, system_feature_size])
            prev_system_attention_scores = tf.matmul(
                tf.matmul(prev_system_encodings, W_system),
                transposed_hidden_state)

            # output : [number of prev_utter + current_utter, 1]
            system_attention_scores = tf.concat(
                [prev_system_attention_scores, current_system_attention_score],
                0)
            transposed_system_attention_scores = tf.transpose(
                system_attention_scores)

            # [1, number of prev_utter + current_utter]
            system_attention_weights = tf.nn.softmax(
                transposed_system_attention_scores)

            # [number of prev_utter + current_utter, system_feature_size]
            system_encodings = tf.concat(
                [prev_system_encodings, system_encoding], 0)

            # [1, system_feature_size]
            weighted_system_encodings = tf.matmul(system_attention_weights,
                                                  system_encodings)

            concatenated_features = tf.concat(
                [state_reshaped, weighted_system_encodings], 1)

            # output projection
            Wo = tf.get_variable('Wo',
                                 [state_size + system_feature_size, num_class],
                                 initializer=xav())
            bo = tf.get_variable('bo', [num_class],
                                 initializer=tf.constant_initializer(0.))
            # get logits
            logits = tf.matmul(concatenated_features, Wo) + bo

            probs = tf.squeeze(tf.nn.softmax(logits))

            # prediction; tf.argmax replaces the deprecated
            # tf.arg_max(..., dimension=0)
            prediction = tf.argmax(probs, axis=0)

            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=ground_label)
            train_op = tf.train.AdadeltaOptimizer(0.1).minimize(loss)

            self.loss = loss
            self.prediction = prediction
            self.probs = probs
            self.logits = logits
            self.state = state
            self.train_op = train_op

            # attach placeholders
            self.features_ = features_
            self.system_features = system_features
            self.init_state_c_ = init_state_c_
            self.init_state_h_ = init_state_h_
            self.ground_label = ground_label

            self.prev_system_encodings = prev_system_encodings
            self.system_encodings = system_encodings