def _build(self, input, mode=None):
    r"""Computes the feed-forward pass over the input.

    Args:
        input: Input tensor of size `(max_time, batch_size, hidden_dim)`.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
            `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout is
            controlled by :func:`texar.tf.global_mode`.

    Returns:
        A tensor output of the position-wise feed-forward network.
    """
    is_training = is_train_mode(mode)
    output = self.linear1(input)
    output = self.dropout(output, training=is_training)
    output = self.linear2(output)
    output = self.dropout(output, training=is_training)

    # residual + layer norm
    output = tf.contrib.layers.layer_norm(
        input + output, begin_norm_axis=-1, scope=self.variable_scope,
        reuse=tf.AUTO_REUSE)

    return output
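
# Illustrative sketch (not part of the module): the same residual feed-forward
# pattern written with plain TF1 ops. The hidden size and dropout rate here
# are placeholders, not values read from this module's hparams.
def _poswise_ffn_sketch(x, hidden_dim, training, dropout_rate=0.1):
    """Two linear maps with dropout, then a residual connection + layer norm."""
    h = tf.layers.dense(x, hidden_dim, activation=tf.nn.relu)
    h = tf.layers.dropout(h, rate=dropout_rate, training=training)
    h = tf.layers.dense(h, x.shape[-1].value)
    h = tf.layers.dropout(h, rate=dropout_rate, training=training)
    return tf.contrib.layers.layer_norm(x + h, begin_norm_axis=-1)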
def _build(self, token_ids, segment_ids=None, input_mask=None, mode=None):
    r"""Feeds the inputs through the network and makes regression.

    Args:
        token_ids: Shape `[batch_size, max_time]`.
        segment_ids: Shape `[batch_size, max_time]`.
        input_mask: Float tensor of shape `[batch_size, max_time]`. Note
            that positions with value 1 are masked out.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
            `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle dropout. If
            `None` (default), :func:`texar.tf.global_mode` is used.

    Returns:
        Regression predictions.

        - If ``regr_strategy`` is ``cls_time`` or ``all_time``, predictions
          have shape `[batch_size]`.
        - If ``regr_strategy`` is ``time_wise``, predictions have shape
          `[batch_size, max_time]`.
    """
    is_training = is_train_mode(mode)
    output, _ = self._encoder(token_ids, segment_ids,
                              input_mask=input_mask, mode=mode)

    strategy = self._hparams.regr_strategy
    if strategy == "time_wise":
        summary = output
    elif strategy == "cls_time":
        summary = output[:, -1]
    elif strategy == "all_time":
        length_diff = self._hparams.max_seq_len - tf.shape(token_ids)[1]
        summary_input = tf.pad(output,
                               paddings=[[0, 0], [0, length_diff], [0, 0]])
        summary_input_dim = \
            self._encoder.output_size * self._hparams.max_seq_len
        summary = tf.reshape(summary_input, shape=[-1, summary_input_dim])
    else:
        raise ValueError("Unknown regression strategy: {}".format(strategy))

    if self._hparams.use_projection:
        summary = tf.tanh(self.projection(summary))
    # summary: (batch_size, hidden_dim)
    summary = self._dropout_layer(summary, training=is_training)

    logits = tf.squeeze(self._logit_layer(summary), -1)

    if not self._built:
        self._add_internal_trainable_variables()
        if self._logit_layer:
            self._add_trainable_variable(
                self._logit_layer.trainable_variables)
        self._built = True

    return logits
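
# Hedged usage sketch for the regressor above. The class name and hparams
# schema below are assumptions inferred from this method (`regr_strategy`,
# `max_seq_len`), not confirmed API:
#
#     regressor = XLNetRegressor(hparams={"regr_strategy": "cls_time"})
#     preds = regressor(token_ids, segment_ids=segment_ids,
#                       input_mask=input_mask)  # shape [batch_size]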
def _forward_output_layers(inputs, input_size, output_layer, time_major,
                           hparams, mode, sequence_length=None):
    """Forwards inputs through the output layers.

    Args:
        inputs: A Tensor of shape `[batch_size, max_time] + input_size` if
            :attr:`time_major=False`, or shape
            `[max_time, batch_size] + input_size` if :attr:`time_major=True`.

    Returns:
        A pair :attr:`(outputs, outputs_size)`, where

        - :attr:`outputs`: A Tensor of shape \
          `[batch_size, max_time] + outputs_size`.
        - :attr:`outputs_size`: An `int` or 1D `int` array representing the \
          output size.
    """
    if output_layer is None:
        return inputs, input_size

    if hparams is None:
        # output_layer was passed in from the constructor
        if isinstance(output_layer, (list, tuple)):
            raise ValueError('output_layer must not be a list or tuple.')
        output, output_size = _forward_single_output_layer(
            inputs, input_size, output_layer)
    else:
        # output_layer was built based on hparams
        output_layer = _to_list(output_layer)
        dropout_layer_ids = _to_list(hparams.dropout_layer_ids)
        training = is_train_mode(mode)

        output = inputs
        output_size = input_size
        for i, layer in enumerate(output_layer):
            if i in dropout_layer_ids:
                output = _apply_dropout(output, time_major, hparams,
                                        training)
            output, output_size = _forward_single_output_layer(
                output, output_size, layer)
        if len(output_layer) in dropout_layer_ids:
            output = _apply_dropout(output, time_major, hparams, training)

    if sequence_length is not None:
        output = mask_sequences(output, sequence_length,
                                time_major=time_major, tensor_rank=3)

    return output, output_size
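
# Hedged illustration of the `dropout_layer_ids` convention implemented above:
# index `i` applies dropout *before* the i-th output layer, and an index equal
# to `len(output_layer)` applies dropout *after* the last layer. For example
# (keys other than `dropout_layer_ids` are assumptions):
#
#     hparams.output_layer = {
#         "num_layers": 2,
#         "dropout_layer_ids": [0, 2],  # before layer 0 and after layer 1
#     }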
def test_mode(self):
    """Tests mode-related utilities."""
    training = mode.is_train_mode(None)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        training_ = sess.run(training)
        self.assertTrue(training_)

        training_ = sess.run(
            training,
            feed_dict={context.global_mode(): tf.estimator.ModeKeys.TRAIN})
        self.assertTrue(training_)

        training_ = sess.run(
            training,
            feed_dict={context.global_mode(): tf.estimator.ModeKeys.EVAL})
        self.assertFalse(training_)

    training = mode.is_train_mode(tf.estimator.ModeKeys.TRAIN)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        training_ = sess.run(training)
        self.assertTrue(training_)
def call(self, inputs, mode=None):  # pylint: disable=arguments-differ
    training = is_train_mode(mode)

    outputs = inputs
    for layer in self._layers:
        if isinstance(layer, (tf.layers.Dropout,
                              tf.layers.BatchNormalization)):
            # These layers behave differently at train vs. inference time.
            outputs = layer(outputs, training=training)
        else:
            outputs = layer(outputs)

    if not self.built or not self._vars_built:
        self._collect_weights()
        self._vars_built = True

    return outputs
def _compute_attention_score(self, q_head, k_head_h, v_head_h, k_head_r, segment_mat, attn_mask=None, mode=None): is_training = is_train_mode(mode) # Content based attention score. q_head_rw = q_head + self.r_w_bias # attn_ac: (max_time, tot_len, batch_size, n_head) attn_ac = tf.einsum('ibnd,jbnd->ijbn', q_head_rw, k_head_h) # Position based attention score. q_head_rr = q_head + self.r_r_bias # attn_bd: (max_time, tot_len, batch_size, n_head) attn_bd = tf.einsum('ibnd,jbnd->ijbn', q_head_rr, k_head_r) attn_bd = self._rel_shift(attn_bd, klen=tf.shape(attn_ac)[1]) # Segment based attention score. if segment_mat is None: attn_ef = 0 else: q_head_rs = q_head + self.r_s_bias attn_ef = tf.einsum('ibnd,snd->ibns', q_head_rs, self.segment_embed) attn_ef = tf.einsum('ijbs,ibns->ijbn', segment_mat, attn_ef) # Merge attention scores and perform masking. # attn_score: (max_time, tot_len, batch_size, n_head) attn_score = (attn_ac + attn_bd + attn_ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask attn_score = attn_score - 1e30 * attn_mask # attention probability attn_prob = tf.nn.softmax(attn_score, 1) attn_prob = self.dropout_attn(attn_prob, training=is_training) # attention output attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h) return attn_vec
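
# Illustrative shape check (not called anywhere) for the einsum contractions
# above: with i=max_time, j=tot_len, b=batch_size, n=n_head, d=head_dim,
# 'ibnd,jbnd->ijbn' contracts over the head dimension d.
def _rel_attn_shape_demo():
    q = tf.zeros([5, 2, 4, 8])  # (max_time, batch_size, n_head, head_dim)
    k = tf.zeros([7, 2, 4, 8])  # (tot_len,  batch_size, n_head, head_dim)
    return tf.einsum('ibnd,jbnd->ijbn', q, k)  # shape (5, 7, 2, 4)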
def _build(self, inputs, mode=None):
    """Feeds forward inputs through the network layers and returns outputs.

    Args:
        inputs: The inputs to the network. The requirements on inputs
            depend on the first layer and subsequent layers in the
            network.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
            `TRAIN`, `EVAL`, and `PREDICT`. If `None`,
            :func:`texar.tf.global_mode` is used.

    Returns:
        The output of the network.
    """
    training = is_train_mode(mode)

    prev_outputs = inputs
    for layer_id, layer in enumerate(self._layers):
        if isinstance(layer, (tf.layers.Dropout,
                              tf.layers.BatchNormalization)):
            outputs = layer(prev_outputs, training=training)
        else:
            outputs = layer(prev_outputs)
        self._layer_outputs.append(outputs)
        self._layer_outputs_by_name[self._layer_names[layer_id]] = outputs
        prev_outputs = outputs

    if not self._built:
        self._add_internal_trainable_variables()
        # Add trainable variables of `self._layers` which may be
        # constructed externally.
        for layer in self._layers:
            self._add_trainable_variable(layer.trainable_variables)
        self._built = True

    return outputs
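
# Hedged usage sketch: building the network from a layer spec and calling it.
# The hparams schema ("layers"/"type"/"kwargs") follows texar's layer
# conventions but is an assumption here, not confirmed API:
#
#     network = FeedForwardNetwork(hparams={"layers": [
#         {"type": "Dense", "kwargs": {"units": 256, "activation": "relu"}},
#         {"type": "Dropout", "kwargs": {"rate": 0.1}},
#         {"type": "Dense", "kwargs": {"units": 10}},
#     ]})
#     logits = network(features)  # dropout follows texar.tf.global_mode()
#     logits = network(features, mode=tf.estimator.ModeKeys.TRAIN)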
def _self_attention_stack(self, inputs, memory, decoder_self_attention_bias=None, memory_attention_bias=None, cache=None, mode=None): """Stacked multihead attention module. """ def _layer_norm(x, scope): return layers.layer_normalize(x, reuse=tf.AUTO_REUSE, scope=scope) inputs = tf.layers.dropout(inputs, rate=self._hparams.embedding_dropout, training=is_train_mode(mode)) if cache is not None: if memory is not None: memory_attention_bias = \ cache['memory_attention_bias'] else: assert decoder_self_attention_bias is not None x = inputs for i in range(self._hparams.num_blocks): layer_name = 'layer_{}'.format(i) layer_cache = cache[layer_name] if cache is not None else None with tf.variable_scope(layer_name) as layer_scope: with tf.variable_scope("self_attention"): multihead_attention = \ self.multihead_attentions['self_att'][i] selfatt_output = multihead_attention( queries=_layer_norm(x, layer_scope), memory=None, memory_attention_bias=decoder_self_attention_bias, cache=layer_cache, mode=mode, ) x = x + tf.layers.dropout( selfatt_output, rate=self._hparams.residual_dropout, training=is_train_mode(mode), ) if memory is not None: with tf.variable_scope('encdec_attention') as \ encdec_attention_scope: multihead_attention = \ self.multihead_attentions['encdec_att'][i] encdec_output = multihead_attention( queries=_layer_norm(x, encdec_attention_scope), memory=memory, memory_attention_bias=memory_attention_bias, mode=mode, ) x = x + tf.layers.dropout( encdec_output, rate=self._hparams.residual_dropout, training=is_train_mode(mode)) poswise_network = self.poswise_networks[i] with tf.variable_scope('past_poswise_ln') as \ past_poswise_ln_scope: sub_output = tf.layers.dropout( poswise_network(_layer_norm(x, past_poswise_ln_scope)), rate=self._hparams.residual_dropout, training=is_train_mode(mode), ) x = x + sub_output return _layer_norm(x, scope=self.variable_scope)
def _build(self, ids=None, soft_ids=None, mode=None, **kwargs):
    r"""Embeds (soft) ids.

    Either :attr:`ids` or :attr:`soft_ids` must be given, and they
    must not be given at the same time.

    Args:
        ids (optional): An integer tensor containing the ids to embed.
        soft_ids (optional): A tensor of weights (probabilities) used to
            mix the embedding vectors.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
            `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout is
            controlled by :func:`texar.tf.global_mode`.
        kwargs: Additional keyword arguments for
            :tf_main:`tf.nn.embedding_lookup <nn/embedding_lookup>` besides
            :attr:`params` and :attr:`ids`.

    Returns:
        If :attr:`ids` is given, returns a Tensor of shape
        ``shape(ids) + embedding-dim``. For example, if
        ``shape(ids) = [batch_size, max_time]`` and
        ``shape(embedding) = [vocab_size, emb_dim]``, then the return
        tensor has shape ``[batch_size, max_time, emb_dim]``.

        If :attr:`soft_ids` is given, returns a Tensor of shape
        ``shape(soft_ids)[:-1] + embedding-dim``. For example, if
        ``shape(soft_ids) = [batch_size, max_time, vocab_size]`` and
        ``shape(embedding) = [vocab_size, emb_dim]``, then the return
        tensor has shape ``[batch_size, max_time, emb_dim]``.
    """
    if ids is not None:
        if soft_ids is not None:
            raise ValueError(
                'Must not specify `ids` and `soft_ids` at the same time.')
        ids_rank = get_rank(ids)
    elif soft_ids is not None:
        ids_rank = get_rank(soft_ids) - 1
    else:
        raise ValueError('Either `ids` or `soft_ids` must be given.')

    embedding = self._embedding

    is_training = is_train_mode(mode)
    if self._hparams.dropout_strategy == 'item_type':
        dropout_layer = self._get_dropout_layer(self._hparams)
        if dropout_layer:
            embedding = dropout_layer.apply(inputs=embedding,
                                            training=is_training)

    if ids is not None:
        outputs = tf.nn.embedding_lookup(embedding, ids, **kwargs)
    else:
        outputs = embedder_utils.soft_embedding_lookup(embedding, soft_ids)

    if self._hparams.dropout_strategy != 'item_type':
        dropout_layer = self._get_dropout_layer(
            self._hparams, ids_rank=ids_rank, dropout_input=outputs)
        if dropout_layer:
            outputs = dropout_layer.apply(inputs=outputs,
                                          training=is_training)

    return outputs
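
# Illustrative sketch (not called anywhere) of the soft-id mixing that
# `soft_embedding_lookup` performs per the docstring above: a contraction of
# the weight tensor with the embedding table over the vocab axis.
def _soft_lookup_demo(soft_ids, embedding):
    # soft_ids: [batch, time, vocab]; embedding: [vocab, dim]
    return tf.tensordot(soft_ids, embedding,
                        axes=[[-1], [0]])  # [batch, time, dim]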
def _build(self, decoding_strategy="train_greedy", initial_state=None, inputs=None, memory=None, sequence_length=None, embedding=None, start_tokens=None, end_token=None, softmax_temperature=None, max_decoding_length=None, impute_finished=False, output_time_major=False, input_time_major=False, helper=None, mode=None, **kwargs): # Memory for _mechanism in self._cell._attention_mechanisms: _mechanism.initialize_memory(memory) # Helper if helper is not None: pass elif decoding_strategy is not None: if decoding_strategy == "train_greedy": helper = rnn_decoder_helpers._get_training_helper( inputs, sequence_length, embedding, input_time_major) elif decoding_strategy == "infer_greedy": helper = tx_helper.GreedyEmbeddingHelper( embedding, start_tokens, end_token) elif decoding_strategy == "infer_sample": helper = tx_helper.SampleEmbeddingHelper( embedding, start_tokens, end_token, softmax_temperature) else: raise ValueError( "Unknown decoding strategy: {}".format(decoding_strategy)) else: if is_train_mode_py(mode): kwargs_ = copy.copy(self._hparams.helper_train.kwargs.todict()) helper_type = self._hparams.helper_train.type else: kwargs_ = copy.copy(self._hparams.helper_infer.kwargs.todict()) helper_type = self._hparams.helper_infer.type kwargs_.update({ "inputs": inputs, "sequence_length": sequence_length, "time_major": input_time_major, "embedding": embedding, "start_tokens": start_tokens, "end_token": end_token, "softmax_temperature": softmax_temperature}) kwargs_.update(kwargs) helper = rnn_decoder_helpers.get_helper(helper_type, **kwargs_) self._helper = helper # Initial state if initial_state is not None: self._initial_state = initial_state else: self._initial_state = self.zero_state( batch_size=self.batch_size, dtype=tf.float32) # Maximum decoding length max_l = max_decoding_length if max_l is None: max_l_train = self._hparams.max_decoding_length_train if max_l_train is None: max_l_train = utils.MAX_SEQ_LENGTH max_l_infer = self._hparams.max_decoding_length_infer if max_l_infer is None: max_l_infer = utils.MAX_SEQ_LENGTH max_l = tf.cond(is_train_mode(mode), lambda: max_l_train, lambda: max_l_infer) self.max_decoding_length = max_l # Decode outputs, final_state, sequence_lengths = dynamic_decode( decoder=self, impute_finished=impute_finished, maximum_iterations=max_l, output_time_major=output_time_major) if not self._built: self._add_internal_trainable_variables() # Add trainable variables of `self._cell` which may be # constructed externally. self._add_trainable_variable( layers.get_rnn_cell_trainable_variables(self._cell)) if isinstance(self._output_layer, tf.layers.Layer): self._add_trainable_variable( self._output_layer.trainable_variables) # Add trainable variables of `self._beam_search_rnn_cell` which # may already be constructed and used. if self._beam_search_cell is not None: self._add_trainable_variable( self._beam_search_cell.trainable_variables) self._built = True return outputs, final_state, sequence_lengths
def _build(self,
           decoding_strategy="train_greedy",
           initial_state=None,
           inputs=None,
           sequence_length=None,
           embedding=None,
           start_tokens=None,
           end_token=None,
           softmax_temperature=None,
           max_decoding_length=None,
           impute_finished=False,
           output_time_major=False,
           input_time_major=False,
           helper=None,
           mode=None,
           **kwargs):
    """Performs decoding. This is a shared interface for both
    :class:`~texar.tf.modules.BasicRNNDecoder` and
    :class:`~texar.tf.modules.AttentionRNNDecoder`.

    The function provides **3 ways** to specify the decoding method, with
    varying flexibility:

    1. The :attr:`decoding_strategy` argument: A string taking value of:

        - **"train_greedy"**: decoding in teacher-forcing fashion \
          (i.e., feeding `ground truth` to decode the next step), and each \
          sample is obtained by taking the `argmax` of the RNN output \
          logits. Arguments :attr:`(inputs, sequence_length, \
          input_time_major)` are required for this strategy, and argument \
          :attr:`embedding` is optional.
        - **"infer_greedy"**: decoding in inference fashion (i.e., feeding \
          the `generated` sample to decode the next step), and each sample \
          is obtained by taking the `argmax` of the RNN output logits. \
          Arguments :attr:`(embedding, start_tokens, end_token)` are \
          required for this strategy, and argument \
          :attr:`max_decoding_length` is optional.
        - **"infer_sample"**: decoding in inference fashion, and each \
          sample is obtained by `random sampling` from the RNN output \
          distribution. Arguments \
          :attr:`(embedding, start_tokens, end_token)` are \
          required for this strategy, and argument \
          :attr:`max_decoding_length` is optional.

      This argument is used only when argument :attr:`helper` is `None`.

      Example:

        .. code-block:: python

            embedder = WordEmbedder(vocab_size=data.vocab.size)
            decoder = BasicRNNDecoder(vocab_size=data.vocab.size)

            # Teacher-forcing decoding
            outputs_1, _, _ = decoder(
                decoding_strategy='train_greedy',
                inputs=embedder(data_batch['text_ids']),
                sequence_length=data_batch['length']-1)

            # Random sample decoding. Gets 100 sequence samples
            outputs_2, _, sequence_length = decoder(
                decoding_strategy='infer_sample',
                start_tokens=[data.vocab.bos_token_id]*100,
                end_token=data.vocab.eos_token_id,
                embedding=embedder,
                max_decoding_length=60)

    2. The :attr:`helper` argument: An instance of subclass of \
       :class:`texar.tf.modules.Helper`. This provides a superset of the \
       decoding strategies above, for example:

        - :class:`~texar.tf.modules.TrainingHelper` corresponding to the \
          "train_greedy" strategy.
        - :class:`~texar.tf.modules.GreedyEmbeddingHelper` and \
          :class:`~texar.tf.modules.SampleEmbeddingHelper` corresponding \
          to the "infer_greedy" and "infer_sample" strategies, \
          respectively.
        - :class:`~texar.tf.modules.TopKSampleEmbeddingHelper` for Top-K \
          sample decoding.
        - :class:`ScheduledEmbeddingTrainingHelper` and \
          :class:`ScheduledOutputTrainingHelper` for scheduled \
          sampling.
        - :class:`~texar.tf.modules.SoftmaxEmbeddingHelper` and \
          :class:`~texar.tf.modules.GumbelSoftmaxEmbeddingHelper` for \
          soft decoding and gradient backpropagation.

       Helpers give the maximal flexibility of configuring the decoding \
       strategy.

       Example:

        .. code-block:: python

            embedder = WordEmbedder(vocab_size=data.vocab.size)
            decoder = BasicRNNDecoder(vocab_size=data.vocab.size)

            # Teacher-forcing decoding, same as above with
            # `decoding_strategy='train_greedy'`
            helper_1 = tx.modules.TrainingHelper(
                inputs=embedder(data_batch['text_ids']),
                sequence_length=data_batch['length']-1)
            outputs_1, _, _ = decoder(helper=helper_1)

            # Gumbel-softmax decoding
            helper_2 = GumbelSoftmaxEmbeddingHelper(
                embedding=embedder,
                start_tokens=[data.vocab.bos_token_id]*100,
                end_token=data.vocab.eos_token_id,
                tau=0.1)
            outputs_2, _, sequence_length = decoder(
                max_decoding_length=60, helper=helper_2)

    3. :attr:`hparams["helper_train"]` and :attr:`hparams["helper_infer"]`: \
       Specifying the helper through hyperparameters. Train and infer \
       strategy is toggled based on :attr:`mode`. Appropriate arguments \
       (e.g., :attr:`inputs`, :attr:`start_tokens`, etc.) are selected to \
       construct the helper. Additional arguments for the helper \
       constructor can be provided either through :attr:`**kwargs`, or \
       through :attr:`hparams["helper_train/infer"]["kwargs"]`.

       This manner is used only when both :attr:`decoding_strategy` and \
       :attr:`helper` are `None`.

       Example:

        .. code-block:: python

            h = {
                "helper_infer": {
                    "type": "GumbelSoftmaxEmbeddingHelper",
                    "kwargs": { "tau": 0.1 }
                }
            }
            embedder = WordEmbedder(vocab_size=data.vocab.size)
            decoder = BasicRNNDecoder(vocab_size=data.vocab.size, hparams=h)

            # Gumbel-softmax decoding
            output, _, _ = decoder(
                decoding_strategy=None,  # Set to `None` explicitly
                embedding=embedder,
                start_tokens=[data.vocab.bos_token_id]*100,
                end_token=data.vocab.eos_token_id,
                max_decoding_length=60,
                mode=tf.estimator.ModeKeys.PREDICT)
                # PREDICT mode also shuts down dropout

    Args:
        decoding_strategy (str): A string specifying the decoding
            strategy. Different arguments are required based on the
            strategy.
            Ignored if :attr:`helper` is given.
        initial_state (optional): Initial state of decoding.
            If `None` (default), zero state is used.
        inputs (optional): Input tensors for teacher-forcing decoding.
            Used when `decoding_strategy` is set to "train_greedy", or
            when `hparams`-configured helper is used.

            - If :attr:`embedding` is `None`, `inputs` is directly \
              fed to the decoder. E.g., in `"train_greedy"` strategy, \
              `inputs` must be a 3D Tensor of shape \
              `[batch_size, max_time, emb_dim]` (or \
              `[max_time, batch_size, emb_dim]` if \
              `input_time_major` == `True`).
            - If `embedding` is given, `inputs` is used as index \
              to look up embeddings that are fed into the decoder. \
              E.g., if `embedding` is an instance of \
              :class:`~texar.tf.modules.WordEmbedder`, \
              then :attr:`inputs` is usually a 2D int Tensor \
              `[batch_size, max_time]` (or \
              `[max_time, batch_size]` if `input_time_major` == `True`) \
              containing the token indexes.
        sequence_length (optional): A 1D int Tensor containing the
            sequence length of :attr:`inputs`.
            Used when `decoding_strategy="train_greedy"` or
            `hparams`-configured helper is used.
        embedding (optional): Embedding used when:

            - "infer_greedy" or "infer_sample" `decoding_strategy` is \
              used. This can be a callable or the `params` argument for \
              :tf_main:`embedding_lookup <nn/embedding_lookup>`. \
              If a callable, it can take a vector tensor of token `ids`, \
              or take two arguments (`ids`, `times`), where `ids` is a \
              vector tensor of token ids, and `times` is a vector tensor \
              of time steps (i.e., position ids). The latter case can be \
              used when :attr:`embedding` is a combination of word \
              embedding and position embedding. `embedding` is required \
              in this case.
            - "train_greedy" `decoding_strategy` is used. \
              This can be a callable or the `params` argument for \
              :tf_main:`embedding_lookup <nn/embedding_lookup>`. \
              If a callable, it can take :attr:`inputs` and returns \
              the input embedding. `embedding` is optional in this case.
        start_tokens (optional): An int Tensor of shape `[batch_size]`,
            the start tokens.
            Used when `decoding_strategy="infer_greedy"` or
            `"infer_sample"`, or when the helper specified in `hparams`
            is used.

            Example:

                .. code-block:: python

                    data = tx.data.MonoTextData(hparams)
                    iterator = DataIterator(data)
                    batch = iterator.get_next()

                    bos_token_id = data.vocab.bos_token_id
                    start_tokens=tf.ones_like(batch['length'])*bos_token_id

        end_token (optional): An int 0D Tensor, the token that marks the
            end of decoding.
            Used when `decoding_strategy="infer_greedy"` or
            `"infer_sample"`, or when the helper specified in `hparams`
            is used.
        softmax_temperature (optional): A float 0D Tensor, value to
            divide the logits by before computing the softmax. Larger
            values (above 1.0) result in more random samples. Must be
            > 0. If `None`, 1.0 is used.
            Used when `decoding_strategy="infer_sample"`.
        max_decoding_length: An int scalar Tensor indicating the maximum
            allowed number of decoding steps. If `None` (default), either
            `hparams["max_decoding_length_train"]` or
            `hparams["max_decoding_length_infer"]` is used
            according to :attr:`mode`.
        impute_finished (bool): If `True`, then states for batch
            entries which are marked as finished get copied through and
            the corresponding outputs get zeroed out. This causes some
            slowdown at each time step, but ensures that the final state
            and outputs have the correct values and that backprop ignores
            time steps that were marked as finished.
        output_time_major (bool): If `True`, outputs are returned as
            time major tensors. If `False` (default), outputs are
            returned as batch major tensors.
        input_time_major (optional): Whether the :attr:`inputs` tensor is
            time major.
            Used when `decoding_strategy="train_greedy"` or
            `hparams`-configured helper is used.
        helper (optional): An instance of
            :class:`texar.tf.modules.Helper` that defines the decoding
            strategy. If given, `decoding_strategy` and helper configs
            in :attr:`hparams` are ignored.
        mode (str, optional): A string taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`. If
            `TRAIN`, training-related hyperparameters are used (e.g.,
            `hparams['max_decoding_length_train']`), otherwise,
            inference-related hyperparameters are used (e.g.,
            `hparams['max_decoding_length_infer']`).
            If `None` (default), `TRAIN` mode is used.
        **kwargs: Other keyword arguments for constructing helpers
            defined by `hparams["helper_train"]` or
            `hparams["helper_infer"]`.

    Returns:
        `(outputs, final_state, sequence_lengths)`, where

        - **`outputs`**: an object containing the decoder output on all \
          time steps.
        - **`final_state`**: the cell state of the final time step.
        - **`sequence_lengths`**: an int Tensor of shape `[batch_size]` \
          containing the length of each sample.
""" # Helper if helper is not None: pass elif decoding_strategy is not None: if decoding_strategy == "train_greedy": helper = rnn_decoder_helpers._get_training_helper( inputs, sequence_length, embedding, input_time_major) elif decoding_strategy == "infer_greedy": helper = tx_helper.GreedyEmbeddingHelper( embedding, start_tokens, end_token) elif decoding_strategy == "infer_sample": helper = tx_helper.SampleEmbeddingHelper( embedding, start_tokens, end_token, softmax_temperature) else: raise ValueError( "Unknown decoding strategy: {}".format(decoding_strategy)) else: if is_train_mode_py(mode): kwargs_ = copy.copy(self._hparams.helper_train.kwargs.todict()) helper_type = self._hparams.helper_train.type else: kwargs_ = copy.copy(self._hparams.helper_infer.kwargs.todict()) helper_type = self._hparams.helper_infer.type kwargs_.update({ "inputs": inputs, "sequence_length": sequence_length, "time_major": input_time_major, "embedding": embedding, "start_tokens": start_tokens, "end_token": end_token, "softmax_temperature": softmax_temperature }) kwargs_.update(kwargs) helper = rnn_decoder_helpers.get_helper(helper_type, **kwargs_) self._helper = helper # Initial state if initial_state is not None: self._initial_state = initial_state else: self._initial_state = self.zero_state(batch_size=self.batch_size, dtype=tf.float32) # Maximum decoding length max_l = max_decoding_length if max_l is None: max_l_train = self._hparams.max_decoding_length_train if max_l_train is None: max_l_train = utils.MAX_SEQ_LENGTH max_l_infer = self._hparams.max_decoding_length_infer if max_l_infer is None: max_l_infer = utils.MAX_SEQ_LENGTH max_l = tf.cond(is_train_mode(mode), lambda: max_l_train, lambda: max_l_infer) self.max_decoding_length = max_l # Decode outputs, final_state, sequence_lengths = dynamic_decode( decoder=self, impute_finished=impute_finished, maximum_iterations=max_l, output_time_major=output_time_major) if not self._built: self._add_internal_trainable_variables() # Add trainable variables of `self._cell` which may be # constructed externally. self._add_trainable_variable( layers.get_rnn_cell_trainable_variables(self._cell)) if isinstance(self._output_layer, tf.layers.Layer): self._add_trainable_variable( self._output_layer.trainable_variables) # Add trainable variables of `self._beam_search_rnn_cell` which # may already be constructed and used. if self._beam_search_cell is not None: self._add_trainable_variable( self._beam_search_cell.trainable_variables) self._built = True return outputs, final_state, sequence_lengths
def _build(self, positions=None, sequence_length=None, mode=None, **kwargs): r"""Embeds the positions. Either :attr:`positions` or :attr:`sequence_length` is required: - If both are given, :attr:`sequence_length` is used to mask out embeddings of those time steps beyond the respective sequence lengths. - If only :attr:`sequence_length` is given, then positions from 0 to ``sequence_length-1`` are embedded. Args: positions (optional): An integer tensor containing the position ids to embed. sequence_length (optional): An integer tensor of shape ``[batch_size]``. Time steps beyond the respective sequence lengths will have zero-valued embeddings. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout will be controlled by :func:`texar.tf.global_mode`. kwargs: Additional keyword arguments for :tf_main:`tf.nn.embedding_lookup <nn/embedding_lookup>` besides :attr:`params` and :attr:`ids`. Returns: A `Tensor` of shape `shape(inputs) + embedding dimension`. """ # Gets embedder inputs # pylint:disable=too-many-locals inputs = positions if positions is None: if sequence_length is None: raise ValueError( "Either `positions` or `sequence_length` is required.") max_length = tf.reduce_max(sequence_length) single_inputs = tf.range(start=0, limit=max_length, dtype=tf.int32) # Expands `single_inputs` to have shape [batch_size, max_length] expander = tf.expand_dims(tf.ones_like(sequence_length), -1) inputs = expander * tf.expand_dims(single_inputs, 0) ids_rank = len(inputs.shape.dims) embedding = self._embedding is_training = is_train_mode(mode) # Gets dropout strategy st = self._hparams.dropout_strategy if positions is None and st == "item": # If `inputs` is based on `sequence_length`, then dropout # strategies 'item' and 'item_type' have the same effect, we # use 'item_type' to avoid unknown noise_shape in the 'item' # strategy st = "item_type" # Dropouts as 'item_type' before embedding if st == "item_type": dropout_layer = self._get_dropout_layer(self._hparams, dropout_strategy=st) if dropout_layer: embedding = dropout_layer.apply(inputs=embedding, training=is_training) # Embeds outputs = tf.nn.embedding_lookup(embedding, inputs, **kwargs) # Dropouts as 'item' or 'elements' after embedding if st != "item_type": dropout_layer = self._get_dropout_layer( self._hparams, ids_rank=ids_rank, dropout_input=outputs, dropout_strategy=st, ) if dropout_layer: outputs = dropout_layer.apply(inputs=outputs, training=is_training) # Optionally masks if sequence_length is not None: outputs = mask_sequences( outputs, sequence_length, tensor_rank=len(inputs.shape.dims) + self._dim_rank, ) return outputs
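
# Runnable sketch (not called anywhere) of how position ids are derived from
# `sequence_length` above, with a batch of two sequences:
def _position_ids_demo():
    sequence_length = tf.constant([3, 5])
    max_length = tf.reduce_max(sequence_length)
    single = tf.range(start=0, limit=max_length, dtype=tf.int32)
    # Broadcasts to [batch_size, max_length]; each row is 0 .. max_length-1.
    return tf.expand_dims(tf.ones_like(sequence_length), -1) \
        * tf.expand_dims(single, 0)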
def _post_attention(self, attn_vec, mode=None): is_training = is_train_mode(mode) attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.output_projection) attn_out = self.dropout(attn_out, training=is_training) return attn_out
def _build(self, queries, memory, memory_attention_bias,
           cache=None, mode=None):
    """Encodes the inputs.

    Args:
        queries: A 3D Tensor of shape `[batch, length_query, depth_query]`.
        memory: A 3D Tensor of shape `[batch, length_key, depth_key]`.
        memory_attention_bias: A 3D Tensor of shape
            `[batch, length_key, num_units]`.
        cache: Memory cache; used only when inferring a sentence from
            scratch (dynamic decoding).
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
            including `TRAIN`, `EVAL` and `PREDICT`. Controls dropout
            mode. If `None` (default), :func:`texar.tf.global_mode` is
            used.

    Returns:
        A Tensor of shape `[batch_size, max_time, dim]` containing the
        encoded vectors.
    """
    with tf.variable_scope(self.variable_scope):
        num_heads = self._hparams.num_heads
        num_units = self._hparams.num_units
        if num_units % num_heads:
            raise ValueError(
                "Value depth (%d) must be divisible by the number of "
                "attention heads (%d)." % (num_units, num_heads))

        def _update_and_return(layer, key):
            if memory is None:
                # Self-attention
                out = layer(queries)
                if cache is not None:
                    # decoder self-attention during dynamic decoding
                    key = 'self_{}'.format(key)
                    res = cache[key]
                    if isinstance(res, tf.TensorArray):
                        # inference-like decoding
                        # TODO(zhiting): This writing op may cause a bug
                        # on CPU--it looks like the two TensorArrays
                        # cache['self_keys'] and cache['self_values']
                        # will mix up starting from certain step, causing
                        # shape mismatch. This op looks fine on GPU.
                        res = res.write(
                            res.size(), tf.squeeze(out, axis=[1]))
                        out = transpose_batch_time(res.stack())
                    else:
                        # normal decoding
                        res = tf.concat([res, out], axis=1)
                        out = res
                    cache[key] = res
            else:
                # encoder-decoder attention
                if cache is not None:
                    key = 'memory_{}'.format(key)
                    res = cache[key]
                    if isinstance(res, tf.TensorArray):
                        # inference-like decoding
                        size = res.size()
                        false_fn = lambda: transpose_batch_time(res.stack())
                    else:
                        # normal decoding
                        size = tf.shape(res)[1]
                        false_fn = lambda: res
                    out = tf.cond(tf.equal(size, 0),
                                  true_fn=lambda: layer(memory),
                                  false_fn=false_fn)
                else:
                    out = layer(memory)
            return out

        Q = self.Q_dense(queries)
        K = _update_and_return(self.K_dense, 'keys')
        V = _update_and_return(self.V_dense, 'values')

        Q_ = self._split_heads(Q)
        K_ = self._split_heads(K)
        V_ = self._split_heads(V)
        # [batch_size, num_heads, seq_length, memory_depth]
        key_depth_per_head = num_units // num_heads
        Q_ *= key_depth_per_head**-0.5

        logits = tf.matmul(Q_, K_, transpose_b=True)
        if memory_attention_bias is not None:
            logits += memory_attention_bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        weights = tf.layers.dropout(weights,
                                    rate=self._hparams.dropout_rate,
                                    training=is_train_mode(mode))
        outputs = tf.matmul(weights, V_)

        outputs = self._combine_heads(outputs)
        outputs = self.O_dense(outputs)
        # (batch_size, length_query, output_dim)

        if not self._built:
            self._add_internal_trainable_variables()
            self._built = True

        return outputs
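
# Minimal single-head sketch of the core attention math above (no caching, no
# head splitting). The module scales Q before the matmul; scaling the logits
# afterwards, as below, is mathematically equivalent.
def _scaled_dot_product_attention_sketch(q, k, v, bias=None):
    logits = tf.matmul(q, k, transpose_b=True)
    logits *= tf.cast(tf.shape(q)[-1], tf.float32) ** -0.5
    if bias is not None:
        logits += bias
    return tf.matmul(tf.nn.softmax(logits), v)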
def _build(self, inputs, memory, sequence_length, memory_sequence_length,
           encoder_output, adjs=None, mode=None):
    """Encodes the inputs.

    Args:
        inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`,
            containing the embedding of input sequences. Note that
            the embedding dimension `dim` must equal "dim" in
            :attr:`hparams`. The input embedding is typically an
            aggregation of word embedding and position embedding.
        memory: A 3D Tensor of shape `[batch_size, memory_max_time, dim]`,
            containing the embedding of memory sequences. Note that
            the embedding dimension `dim` must equal "dim" in
            :attr:`hparams`. The input embedding is typically an
            aggregation of word embedding and position embedding.
        sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens
            beyond respective sequence lengths are masked out
            automatically.
        memory_sequence_length: A 1D Tensor of shape `[batch_size]`.
            Memory tokens beyond respective sequence lengths are masked
            out automatically.
        encoder_output (bool): If `True`, returns encoder-like embeddings.
            If `False`, returns a
            `CrossGraphTransformerFixedLengthDecoderOutput`.
        adjs: A 3D Tensor of shape `[batch_size, max_time, max_time]`,
            containing the adjacency matrices of input sequences.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
            including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
            dropout.
            If `None` (default), :func:`texar.tf.global_mode` is used.

    Returns:
        If :attr:`encoder_output` is `True`, a Tensor of shape
        `[batch_size, max_time, dim]` containing the encoded vectors;
        otherwise a `CrossGraphTransformerFixedLengthDecoderOutput`.
    """
    # Get adjacency masks from adjs
    if self._hparams.use_adj:
        adj_masks = 1 - tf.cast(tf.equal(adjs, 0), dtype=tf.float32)
    else:
        adj_masks = None

    # Build the bias that masks out padded positions in the inputs
    inputs_padding = 1 - tf.sequence_mask(
        sequence_length, tf.shape(inputs)[1], dtype=tf.float32)
    if self._hparams.use_bert_config:
        ignore_padding = attn.attention_bias_ignore_padding(
            inputs_padding, bias_value=-1e4)
    else:
        ignore_padding = attn.attention_bias_ignore_padding(inputs_padding)
    encoder_self_attention_bias = ignore_padding

    input_embedding = inputs  # shape (batch_size, max_time, dim)

    if self._hparams.use_bert_config:
        x = layers.layer_normalize(input_embedding)
        x = tf.layers.dropout(x,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))
    else:
        x = tf.layers.dropout(input_embedding,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))

    # Just to keep consistent with BERT, actually makes no difference
    if self._hparams.use_bert_config:
        pad_remover = None
    else:
        pad_remover = utils.transformer_utils.PadRemover(inputs_padding)

    for i in range(self._hparams.num_blocks):
        with tf.variable_scope("layer_{}".format(i)):
            graph_multihead_attention = \
                self.graph_multihead_attention_list[i]

            # trivial difference between BERT and original Transformer
            if self._hparams.use_bert_config:
                _queries_input = x
            else:
                _queries_input = layers.layer_normalize(x)

            attention_output = graph_multihead_attention(
                queries=_queries_input,
                memory=memory,
                adj_masks=adj_masks,
                memory_attention_bias=encoder_self_attention_bias,
                mode=mode,
                use_adj=self._hparams.use_adj)
            attention_output = tf.layers.dropout(
                attention_output,
                rate=self._hparams.residual_dropout,
                training=is_train_mode(mode),
            )
            # attention_output: weighted sum of V of memory with weights
            # determined by querying keys of memory
            x = x + attention_output

            with tf.variable_scope('output'):
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)
                    y = x
                else:
                    y = layers.layer_normalize(x)

            poswise_network = self.poswise_networks[i]
            with tf.variable_scope(poswise_network.variable_scope):
                original_shape = shape_list(y)
                y = tf.reshape(y, [-1, self._hparams.dim])
                if pad_remover:
                    y = tf.expand_dims(pad_remover.remove(y), axis=0)
                    # [1, batch_size*seq_length, hidden_dim]
                layer_output = poswise_network(y, mode=mode)
                sub_output = tf.layers.dropout(
                    layer_output,
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode))
                if pad_remover:
                    sub_output = tf.reshape(
                        pad_remover.restore(tf.squeeze(sub_output, axis=0)),
                        original_shape)
                else:
                    sub_output = tf.reshape(sub_output, original_shape)

                x = x + sub_output
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)

    if not self._hparams.use_bert_config:
        x = layers.layer_normalize(x)

    if not self._built:
        self._add_internal_trainable_variables()
        self._built = True

    if encoder_output:
        return x

    logits = self._output_layer(x)
    sample_ids = tf.to_int32(tf.argmax(logits, axis=-1))
    probs = ''
    # probs = GumbelSoftmax(self._tau, logits=logits).sample()
    # probs = tf.nn.softmax(logits / self._tau)  # vanilla softmax

    rets = CrossGraphTransformerFixedLengthDecoderOutput(
        logits=logits, sample_id=sample_ids, probs=probs)

    return rets
def _build(self, inputs, sequence_length, mode=None):
    """Encodes the inputs.

    Args:
        inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`,
            containing the embedding of input sequences. Note that
            the embedding dimension `dim` must equal "dim" in
            :attr:`hparams`. The input embedding is typically an
            aggregation of word embedding and position embedding.
        sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens
            beyond respective sequence lengths are masked out
            automatically.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
            including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
            dropout.
            If `None` (default), :func:`texar.tf.global_mode` is used.

    Returns:
        A Tensor of shape `[batch_size, max_time, dim]` containing the
        encoded vectors.
    """
    # Build the bias that masks out padded positions in the inputs
    inputs_padding = 1 - tf.sequence_mask(
        sequence_length, tf.shape(inputs)[1], dtype=tf.float32)
    if self._hparams.use_bert_config:
        ignore_padding = attn.attention_bias_ignore_padding(
            inputs_padding, bias_value=-1e4)
    else:
        ignore_padding = attn.attention_bias_ignore_padding(
            inputs_padding)
    encoder_self_attention_bias = ignore_padding

    input_embedding = inputs

    if self._hparams.use_bert_config:
        x = layers.layer_normalize(input_embedding)
        x = tf.layers.dropout(x,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))
    else:
        x = tf.layers.dropout(input_embedding,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))

    # Just to keep consistent with BERT, actually makes no difference
    if self._hparams.use_bert_config:
        pad_remover = None
    else:
        pad_remover = transformer_utils.PadRemover(inputs_padding)

    for i in range(self._hparams.num_blocks):
        with tf.variable_scope("layer_{}".format(i)):
            multihead_attention = self.multihead_attention_list[i]

            # trivial difference between BERT and original Transformer
            if self._hparams.use_bert_config:
                _queries_input = x
            else:
                _queries_input = layers.layer_normalize(x)

            attention_output = multihead_attention(
                queries=_queries_input,
                memory=_queries_input,
                memory_attention_bias=encoder_self_attention_bias,
                mode=mode,
            )
            attention_output = tf.layers.dropout(
                attention_output,
                rate=self._hparams.residual_dropout,
                training=is_train_mode(mode),
            )
            x = x + attention_output

            with tf.variable_scope('output'):
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)
                    y = x
                else:
                    y = layers.layer_normalize(x)

            poswise_network = self.poswise_networks[i]
            with tf.variable_scope(poswise_network.variable_scope):
                original_shape = shape_list(y)
                y = tf.reshape(y, [-1, self._hparams.dim])
                if pad_remover:
                    y = tf.expand_dims(pad_remover.remove(y), axis=0)
                    # [1, batch_size*seq_length, hidden_dim]
                layer_output = poswise_network(y, mode=mode)
                sub_output = tf.layers.dropout(
                    layer_output,
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode)
                )
                if pad_remover:
                    sub_output = tf.reshape(
                        pad_remover.restore(tf.squeeze(sub_output, axis=0)),
                        original_shape)
                else:
                    sub_output = tf.reshape(sub_output, original_shape)

                x = x + sub_output
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)

    if not self._hparams.use_bert_config:
        x = layers.layer_normalize(x)

    if not self._built:
        self._add_internal_trainable_variables()
        self._built = True

    return x
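
# Illustrative sketch of the padding bias built above: positions past each
# sequence length receive a large negative bias so the softmax ignores them.
# (The exact bias value inside `attention_bias_ignore_padding` is assumed
# here; the BERT branch above uses -1e4.)
def _padding_bias_demo():
    lengths = tf.constant([2, 4])
    padding = 1 - tf.sequence_mask(lengths, 4, dtype=tf.float32)  # 1 at pads
    return padding * -1e9  # added to attention logits before softmax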
def _build(self, token_ids, segment_ids=None, input_mask=None, mode=None):
    r"""Feeds the inputs through the network and makes classification.

    Args:
        token_ids: Shape `[batch_size, max_time]`.
        segment_ids: Shape `[batch_size, max_time]`.
        input_mask: Float tensor of shape `[batch_size, max_time]`. Note
            that positions with value 1 are masked out.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
            `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle dropout. If
            `None` (default), :func:`texar.tf.global_mode` is used.

    Returns:
        A tuple `(logits, preds)`, containing the logits over classes and
        the predictions, respectively.

        - If ``clas_strategy`` is ``cls_time`` or ``all_time``:

            - If ``num_classes`` == 1, ``logits`` and ``pred`` are both of
              shape ``[batch_size]``.
            - If ``num_classes`` > 1, ``logits`` is of shape
              ``[batch_size, num_classes]`` and ``pred`` is of shape
              ``[batch_size]``.

        - If ``clas_strategy`` is ``time_wise``:

            - If ``num_classes`` == 1, ``logits`` and ``pred`` are both of
              shape ``[batch_size, max_time]``.
            - If ``num_classes`` > 1, ``logits`` is of shape
              ``[batch_size, max_time, num_classes]`` and ``pred`` is of
              shape ``[batch_size, max_time]``.
    """
    is_training = is_train_mode(mode)
    output, _ = self._encoder(token_ids, segment_ids,
                              input_mask=input_mask, mode=mode)

    strategy = self._hparams.clas_strategy
    if strategy == "time_wise":
        summary = output
    elif strategy == "cls_time":
        summary = output[:, -1]
    elif strategy == "all_time":
        length_diff = self._hparams.max_seq_len - tf.shape(token_ids)[1]
        summary_input = tf.pad(output,
                               paddings=[[0, 0], [0, length_diff], [0, 0]])
        summary_input_dim = \
            self._encoder.output_size * self._hparams.max_seq_len
        summary = tf.reshape(summary_input, shape=[-1, summary_input_dim])
    else:
        raise ValueError("Unknown classification strategy: {}"
                         .format(strategy))

    if self._hparams.use_projection:
        summary = tf.tanh(self.projection(summary))
    # summary: (batch_size, hidden_dim)
    summary = self._dropout_layer(summary, training=is_training)

    logits = (self._logit_layer(summary)
              if self._logit_layer is not None else summary)

    # Compute predictions
    num_classes = self._hparams.num_classes
    is_binary = num_classes == 1 or (num_classes <= 0 and
                                     logits.shape[-1] == 1)

    if strategy == "time_wise":
        if is_binary:
            pred = tf.squeeze(tf.greater(logits, 0), -1)
            logits = tf.squeeze(logits, -1)
        else:
            pred = tf.argmax(logits, axis=-1)
    else:
        if is_binary:
            pred = tf.greater(logits, 0)
            logits = tf.reshape(logits, [-1])
        else:
            pred = tf.argmax(logits, axis=-1)
        pred = tf.reshape(pred, [-1])
    pred = tf.to_int64(pred)

    if not self._built:
        self._add_internal_trainable_variables()
        if self._logit_layer:
            self._add_trainable_variable(
                self._logit_layer.trainable_variables)
        self._built = True

    return logits, pred
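
# Tiny runnable illustration of the binary-prediction rule above: with
# `num_classes` == 1, the logit is simply thresholded at zero.
def _binary_pred_demo():
    logits = tf.constant([[0.3], [-1.2]])         # [batch_size, 1]
    pred = tf.squeeze(tf.greater(logits, 0), -1)  # [True, False]
    return tf.squeeze(logits, -1), tf.to_int64(pred)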
def _execute(self, word_embed, segment_ids=None,  # noqa: C901
             input_mask=None, memory=None, permute_mask=None,
             target_mapping=None, bi_data=False, clamp_len=None,
             cache_len=0, same_length=False, attn_type='bi',
             two_stream=False, mode=None):
    r"""Computes XLNet representations for the input.

    This layer exists because :class:`XLNetDecoder` computes embeddings
    in the decoder helper. `word_embed` has shape
    `[batch_size, max_time, word_embed_dim]`.
    Please refer to :meth:`_build` for the detailed information of other
    arguments.
    """
    # word_embed: [max_time, batch_size, word_embed_dim]
    word_embed = tf.transpose(word_embed, perm=[1, 0, 2])
    # segment_ids: [max_time, batch_size]
    if segment_ids is not None:
        segment_ids = tf.transpose(segment_ids, perm=[1, 0])
    # input_mask: [max_time, batch_size]
    if input_mask is not None:
        input_mask = tf.transpose(input_mask, perm=[1, 0])
    # memory: A list of length num_layers,
    # each tensor of shape [mem_len, batch_size, hidden_dim]
    if memory is not None:
        memory = [tf.transpose(m, perm=[1, 0, 2]) for m in memory]
    # permute_mask: [max_time, max_time, batch_size]
    if permute_mask is not None:
        permute_mask = tf.transpose(permute_mask, perm=[1, 2, 0])
    # target_mapping: [num_targets, max_time, batch_size]
    if target_mapping is not None:
        target_mapping = tf.transpose(target_mapping, perm=[1, 2, 0])

    max_time = tf.shape(word_embed)[0]
    batch_size = tf.shape(word_embed)[1]
    mem_len = tf.shape(memory[0])[0] if memory is not None else 0
    tot_len = max_time + mem_len
    reuse_len = self._hparams.reuse_len

    is_training = is_train_mode(mode)

    # Attention mask
    # causal attention mask
    if attn_type == 'uni':
        attn_mask = self._create_mask(max_time, mem_len, tf.float32,
                                      same_length)
        attn_mask = attn_mask[:, :, None, None]
    elif attn_type == 'bi':
        attn_mask = None
    else:
        raise ValueError(
            'Unsupported attention type: {}'.format(attn_type))
    # data mask: input mask & perm mask
    if input_mask is not None and permute_mask is not None:
        data_mask = input_mask[None] + permute_mask
    elif input_mask is not None and permute_mask is None:
        data_mask = input_mask[None]
    elif input_mask is None and permute_mask is not None:
        data_mask = permute_mask
    else:
        data_mask = None

    if data_mask is not None:
        # all mems can be attended to
        mems_mask = tf.zeros([tf.shape(data_mask)[0], mem_len, batch_size],
                             dtype=tf.float32)
        data_mask = tf.concat([mems_mask, data_mask], 1)
        if attn_mask is None:
            attn_mask = data_mask[:, :, :, None]
        else:
            attn_mask += data_mask[:, :, :, None]

    if attn_mask is not None:
        attn_mask = tf.cast(attn_mask > 0, dtype=tf.float32)

    if attn_mask is not None:
        non_tgt_mask = -tf.eye(max_time, dtype=tf.float32)
        non_tgt_mask = tf.concat([
            tf.zeros([max_time, mem_len], dtype=tf.float32), non_tgt_mask
        ], axis=-1)
        non_tgt_mask = tf.cast(
            (attn_mask + non_tgt_mask[:, :, None, None]) > 0,
            dtype=tf.float32)
    else:
        non_tgt_mask = None

    # Segment embedding
    if segment_ids is not None:
        mem_pad = tf.zeros([mem_len, batch_size], dtype=tf.int32)
        cat_ids = tf.concat([mem_pad, segment_ids], 0)
        segment_matrix = tf.cast(
            tf.logical_not(tf.equal(segment_ids[:, None],
                                    cat_ids[None, :])), tf.int32)
        segment_matrix = tf.one_hot(segment_matrix, 2, dtype=tf.float32)
    else:
        segment_matrix = None

    # Position embedding
    pos_embed = self.pos_embed(batch_size, max_time, tot_len, clamp_len,
                               attn_type, bi_data)
    pos_embed = self.dropout(pos_embed, training=is_training)

    states_h = self.dropout(word_embed, training=is_training)
    if two_stream:
        if target_mapping is not None:
            word_embed_q = tf.tile(
                self.mask_embed,
                [tf.shape(target_mapping)[0], batch_size, 1])
        else:
            word_embed_q = word_embed
        states_g = self.dropout(word_embed_q, training=is_training)
    else:
        states_g = None

    new_memory = []
    num_layers = self._hparams.num_layers
    for i in range(num_layers):
        cur_memory = memory[i] if memory is not None else None
        if cache_len > 0:
            new_memory.append(
                self._cache_mem(states_h, cur_memory, cache_len,
                                reuse_len))
        states_h, states_g = self.attn_layers[i](
            states_h=states_h, pos_embed=pos_embed, states_g=states_g,
            segment_mat=segment_matrix, attn_mask_h=non_tgt_mask,
            attn_mask_g=attn_mask, target_mapping=None, memory=cur_memory,
            mode=mode)
        ff_layer = self.ff_layers[i]
        states_h = ff_layer(states_h, mode=mode)
        if states_g is not None:
            states_g = ff_layer(states_g, mode=mode)

    output = self.dropout(states_h if states_g is None else states_g,
                          training=is_training)

    # Now output: [max_time, batch_size, hidden_dim]
    # new_memory: None or a list of length num_layers,
    # each tensor of shape [cache_len, batch_size, hidden_dim]
    output = tf.transpose(output, perm=[1, 0, 2])
    if new_memory:
        new_memory = [tf.transpose(m, perm=[1, 0, 2]) for m in new_memory]

    if not self._built:
        self._add_internal_trainable_variables()
        self._built = True

        if self.pretrained_model_dir:
            self.init_pretrained_weights(self.variable_scope.name)

    if cache_len == 0:
        return output, None

    return output, new_memory
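
# Runnable sketch (not called anywhere) of the segment-matrix rule above, in
# a 1-D setting with no memory: a pair of positions gets id 0 when they share
# a segment and 1 otherwise, then the ids are one-hot encoded.
def _segment_matrix_demo():
    seg = tf.constant([0, 0, 1])
    diff = tf.cast(
        tf.logical_not(tf.equal(seg[:, None], seg[None, :])), tf.int32)
    return tf.one_hot(diff, 2, dtype=tf.float32)  # shape [3, 3, 2]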