Example No. 1
    def __init__(self, config):
        # Set attributes
        self.config = config
        self.source_vocab_size = config.source_vocab_sizes[0]
        self.target_vocab_size = config.target_vocab_size
        self.name = 'transformer'
        self.int_dtype = tf.int32
        self.float_dtype = tf.float32

        # Placeholders
        self.inputs = model_inputs.ModelInputs(config)

        # Convert from time-major to batch-major, handle factors
        self.source_ids, \
            self.source_mask, \
            self.target_ids_in, \
            self.target_ids_out, \
            self.target_mask = self._convert_inputs(self.inputs)

        self.training = self.inputs.training

        # Build the common parts of the graph.
        with tf.name_scope('{:s}_loss'.format(self.name)):
            # (Re-)generate the computational graph
            self.dec_vocab_size = self._build_graph()

        # Build the training-specific parts of the graph.

        with tf.name_scope('{:s}_loss'.format(self.name)):
            # Encode source sequences
            with tf.name_scope('{:s}_encode'.format(self.name)):
                enc_output, cross_attn_mask = self.enc.encode(
                    self.source_ids, self.source_mask)
            # Decode into target sequences
            with tf.name_scope('{:s}_decode'.format(self.name)):
                logits = self.dec.decode_at_train(self.target_ids_in,
                                                  enc_output,
                                                  cross_attn_mask)
            # Instantiate loss layer(s)
            loss_layer = MaskedCrossEntropy(self.dec_vocab_size,
                                            self.config.label_smoothing,
                                            self.int_dtype,
                                            self.float_dtype,
                                            time_major=False,
                                            name='loss_layer')
            # Calculate loss
            masked_loss, sentence_loss, batch_loss = \
                loss_layer.forward(logits, self.target_ids_out, self.target_mask, self.training)

            sent_lens = tf.reduce_sum(self.target_mask, axis=1, keepdims=False)
            self._loss_per_sentence = sentence_loss * sent_lens
            self._loss = tf.reduce_mean(self._loss_per_sentence, keepdims=False)
        
        self.sampling_utils = SamplingUtils(config)
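
The final lines of this constructor undo the per-token normalisation: sentence_loss is taken here to be the mean per-token cross-entropy over each sentence's real tokens, so multiplying by the sentence length recovers a per-sentence loss sum, and the scalar training loss is the batch mean of those sums. A minimal NumPy sketch of that arithmetic, with made-up numbers:

import numpy as np

# Toy batch: 2 sentences, max length 4; the mask marks real tokens (1) vs padding (0).
target_mask = np.array([[1., 1., 1., 1.],
                        [1., 1., 0., 0.]])
# Per-token negative log-likelihoods after masking (padding positions already zeroed).
masked_loss = np.array([[0.5, 0.2, 0.1, 0.4],
                        [0.3, 0.6, 0.0, 0.0]])

# Per-sentence mean over real tokens (what sentence_loss is assumed to hold here).
sent_lens = target_mask.sum(axis=1)                    # [4., 2.]
sentence_loss = masked_loss.sum(axis=1) / sent_lens    # mean NLL per token

# The constructor rescales back to a per-sentence *sum* before averaging over the batch.
loss_per_sentence = sentence_loss * sent_lens          # total NLL per sentence
loss = loss_per_sentence.mean()                        # scalar training loss

print(loss_per_sentence)   # -> [1.2 0.9]
print(loss)                # -> 1.05
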
Example No. 2
    def __init__(self, config):
        self.inputs = model_inputs.ModelInputs(config)

        # Dropout functions for words.
        # These probabilistically zero-out all embedding values for individual
        # words.
        dropout_source, dropout_target = None, None
        if config.rnn_use_dropout and config.rnn_dropout_source > 0.0:

            def dropout_source(x):
                return tf.layers.dropout(x,
                                         noise_shape=(tf.shape(x)[0],
                                                      tf.shape(x)[1], 1),
                                         rate=config.rnn_dropout_source,
                                         training=self.inputs.training)

        if config.rnn_use_dropout and config.rnn_dropout_target > 0.0:

            def dropout_target(y):
                return tf.layers.dropout(y,
                                         noise_shape=(tf.shape(y)[0],
                                                      tf.shape(y)[1], 1),
                                         rate=config.rnn_dropout_target,
                                         training=self.inputs.training)

        # Dropout functions for use within FF, GRU, and attention layers.
        # We use Gal and Ghahramani (2016)-style dropout, so these functions
        # will be used to create 2D dropout masks that are reused at every
        # timestep.
        dropout_embedding, dropout_hidden = None, None
        if config.rnn_use_dropout and config.rnn_dropout_embedding > 0.0:

            def dropout_embedding(e):
                return tf.layers.dropout(e,
                                         noise_shape=tf.shape(e),
                                         rate=config.rnn_dropout_embedding,
                                         training=self.inputs.training)

        if config.rnn_use_dropout and config.rnn_dropout_hidden > 0.0:

            def dropout_hidden(h):
                return tf.layers.dropout(h,
                                         noise_shape=tf.shape(h),
                                         rate=config.rnn_dropout_hidden,
                                         training=self.inputs.training)

        batch_size = tf.shape(self.inputs.x)[-1]  # dynamic value

        with tf.variable_scope("encoder"):
            self.encoder = Encoder(config, batch_size, dropout_source,
                                   dropout_embedding, dropout_hidden)
            ctx, embs = self.encoder.get_context(self.inputs.x,
                                                 self.inputs.x_mask)

        with tf.variable_scope("decoder"):
            if config.tie_encoder_decoder_embeddings:
                tied_embeddings = self.encoder.emb_layer
            else:
                tied_embeddings = None
            self.decoder = Decoder(config, ctx, embs, self.inputs.x_mask,
                                   dropout_target, dropout_embedding,
                                   dropout_hidden, tied_embeddings)
            self.logits = self.decoder.score(self.inputs.y)

        with tf.variable_scope("loss"):
            self.loss_layer = layers.Masked_cross_entropy_loss(
                self.inputs.y,
                self.inputs.y_mask,
                config.label_smoothing,
                training=self.inputs.training)
            self._loss_per_sentence = self.loss_layer.forward(self.logits)
            self._loss = tf.reduce_mean(self._loss_per_sentence,
                                        keepdims=False)

        self.sampling_utils = SamplingUtils(config)
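
The noise_shape arguments are the key detail in these dropout helpers: the word-level functions fix the last dimension of the noise shape to 1, so a single keep/drop decision is broadcast across the whole embedding vector and dropout removes entire words, while the embedding/hidden functions draw a mask over the full tensor shape. A rough NumPy illustration of the difference; the time-major (seq_len, batch, emb_dim) layout of the embedded input is an assumption:

import numpy as np

rng = np.random.default_rng(0)
rate = 0.3
seq_len, batch, emb_dim = 5, 2, 4
x = rng.normal(size=(seq_len, batch, emb_dim))   # embedded words, assumed time-major

# noise_shape=(seq_len, batch, 1): one keep/drop draw per word position, broadcast
# across the embedding dimension, so a dropped word loses its entire vector.
keep_word = rng.random(size=(seq_len, batch, 1)) >= rate
word_dropout = np.where(keep_word, x / (1.0 - rate), 0.0)

# Ordinary element-wise dropout for comparison: individual embedding values are zeroed
# independently, so a word is almost never removed completely.
keep_elem = rng.random(size=x.shape) >= rate
elem_dropout = np.where(keep_elem, x / (1.0 - rate), 0.0)
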
Example No. 3
    def __init__(self, config):
        # Set attributes
        self.config = config
        self.source_vocab_size = config.source_vocab_sizes[0]
        self.target_vocab_size = config.target_vocab_size
        self.name = 'transformer'

        # Placeholders
        self.inputs = model_inputs.ModelInputs(config)

        # Convert from time-major to batch-major, handle factors
        self.source_ids, \
            self.source_mask, \
            self.target_ids_in, \
            self.target_ids_out, \
            self.target_mask = self._convert_inputs(self.inputs)

        self.training = self.inputs.training
        self.scores = self.inputs.scores
        self.index = self.inputs.index

        # Build the common parts of the graph.
        with tf.name_scope('{:s}_loss'.format(self.name)):
            # (Re-)generate the computational graph
            self.dec_vocab_size = self._build_graph()

        # Build the training-specific parts of the graph.

        with tf.name_scope('{:s}_loss'.format(self.name)):
            # Encode source sequences
            with tf.name_scope('{:s}_encode'.format(self.name)):
                enc_output, cross_attn_mask = self.enc.encode(
                    self.source_ids, self.source_mask)
            # Decode into target sequences
            with tf.name_scope('{:s}_decode'.format(self.name)):
                logits = self.dec.decode_at_train(self.target_ids_in,
                                                  enc_output, cross_attn_mask)
            # Instantiate loss layer(s)
            loss_layer = MaskedCrossEntropy(self.dec_vocab_size,
                                            self.config.label_smoothing,
                                            INT_DTYPE,
                                            FLOAT_DTYPE,
                                            time_major=False,
                                            name='loss_layer')
            # Calculate loss
            masked_loss, sentence_loss, batch_loss = \
                loss_layer.forward(logits, self.target_ids_out, self.target_mask, self.training)
            if self.config.print_per_token_pro:
                # e**(-(-log(probability))) = probability, i.e. recover the
                # per-token probability from the per-token loss
                self._print_pro = tf.math.exp(-masked_loss)

            sent_lens = tf.reduce_sum(self.target_mask, axis=1, keepdims=False)
            self._loss_per_sentence = sentence_loss * sent_lens
            self._loss = tf.reduce_mean(self._loss_per_sentence,
                                        keepdims=False)

            # calculate expected risk
            if self.config.loss_function == 'MRT':
                # self._loss_per_sentence is the negative log probability of the
                # output sentence; each element is the loss of one sample pair.
                self._risk = mru.mrt_cost(self._loss_per_sentence, self.scores,
                                          self.index, self.config)

            self.sampling_utils = SamplingUtils(config)
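
Example No. 3 extends Example No. 1 with two optional outputs: per-token probabilities recovered by exponentiating the negated per-token loss, and an expected-risk objective for minimum risk training (MRT). The NumPy sketch below illustrates both ideas; the risk computation is the generic MRT formulation (Q proportional to P**alpha over the sampled candidates), not a description of what mru.mrt_cost actually implements, and alpha is a made-up smoothing value:

import numpy as np

# masked_loss holds per-token negative log-probabilities (zero at padded positions),
# so exponentiating the negation recovers the model's probability for each real token
# (padded positions simply come out as exp(0) = 1).
masked_loss = np.array([[0.5, 0.2, 0.1, 0.4],
                        [0.3, 0.6, 0.0, 0.0]])
per_token_prob = np.exp(-masked_loss)

# Generic expected-risk sketch over sampled candidates of one source sentence:
# Q(y|x) is proportional to P(y|x)**alpha restricted to the sampled set, and the
# risk is the expectation of the per-candidate evaluation score under Q.
loss_per_sentence = np.array([2.0, 2.5, 3.0])   # -log P(candidate | source)
scores = np.array([0.9, 0.4, 0.1])              # per-candidate cost, e.g. 1 - sentence BLEU
alpha = 0.005                                   # hypothetical smoothing exponent

q = np.exp(-alpha * loss_per_sentence)
q = q / q.sum()
risk = np.sum(q * scores)                       # expected risk to be minimised
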