def __init__(self, config):
    """Build the transformer graph together with its training loss.

    Creates the model inputs, builds the shared part of the graph via
    ``self._build_graph()``, then chains encoder, decoder and a masked
    cross-entropy layer to define ``self._loss_per_sentence`` and
    ``self._loss``.

    Args:
        config: Model configuration. This method reads
            ``source_vocab_sizes``, ``target_vocab_size`` and
            ``label_smoothing`` from it, and passes the whole object on to
            ``model_inputs.ModelInputs`` and ``SamplingUtils``.
    """
    # Basic attributes.
    self.config = config
    self.source_vocab_size = config.source_vocab_sizes[0]
    self.target_vocab_size = config.target_vocab_size
    self.name = 'transformer'
    self.int_dtype = tf.int32
    self.float_dtype = tf.float32

    # Placeholders.
    self.inputs = model_inputs.ModelInputs(config)

    # Convert from time-major to batch-major, handle factors.
    converted = self._convert_inputs(self.inputs)
    (self.source_ids,
     self.source_mask,
     self.target_ids_in,
     self.target_ids_out,
     self.target_mask) = converted

    self.training = self.inputs.training

    # Common parts of the graph: (re-)generate the computational graph.
    with tf.name_scope('{:s}_loss'.format(self.name)):
        self.dec_vocab_size = self._build_graph()

    # Training-specific parts of the graph.
    # NOTE(review): self.enc / self.dec are not assigned in this method;
    # presumably _build_graph() creates them -- confirm.
    with tf.name_scope('{:s}_loss'.format(self.name)):
        # Encode the source sequences.
        with tf.name_scope('{:s}_encode'.format(self.name)):
            encoder_output, cross_attention_mask = self.enc.encode(
                self.source_ids, self.source_mask)

        # Decode into target sequences.
        with tf.name_scope('{:s}_decode'.format(self.name)):
            logits = self.dec.decode_at_train(
                self.target_ids_in, encoder_output, cross_attention_mask)

        # Loss layer.
        loss_layer = MaskedCrossEntropy(
            self.dec_vocab_size,
            self.config.label_smoothing,
            self.int_dtype,
            self.float_dtype,
            time_major=False,
            name='loss_layer')

        # Only the per-sentence loss is needed here; the first and third
        # outputs of forward() are not used by this method.
        _, sentence_loss, _ = loss_layer.forward(
            logits, self.target_ids_out, self.target_mask, self.training)

        # Scale each sentence loss by the sum of its target-mask row
        # (presumably undoing a length normalisation done inside the
        # loss layer -- confirm against MaskedCrossEntropy).
        target_lengths = tf.reduce_sum(
            self.target_mask, axis=1, keepdims=False)
        self._loss_per_sentence = sentence_loss * target_lengths
        self._loss = tf.reduce_mean(
            self._loss_per_sentence, keepdims=False)

    self.sampling_utils = SamplingUtils(config)
def __init__(self, config):
    """Build the encoder/decoder graph and its training loss.

    Creates the model inputs, prepares optional dropout callables, then
    builds the encoder, decoder and loss under the variable scopes
    ``"encoder"``, ``"decoder"`` and ``"loss"``.

    Args:
        config: Model configuration. This method reads
            ``rnn_use_dropout``, the four ``rnn_dropout_*`` rates,
            ``tie_encoder_decoder_embeddings`` and ``label_smoothing``,
            and passes the whole object on to ``ModelInputs``,
            ``Encoder``, ``Decoder`` and ``SamplingUtils``.
    """
    self.inputs = model_inputs.ModelInputs(config)

    def _drop_words(tensor, rate):
        # Noise shape keeps the first two dimensions and collapses the
        # third to 1, so one mask value is shared along the last axis.
        return tf.layers.dropout(
            tensor,
            noise_shape=(tf.shape(tensor)[0], tf.shape(tensor)[1], 1),
            rate=rate,
            training=self.inputs.training)

    def _drop_units(tensor, rate):
        # Noise shape equals the tensor's own shape.
        return tf.layers.dropout(
            tensor,
            noise_shape=tf.shape(tensor),
            rate=rate,
            training=self.inputs.training)

    # Dropout functions for words.
    # These probabilistically zero-out all embedding values for individual
    # words. Each stays None unless dropout is enabled with a positive
    # rate; rates are looked up on `config` when the callable is invoked.
    dropout_source = None
    dropout_target = None
    if config.rnn_use_dropout and config.rnn_dropout_source > 0.0:
        def dropout_source(x):
            return _drop_words(x, config.rnn_dropout_source)
    if config.rnn_use_dropout and config.rnn_dropout_target > 0.0:
        def dropout_target(y):
            return _drop_words(y, config.rnn_dropout_target)

    # Dropout functions for use within FF, GRU, and attention layers.
    # We use Gal and Ghahramani (2016)-style dropout, so these functions
    # will be used to create 2D dropout masks that are reused at every
    # timestep.
    dropout_embedding = None
    dropout_hidden = None
    if config.rnn_use_dropout and config.rnn_dropout_embedding > 0.0:
        def dropout_embedding(e):
            return _drop_units(e, config.rnn_dropout_embedding)
    if config.rnn_use_dropout and config.rnn_dropout_hidden > 0.0:
        def dropout_hidden(h):
            return _drop_units(h, config.rnn_dropout_hidden)

    # Dynamic value: last dimension of the source input tensor.
    batch_size = tf.shape(self.inputs.x)[-1]

    with tf.variable_scope("encoder"):
        self.encoder = Encoder(
            config, batch_size, dropout_source, dropout_embedding,
            dropout_hidden)
        context, source_embeddings = self.encoder.get_context(
            self.inputs.x, self.inputs.x_mask)

    with tf.variable_scope("decoder"):
        # Hand the encoder's embedding layer to the decoder only when
        # tying is requested.
        tied_embeddings = (
            self.encoder.emb_layer
            if config.tie_encoder_decoder_embeddings
            else None
        )
        self.decoder = Decoder(
            config, context, source_embeddings, self.inputs.x_mask,
            dropout_target, dropout_embedding, dropout_hidden,
            tied_embeddings)
        self.logits = self.decoder.score(self.inputs.y)

    with tf.variable_scope("loss"):
        self.loss_layer = layers.Masked_cross_entropy_loss(
            self.inputs.y,
            self.inputs.y_mask,
            config.label_smoothing,
            training=self.inputs.training)
        self._loss_per_sentence = self.loss_layer.forward(self.logits)
        self._loss = tf.reduce_mean(
            self._loss_per_sentence, keepdims=False)

    self.sampling_utils = SamplingUtils(config)
def __init__(self, config):
    """Build the transformer graph, its training loss and optional extras.

    Creates the model inputs, builds the shared part of the graph via
    ``self._build_graph()``, then chains encoder, decoder and a masked
    cross-entropy layer to define ``self._loss_per_sentence`` and
    ``self._loss``. Depending on the configuration it additionally
    defines ``self._print_pro`` and ``self._risk``.

    Args:
        config: Model configuration. This method reads
            ``source_vocab_sizes``, ``target_vocab_size``,
            ``label_smoothing``, ``print_per_token_pro`` and
            ``loss_function`` from it, and passes the whole object on to
            ``model_inputs.ModelInputs``, ``mru.mrt_cost`` and
            ``SamplingUtils``.
    """
    # Basic attributes.
    self.config = config
    self.source_vocab_size = config.source_vocab_sizes[0]
    self.target_vocab_size = config.target_vocab_size
    self.name = 'transformer'

    # Placeholders.
    self.inputs = model_inputs.ModelInputs(config)

    # Convert from time-major to batch-major, handle factors.
    converted = self._convert_inputs(self.inputs)
    (self.source_ids,
     self.source_mask,
     self.target_ids_in,
     self.target_ids_out,
     self.target_mask) = converted

    self.training = self.inputs.training
    self.scores = self.inputs.scores
    self.index = self.inputs.index

    # Common parts of the graph: (re-)generate the computational graph.
    with tf.name_scope('{:s}_loss'.format(self.name)):
        self.dec_vocab_size = self._build_graph()

    # Training-specific parts of the graph.
    # NOTE(review): self.enc / self.dec are not assigned in this method;
    # presumably _build_graph() creates them -- confirm.
    with tf.name_scope('{:s}_loss'.format(self.name)):
        # Encode the source sequences.
        with tf.name_scope('{:s}_encode'.format(self.name)):
            encoder_output, cross_attention_mask = self.enc.encode(
                self.source_ids, self.source_mask)

        # Decode into target sequences.
        with tf.name_scope('{:s}_decode'.format(self.name)):
            logits = self.dec.decode_at_train(
                self.target_ids_in, encoder_output, cross_attention_mask)

        # Loss layer.
        loss_layer = MaskedCrossEntropy(
            self.dec_vocab_size,
            self.config.label_smoothing,
            INT_DTYPE,
            FLOAT_DTYPE,
            time_major=False,
            name='loss_layer')

        # The third output of forward() is not used by this method.
        masked_loss, sentence_loss, _ = loss_layer.forward(
            logits, self.target_ids_out, self.target_mask, self.training)

        if self.config.print_per_token_pro:
            # exp(-(-log p)) == p: turn the masked loss back into
            # probabilities.
            self._print_pro = tf.math.exp(-masked_loss)

        # Scale each sentence loss by the sum of its target-mask row
        # (presumably undoing a length normalisation done inside the
        # loss layer -- confirm against MaskedCrossEntropy).
        target_lengths = tf.reduce_sum(
            self.target_mask, axis=1, keepdims=False)
        self._loss_per_sentence = sentence_loss * target_lengths
        self._loss = tf.reduce_mean(
            self._loss_per_sentence, keepdims=False)

        # Expected risk, only for MRT training.
        if self.config.loss_function == 'MRT':
            # self._loss_per_sentence is the negative log probability of
            # the output sentence; each element is the loss of one
            # sample pair.
            self._risk = mru.mrt_cost(
                self._loss_per_sentence, self.scores, self.index,
                self.config)

    self.sampling_utils = SamplingUtils(config)