def __init__(self, config):
    """Build the training graph: inputs, encoder, decoder, and loss.

    Creates the model's placeholders via ModelInputs, constructs the
    encoder and decoder under their own variable scopes, scores the
    target sequence, and sets up the masked cross-entropy loss.
    """
    self.inputs = model_inputs.ModelInputs(config)

    use_dropout = config.rnn_use_dropout

    def word_dropout_fn(rate):
        # Dropout for words: probabilistically zeroes out ALL embedding
        # values of individual words, via a broadcastable
        # (time, batch, 1) noise shape.  Returns None when disabled.
        if not (use_dropout and rate > 0.0):
            return None

        def func(t):
            noise = (tf.shape(t)[0], tf.shape(t)[1], 1)
            return tf.layers.dropout(t, noise_shape=noise, rate=rate,
                                     training=self.inputs.training)
        return func

    def unit_dropout_fn(rate):
        # Dropout for use within FF, GRU, and attention layers.  Gal and
        # Ghahramani (2016)-style: these functions create 2D dropout
        # masks that are reused at every timestep.  Returns None when
        # disabled.
        if not (use_dropout and rate > 0.0):
            return None

        def func(t):
            return tf.layers.dropout(t, noise_shape=tf.shape(t),
                                     rate=rate,
                                     training=self.inputs.training)
        return func

    dropout_source = word_dropout_fn(config.rnn_dropout_source)
    dropout_target = word_dropout_fn(config.rnn_dropout_target)
    dropout_embedding = unit_dropout_fn(config.rnn_dropout_embedding)
    dropout_hidden = unit_dropout_fn(config.rnn_dropout_hidden)

    batch_size = tf.shape(self.inputs.x)[-1]  # dynamic value

    with tf.variable_scope("encoder"):
        self.encoder = Encoder(config, batch_size, dropout_source,
                               dropout_embedding, dropout_hidden)
        ctx, embs = self.encoder.get_context(self.inputs.x,
                                             self.inputs.x_mask)

    with tf.variable_scope("decoder"):
        # Optionally share the embedding matrix between encoder and
        # decoder.
        tied_embeddings = (self.encoder.emb_layer
                           if config.tie_encoder_decoder_embeddings
                           else None)
        self.decoder = Decoder(config, ctx, embs, self.inputs.x_mask,
                               dropout_target, dropout_embedding,
                               dropout_hidden, tied_embeddings)
        self.logits = self.decoder.score(self.inputs.y)

    with tf.variable_scope("loss"):
        self.loss_layer = layers.Masked_cross_entropy_loss(
            self.inputs.y, self.inputs.y_mask, config.label_smoothing,
            training=self.inputs.training)
        self._loss_per_sentence = self.loss_layer.forward(self.logits)
        self._loss = tf.reduce_mean(self._loss_per_sentence,
                                    keepdims=False)

    self.sampling_utils = SamplingUtils(config)
def __init__(self, config):
    """Build the training graph: inputs, encoder, decoder, loss, and
    regularization terms.

    Constructs the encoder/decoder under their own variable scopes,
    sets up the masked cross-entropy objective, and optionally adds
    L2 weight decay (config.decay_c) and a MAP-style L2 penalty
    towards prior parameter values (config.map_decay_c).
    """
    self.inputs = ModelInputs(config)

    # Dropout functions for words.
    # These probabilistically zero-out all embedding values for individual
    # words.
    dropout_source, dropout_target = None, None
    if config.use_dropout and config.dropout_source > 0.0:
        def dropout_source(x):
            # noise_shape (time, batch, 1) broadcasts the mask over the
            # embedding dimension, dropping whole words at once.
            return tf.layers.dropout(
                x, noise_shape=(tf.shape(x)[0], tf.shape(x)[1], 1),
                rate=config.dropout_source,
                training=self.inputs.training)
    if config.use_dropout and config.dropout_target > 0.0:
        def dropout_target(y):
            return tf.layers.dropout(
                y, noise_shape=(tf.shape(y)[0], tf.shape(y)[1], 1),
                rate=config.dropout_target,
                training=self.inputs.training)

    # Dropout functions for use within FF, GRU, and attention layers.
    # We use Gal and Ghahramani (2016)-style dropout, so these functions
    # will be used to create 2D dropout masks that are reused at every
    # timestep.
    dropout_embedding, dropout_hidden = None, None
    if config.use_dropout and config.dropout_embedding > 0.0:
        def dropout_embedding(e):
            return tf.layers.dropout(e, noise_shape=tf.shape(e),
                                     rate=config.dropout_embedding,
                                     training=self.inputs.training)
    if config.use_dropout and config.dropout_hidden > 0.0:
        def dropout_hidden(h):
            return tf.layers.dropout(h, noise_shape=tf.shape(h),
                                     rate=config.dropout_hidden,
                                     training=self.inputs.training)

    batch_size = tf.shape(self.inputs.x)[-1]  # dynamic value

    with tf.variable_scope("encoder"):
        self.encoder = Encoder(config, batch_size, dropout_source,
                               dropout_embedding, dropout_hidden)
        ctx = self.encoder.get_context(self.inputs.x, self.inputs.x_mask)

    with tf.variable_scope("decoder"):
        # Optionally share the embedding matrix between encoder and
        # decoder.
        if config.tie_encoder_decoder_embeddings:
            tied_embeddings = self.encoder.emb_layer
        else:
            tied_embeddings = None
        self.decoder = Decoder(config, ctx, self.inputs.x_mask,
                               dropout_target, dropout_embedding,
                               dropout_hidden, tied_embeddings)
        self.logits = self.decoder.score(self.inputs.y)

    with tf.variable_scope("loss"):
        self.loss_layer = layers.Masked_cross_entropy_loss(
            self.inputs.y, self.inputs.y_mask, config.label_smoothing,
            training=self.inputs.training)
        self.loss_per_sentence = self.loss_layer.forward(self.logits)
        self.objective = tf.reduce_mean(self.loss_per_sentence,
                                        keepdims=False)

        # Optional L2 weight decay over all trainable variables.
        self.l2_loss = tf.constant(0.0, dtype=tf.float32)
        if config.decay_c > 0.0:
            self.l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tf.trainable_variables()]) \
                * tf.constant(config.decay_c, dtype=tf.float32)
            self.objective += self.l2_loss

        # Optional MAP-style L2 penalty: pulls each trainable variable
        # towards a frozen snapshot of its initial value ("prior").
        self.map_l2_loss = tf.constant(0.0, dtype=tf.float32)
        if config.map_decay_c > 0.0:
            map_l2_acc = []
            for v in tf.trainable_variables():
                # Strip the ":0" output suffix to get a legal variable name.
                prior_name = 'prior/' + v.name.split(':')[0]
                # Non-trainable copy of v's initial value, kept out of the
                # default variable collections so it is handled separately
                # (presumably loaded/saved via the 'prior_variables'
                # collection — verify against the training code).
                prior_v = tf.get_variable(
                    prior_name, initializer=v.initialized_value(),
                    trainable=False, collections=['prior_variables'],
                    dtype=v.initialized_value().dtype)
                map_l2_acc.append(tf.nn.l2_loss(v - prior_v))
            self.map_l2_loss = tf.add_n(map_l2_acc) \
                * tf.constant(config.map_decay_c, dtype=tf.float32)
            self.objective += self.map_l2_loss

    # Placeholders for lazily-constructed sampling / beam-search graphs.
    self.sampled_ys = None
    self.beam_size, self.beam_ys, self.parents, self.cost = \
        None, None, None, None