Example #1
    def __init__(self, training, tokenized_data, batch_input, scope=None):
        """
        Create the model.

        Args:
            training: A boolean value indicating whether this model will be used for training.
            tokenized_data: The data object containing all information required for the model.
            batch_input: The batched input data, including the source/target sequences and their lengths.
            scope: Scope of the model.
        """
        self.training = training
        self.batch_input = batch_input
        self.vocab_table = tokenized_data.vocab_table
        self.vocab_size = tokenized_data.vocab_size
        self.reverse_vocab_table = tokenized_data.reverse_vocab_table

        hparams = tokenized_data.hparams
        self.hparams = hparams

        self.num_layers = hparams.num_layers
        self.time_major = hparams.time_major

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.embedding = (model_helper.create_embbeding(
            vocab_size=self.vocab_size,
            embed_size=hparams.num_units,
            scope=scope))
        # The batch size may vary from batch to batch due to bucketing and/or reaching
        # the end of the training set. Treat it as the size of the current batch.
        self.batch_size = tf.size(self.batch_input.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(self.vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        # Training or inference graph
        print("# Building graph for the model ...")
        res = self.build_graph(hparams, scope=scope)

        if training:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(self.batch_input.source_sequence_length) + \
                              tf.reduce_sum(self.batch_input.target_sequence_length)
            # Count the number of predicted words for computing perplexity.
            self.predict_count = tf.reduce_sum(
                self.batch_input.target_sequence_length)
        else:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = self.reverse_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients update operation for training the model.
        if training:
            self.learning_rate = tf.placeholder(tf.float32,
                                                shape=[],
                                                name='learning_rate')
            opt = tf.train.AdamOptimizer(self.learning_rate)

            gradients = tf.gradients(self.train_loss, params)

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("learning_rate", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + gradient_norm_summary)
        else:
            self.infer_summary = tf.no_op()

        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        if training:
            print("# Trainable variables:")
            for param in params:
                print("  {}, {}, {}".format(param.name, str(param.get_shape()),
                                            param.op.device))
Example #2

    def __init__(self, training, tokenized_data, batch_input, scope=None):
        """
        Create the model.

        Args:
            training: A boolean value indicating whether this model will be used for training.
            tokenized_data: The data object containing all information required for the model.
            batch_input: The batched input data, including the source/target sequences and their lengths.
            scope: Scope of the model.
        """
        self.training = training
        self.batch_input = batch_input
        self.vocab_table = tokenized_data.vocab_table
        self.vocab_size = tokenized_data.vocab_size
        self.reverse_vocab_table = tokenized_data.reverse_vocab_table

        hparams = tokenized_data.hparams
        self.hparams = hparams

        self.num_layers = hparams.num_layers
        self.time_major = hparams.time_major

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.embedding = (model_helper.create_embbeding(
            vocab_size=self.vocab_size,
            embed_size=hparams.num_units,
            scope=scope))
        # The batch size may vary from batch to batch due to bucketing and/or reaching
        # the end of the training set. Treat it as the size of the current batch.
        self.batch_size = tf.size(self.batch_input.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(self.vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        # Training or inference graph
        print('\n\n{} Building graph for the model ...{}\n'.format(
            colorama.Fore.GREEN, colorama.Fore.RESET))
        print("***************************************")
        res = self.build_graph(hparams, scope=scope)

        if training:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(self.batch_input.source_sequence_length) + \
                              tf.reduce_sum(self.batch_input.target_sequence_length)
            # Count the number of predicted words for computing perplexity.
            self.predict_count = tf.reduce_sum(
                self.batch_input.target_sequence_length)
        else:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = self.reverse_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients update operation for training the model.
        if training:
            self.learning_rate = tf.placeholder(tf.float32,
                                                shape=[],
                                                name='learning_rate')
            opt = tf.train.AdamOptimizer(self.learning_rate)

            # Added by Nuruzzaman on 18/05/2018.
            # Colocating gradients with the ops that produced them can change the
            # time needed per epoch significantly: run one epoch with this flag set
            # to True and one with it set to False (or removed) and compare.
            colocate_gradients_with_ops = True

            gradients = tf.gradients(
                self.train_loss, params,
                colocate_gradients_with_ops=colocate_gradients_with_ops)

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("learning_rate", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + gradient_norm_summary)
        else:
            self.infer_summary = tf.no_op()

        # Saver
        self.saver = tf.train.Saver(tf.global_variables())
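Because the learning rate is defined as a placeholder rather than an hparam, it has to be fed on every training step. A hedged usage sketch (not part of the original source) of what a single training step might look like, assuming `model` is an instance of the class above built with training=True and that its input pipeline is already wired up:

import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())  # required for vocab_table / reverse_vocab_table lookups

    # One gradient update: model.update applies the clipped gradients, and the
    # learning_rate placeholder is fed explicitly (0.001 is just an example value).
    _, step_loss, step_summary, global_step = sess.run(
        [model.update, model.train_loss, model.train_summary, model.global_step],
        feed_dict={model.learning_rate: 0.001})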