def _prepare(self):
        """ Prepares for BLEU evaluation.

        Creates the feeding data from the "eval_features_file" field of
        the dataset, updates the inference parameters (beam size, maximum
        labels length, length penalty) in the model configs, and builds
        the model with reuse=True, mode=INFER to obtain the prediction ops.

        Also makes sure the temporary translation directory exists under
        `model_dir`, reads the checkpoint BLEU log, checks the BLEU script
        and resets the early-stop patience and the best BLEU score.
        """
        # feeding data for the evaluation source file(s)
        inputter = TextLineInputter(
            dataset=self._dataset,
            data_field_name="eval_features_file",
            batch_size=self._batch_size)
        self._eval_feeding_data = inputter.make_feeding_data()

        # update inference parameters before building the model
        self._model_configs = update_infer_params(
            self._model_configs,
            beam_size=self._beam_size,
            maximum_labels_length=self._maximum_labels_length,
            length_penalty=self._length_penalty)
        spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.INFER,
            dataset=self._dataset,
            name=self._model_name,
            reuse=True,
            verbose=False)
        self._predict_ops = spec.predictions

        # directory and file prefix for temporary translation outputs
        model_dir = self._model_configs["model_dir"]
        trans_dir = os.path.join(model_dir, GlobalNames.TMP_TRANS_DIRNAME)
        if not gfile.Exists(trans_dir):
            gfile.MakeDirs(trans_dir)
        self._tmp_trans_file_prefix = os.path.join(
            trans_dir, GlobalNames.TMP_TRANS_FILENAME_PREFIX)

        self._read_ckpt_bleulog()
        self._eval_labels_file = self._dataset.eval_labels_file
        self._check_bleu_script()
        # counters reset after the log has been read (same order as before)
        self._estop_patience = 0
        self._best_bleu_score = 0.
Exemple #2
0
    def run(self):
        """ Trains the model.

        Builds the source/target vocabularies and the dataset from the
        "data" section of the model configs, builds the model in TRAIN
        mode and runs `train_op` on the training batches, epoch after
        epoch, until the monitored session reports that it should stop.
        """
        # vocabulary
        self._vocab_source = Vocab(
            filename=self._model_configs["data"]["source_words_vocabulary"],
            bpe_codes_file=self._model_configs["data"]["source_bpecodes"],
            reverse_seq=False)
        self._vocab_target = Vocab(
            filename=self._model_configs["data"]["target_words_vocabulary"],
            bpe_codes_file=self._model_configs["data"]["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["reverse_target"])
        # build dataset
        dataset = Dataset(
            self._vocab_source,
            self._vocab_target,
            train_features_file=self._model_configs["data"]
            ["train_features_file"],
            train_labels_file=self._model_configs["data"]["train_labels_file"],
            eval_features_file=self._model_configs["data"]
            ["eval_features_file"],
            eval_labels_file=self._model_configs["data"]["eval_labels_file"])

        # session options: grow GPU memory on demand, allow soft placement
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        estimator_spec = model_fn(model_configs=self._model_configs,
                                  mode=tf.contrib.learn.ModeKeys.TRAIN,
                                  dataset=dataset)
        train_op = estimator_spec.train_op
        hooks = estimator_spec.training_hooks
        # build training session
        # The session options above were previously built but never handed
        # to the session. Pass them through a ChiefSessionCreator, the same
        # creator type MonitoredSession falls back to when given None.
        sess = tf.train.MonitoredSession(
            session_creator=tf.train.ChiefSessionCreator(config=config),
            hooks=hooks)

        train_text_inputter = ParallelTextInputter(
            dataset, "train_features_file", "train_labels_file",
            self._model_configs["train"]["batch_size"],
            self._model_configs["train"]["batch_tokens_size"],
            self._model_configs["train"]["shuffle_every_epoch"])
        train_data = train_text_inputter.make_feeding_data(
            maximum_encoded_features_length=self._model_configs["train"]
            ["maximum_features_length"],
            maximum_encoded_labels_length=self._model_configs["train"]
            ["maximum_labels_length"])
        eidx = 0  # epoch index, used for logging only
        while not sess.should_stop():
            tf.logging.info("STARTUP Epoch {}".format(eidx))

            for _, data_feeding in train_data:
                # hooks may request a stop in the middle of an epoch
                if sess.should_stop():
                    break
                sess.run(train_op, feed_dict=data_feeding)
            eidx += 1
Exemple #3
0
    def _prepare(self):
        """ Prepares for loss evaluation.

        Creates the feeding data from the evaluation features/labels
        files and builds the model with reuse=True, mode=EVAL to obtain
        the loss op.

        In addition, when the "decay_type" of the optimizer's lr decay
        is "loss_decay", sets up the controller state: the learning rate
        tensor, the maximum patience, the op that doubles the annealing
        division factor, and the patience / minimum loss trackers.
        """
        inputter = ParallelTextInputter(
            dataset=self._dataset,
            features_field_name="eval_features_file",
            labels_field_name="eval_labels_file",
            batch_size=self._batch_size,
            batch_tokens_size=None,
            maximum_features_length=None,
            maximum_labels_length=None,
            shuffle_every_epoch=None,
            bucketing=True)
        self._eval_feeding_data = inputter.make_feeding_data()

        spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.EVAL,
            dataset=self._dataset,
            name=self._model_name,
            reuse=True,
            verbose=False)
        self._loss_op = spec.loss

        # learning rate decay controlled by the evaluation loss
        self._half_lr = False
        lr_decay_params = self._model_configs["optimizer_params"][
            "optimizer.lr_decay"]
        if lr_decay_params["decay_type"] != "loss_decay":
            return

        self._half_lr = True
        lr_tensors = get_dict_from_collection(
            GlobalNames.LEARNING_RATE_VAR_NAME)
        self._learning_rate = lr_tensors[GlobalNames.LEARNING_RATE_VAR_NAME]
        self._max_patience = lr_decay_params["patience"]
        anneal_factor = lr_tensors[GlobalNames.LR_ANNEAL_DIV_FACTOR_NAME]
        # doubling the division factor each time this op is run
        self._half_lr_op = anneal_factor.assign(anneal_factor * 2.)
        self._patience = 0
        self._min_loss = 10000.
Exemple #4
0
    def run(self):
        """ Infers data files.

        Builds the vocabularies and a dataset over every "features_file"
        listed in the "infer_data" configs, builds the model in INFER
        mode, restores the latest checkpoint from `model_dir` and runs
        inference on each file in turn. When an entry provides a
        "labels_file", the BLEU score of its output is logged as well.

        Raises:
            OSError: if no checkpoint is found under `model_dir`.
        """
        # build vocabularies
        infer_opts = self._model_configs["infer"]
        self._vocab_source = Vocab(
            filename=infer_opts["source_words_vocabulary"],
            bpe_codes_file=infer_opts["source_bpecodes"],
            reverse_seq=False)
        self._vocab_target = Vocab(
            filename=infer_opts["target_words_vocabulary"],
            bpe_codes_file=infer_opts["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["reverse_target"])

        # build dataset over all source files to be inferred
        features_files = [
            entry["features_file"]
            for entry in self._model_configs["infer_data"]]
        dataset = Dataset(self._vocab_source,
                          self._vocab_target,
                          eval_features_file=features_files)

        self._model_configs = update_infer_params(
            self._model_configs,
            beam_size=infer_opts["beam_size"],
            maximum_labels_length=infer_opts["maximum_labels_length"],
            length_penalty=infer_opts["length_penalty"])
        # the configs object was replaced above: refresh the local alias
        infer_opts = self._model_configs["infer"]

        # build model
        spec = model_fn(model_configs=self._model_configs,
                        mode=tf.contrib.learn.ModeKeys.INFER,
                        dataset=dataset)
        predict_op = spec.predictions

        sess = self._build_default_session()

        inputter = TextLineInputter(
            dataset=dataset,
            data_field_name="eval_features_file",
            batch_size=infer_opts["batch_size"])

        # reload the latest checkpoint, fail when there is none
        ckpt_path = tf.train.latest_checkpoint(
            self._model_configs["model_dir"])
        if not ckpt_path:
            raise OSError(
                "File NOT Found. Fail to find checkpoint file from: {}".format(
                    self._model_configs["model_dir"]))
        tf.logging.info("reloading models...")
        optimistic_restore(sess, ckpt_path)

        tf.logging.info("Start inference.")
        overall_start_time = time.time()

        # one feeding data per entry of "infer_data", paired in order
        pairs = zip(inputter.make_feeding_data(),
                    self._model_configs["infer_data"])
        for feeding_data, entry in pairs:
            features_file = entry["features_file"]
            tf.logging.info(
                "Infer Source Features File: {}.".format(features_file))
            start_time = time.time()
            infer(sess=sess,
                  prediction_op=predict_op,
                  feeding_data=feeding_data,
                  output=entry["output_file"],
                  vocab_target=self._vocab_target,
                  alpha=infer_opts["length_penalty"],
                  delimiter=infer_opts["delimiter"],
                  output_attention=entry["output_attention"],
                  tokenize_output=infer_opts["char_level"],
                  tokenize_script=infer_opts["tokenize_script"],
                  verbose=True)
            elapsed = time.time() - start_time
            tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
                entry["features_file"], str(elapsed)))
            # BLEU is only computed when references are available
            if entry["labels_file"] is not None:
                bleu_score = multi_bleu_score(
                    infer_opts["multibleu_script"],
                    entry["labels_file"], entry["output_file"])
                tf.logging.info("BLEU score ({}): {}".format(
                    entry["features_file"], bleu_score))

        total_elapsed = time.time() - overall_start_time
        tf.logging.info("Total Elapsed Time: %s" % str(total_elapsed))