Example 1
    def run(self):
        """ Evaluation on a existing model.

        Step 1: Build model.
        Step 2: Build evaluation dataset.
        Step 3: Restore checkpoints.
        Step 4: Evaluate and reduce metric.
        """

        with training_utils.get_strategy_scope(self.strategy):
            tfds = training_utils.build_datasets(compat.ModeKeys.EVAL,
                                                 self.strategy,
                                                 self.custom_dataset,
                                                 self.task,
                                                 cache=True)
            keras_model = self.build_evaluation_model(self.task, self.model,
                                                      self._criterion)
            keras_model.summary()
            summary_model_variables(keras_model)
            # Step 3: Restore checkpoints.
            stat = restore_checkpoint_if_possible(self.model, self.model_dir)
            if not stat:
                logging.info(
                    f"WARNING: Fail to restore checkpoint from {self.model_dir}. "
                    "We assume this was done on purpose. ")
        # Step 4: Evaluate and reduce metric.
        start_time = time.time()
        results, avg_res, whole_res = training_utils.reduce_eval_results(
            self._criterion, self.custom_dataset,
            training_utils.make_predictions(self.strategy, keras_model, tfds,
                                            self.custom_dataset))
        logging.info("Evaluation elapsed: %.2fs", time.time() - start_time)

        def _display(res, name=None):
            if name:
                logging.info(f"Evaluation Results ({name}):")
            for k, v in res.items():
                logging.info("   %s: %.2f", k, v)

        if not isinstance(self.custom_dataset, MultipleDataset):
            _display(results)
        else:
            for name, res in results.items():
                _display(res, name)
            _display(
                avg_res,
                f"on average by weights {self.custom_dataset.sample_weights}")
            _display(whole_res, "mixed")
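
The restore step above relies on restore_checkpoint_if_possible returning a falsy status when no checkpoint is found. A minimal stand-alone sketch of that "restore if possible, otherwise warn" pattern in plain TensorFlow could look like the following; restore_if_possible is a hypothetical name, not the library's actual helper.

import logging

import tensorflow as tf


def restore_if_possible(model, model_dir):
    # Sketch only: restore the latest checkpoint under `model_dir` into
    # `model`, or warn and return None when nothing is found.
    ckpt_path = tf.train.latest_checkpoint(model_dir)
    if ckpt_path is None:
        logging.info("WARNING: no checkpoint found under %s. "
                     "Evaluating randomly initialized weights.", model_dir)
        return None
    checkpoint = tf.train.Checkpoint(model=model)
    # At evaluation time optimizer slots are usually absent, so a partial
    # restore is expected.
    checkpoint.restore(ckpt_path).expect_partial()
    return ckpt_path
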
Example 2
    def validate(self, step):
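        """Runs criterion-based validation at the given training step: makes
        predictions on the evaluation dataset(s), reduces and records the
        resulting metrics, and logs them (plus TensorBoard scalars) together
        with the best values seen so far."""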
        if not self._validate_criterion:
            return
        start_time = time.time()
        results, avg_res, mixed_res = training_utils.reduce_eval_results(
            self._criterion, self._custom_dataset,
            training_utils.make_predictions(self._strategy,
                                            self._criterion_model,
                                            self._eval_tfds,
                                            self._custom_dataset))
        elapsed = time.time() - start_time
        elapsed_from_start = time.time() - self._criterion_start_time

        def _display(res, best, name=None, tb_name=None):
            if tb_name is None:
                tb_name = name
            tb_name = "" if tb_name is None else (tb_name + "_")
            name = "" if name is None else f" ({name})"
            for k, v in res.items():
                logging.info(
                    "Evaluating (%s) validation set%s: %s=%.2f (Best %.2f)  "
                    "step=%d\tElapsed %.2fs  FromSTART %.2fs",
                    self._criterion_metric.flag, name, k, v, best[k], step,
                    elapsed, elapsed_from_start)
                tf.summary.scalar(compat.GlobalKeys.TBPREFIX_VALID +
                                  f"/{tb_name}{k}",
                                  v,
                                  step=step)

        if isinstance(self._custom_dataset, MultipleDataset):
            for name, res in results.items():
                self._criterion_recorder[name].record(step, res)
                _display(res, self._criterion_recorder[name].best, name=name)
            self._avg_criterion_recorder.record(step, avg_res)
            _display(
                avg_res,
                self._avg_criterion_recorder.best,
                f"on average by weights {self._custom_dataset.sample_weights}",
                tb_name="AVERAGE")
            self._mixed_criterion_recorder.record(step, mixed_res)
            _display(mixed_res, self._mixed_criterion_recorder.best, "MIXED")
        else:
            self._criterion_recorder.record(step, results)
            _display(results, self._criterion_recorder.best)
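
The validator above assumes recorder objects (self._criterion_recorder, self._avg_criterion_recorder, ...) that expose record(step, results) and a best mapping. Their actual implementation is not shown here; a minimal hypothetical version, assuming larger metric values are better by default, might look like this.

class BestMetricRecorder(object):
    """Sketch of the recorder interface assumed above (hypothetical)."""

    def __init__(self, higher_is_better=True):
        self._higher_is_better = higher_is_better
        self.history = []  # list of (step, {metric: value}) pairs
        self.best = {}     # metric name -> best value seen so far

    def record(self, step, results):
        self.history.append((step, dict(results)))
        for k, v in results.items():
            if k not in self.best:
                self.best[k] = v
            elif self._higher_is_better:
                if v > self.best[k]:
                    self.best[k] = v
            elif v < self.best[k]:
                self.best[k] = v
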
Example 3
    def run(self):
        """ Sequence generation from an existing model checkpoint.

        Step 1: Build model and restore checkpoints.
        Step 2: Build test dataset.
        Step 3: Sequence generation.
        Step 4: Evaluation using metric.
        """
        # Step 1: Build model and restore checkpoints.
        with training_utils.get_strategy_scope(self.strategy):
            model = self._build_and_restore_model()
            keras_model = self.build_generation_model(self.task, model,
                                                      self._search_layer)
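            # Step 2: Build test dataset.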
            tfds = training_utils.build_datasets(compat.ModeKeys.INFER,
                                                 self.strategy,
                                                 self.custom_dataset,
                                                 self.task)
            keras_model.summary()
            summary_model_variables(keras_model)

        # Step 3: Sequence generation.
        start_time = time.time()
        results = training_utils.make_predictions(
            self.strategy,
            keras_model,
            tfds,
            self.custom_dataset,
            map_func=lambda y: SequenceGenerator.postprocess_generation(
                self.task, y))
        logging.info("Generation elapsed: %.2fs", time.time() - start_time)

        if self._output_file:
            if isinstance(self.custom_dataset, MultipleDataset):
                if isinstance(self._output_file, dict):
                    for name in results:
                        if self._output_file.get(name, None):
                            with tf.io.gfile.GFile(self._output_file[name],
                                                   "w") as fw:
                                fw.write("\n".join(results[name]) + "\n")
                            logging.info(
                                "Saving generation results of dataset {} into {}"
                                .format(name, self._output_file[name]))
                else:
                    logging.info(
                        "Unsupported type of `output_file`={} ({}) for MultipleDataset."
                        .format(self._output_file, type(self._output_file)))
            else:
                if isinstance(self._output_file, str):
                    with tf.io.gfile.GFile(self._output_file, "w") as fw:
                        fw.write("\n".join(results) + "\n")
                    logging.info("Saving generation results into {}".format(
                        self._output_file))
                else:
                    logging.info(
                        f"WARNING: No generation results were saved due to the unsupported "
                        f"type of `output_file`: {self._output_file} ({type(self._output_file)})"
                    )

        # Step 4: Evaluation using metric.
        def _display(res, name=None):
            if name:
                logging.info(f"Evaluation Result ({name}):")
            else:
                logging.info("Evaluation Result:")
            for k, v in res.items():
                logging.info("   %s=%.2f", k, v)

        if self._metric is not None:
            saving_metrics = dict()
            if isinstance(self.custom_dataset, MultipleDataset):
                on_average = {}
                mixed_dsnames = []
                mixed_hypos = []
                mixed_refs = []
                for name in tfds:
                    assert isinstance(self.custom_dataset.datasets[name],
                                      TextGenDataset)
                    if self.custom_dataset.datasets[name].targets:
                        metric_result = self._metric(
                            results[name],
                            self.custom_dataset.datasets[name].targets)
                        for k, v in metric_result.items():
                            if k not in on_average:
                                on_average[k] = 0.
                            on_average[k] += (
                                self.custom_dataset.sample_weights[name] * v)
                        _display(metric_result, name)
                        mixed_dsnames.append(name)
                        mixed_hypos.extend(results[name])
                        mixed_refs.extend(
                            self.custom_dataset.datasets[name].targets)
                        saving_metrics[name] = metric_result
                if len(mixed_dsnames) > 1:
                    _display(
                        on_average,
                        f"on average by weights {self._custom_dataset.sample_weights}"
                    )
                    mixed_metric_result = self._metric(mixed_hypos, mixed_refs)
                    _display(mixed_metric_result,
                             "mixed of {}".format(",".join(mixed_dsnames)))
                    saving_metrics["MIXED"] = mixed_metric_result

            else:
                assert isinstance(self.custom_dataset, TextGenDataset)
                if self.custom_dataset.targets is not None:
                    metric_result = self._metric(results,
                                                 self.custom_dataset.targets)
                    _display(metric_result)
                    saving_metrics = metric_result
            if self._save_metric is not None:
                logging.info(f"Saving metric results into {self._save_metric}")
                with tf.io.gfile.GFile(self._save_metric, "w") as fw:
                    json.dump(saving_metrics, fw)
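
For MultipleDataset inputs, the metric block above computes a weighted average of every metric over the per-dataset results plus a "mixed" score over the concatenated hypotheses and references. The helper below is a hypothetical condensation of that aggregation logic; hypos_by_name and refs_by_name stand in for the per-dataset generation outputs and target references.

def aggregate_multi_dataset_metrics(per_dataset_results, sample_weights,
                                    metric_fn, hypos_by_name, refs_by_name):
    # Sketch only: weighted average of each metric across datasets.
    on_average = {}
    for name, metric_result in per_dataset_results.items():
        for k, v in metric_result.items():
            on_average[k] = on_average.get(k, 0.) + sample_weights[name] * v
    # "Mixed" score: evaluate once over all hypotheses/references together.
    mixed_hypos, mixed_refs = [], []
    for name in per_dataset_results:
        mixed_hypos.extend(hypos_by_name[name])
        mixed_refs.extend(refs_by_name[name])
    mixed = metric_fn(mixed_hypos, mixed_refs)  # hypotheses first, then references
    return on_average, mixed
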
Example 4
    def validate(self, step):
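        """Runs the base criterion validation, then generates sequences on the
        validation dataset(s), computes the generation metric, records it, and
        logs a few sample hypotheses."""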
        super(SeqGenerationValidator, self).validate(step)
        if not self._validate_gen:
            return
        start_time = time.time()
        results = training_utils.make_predictions(
            self._strategy,
            self._gen_model,
            self._gen_tfds,
            self._custom_dataset,
            map_func=self._postprocess_fn)
        elapsed = time.time() - start_time
        elapsed_from_start = time.time() - self._gen_start_time

        def _display_hypo(custom_ds, hypos, name=None):
            if name:
                logging.info(
                    f"===== Generation examples from {name} (Total {len(hypos)}) ====="
                )
            else:
                logging.info(
                    f"===== Generation examples (Total {len(hypos)}) =====")
            for sample_idx in random.sample(range(len(hypos)),
                                            min(5, len(hypos))):
                logging.info("Sample %d", sample_idx)
                if hasattr(custom_ds,
                           "sources") and custom_ds.sources is not None:
                    logging.info("  Data: %s", custom_ds.sources[sample_idx])
                logging.info("  Reference: %s", custom_ds.targets[sample_idx])
                logging.info("  Hypothesis: %s", hypos[sample_idx])

        def _display(res, best, name=None, tb_name=None):
            if tb_name is None:
                tb_name = name
            tb_name = "" if tb_name is None else (tb_name + "_")
            name = "" if name is None else f" ({name})"

            for k, v in res.items():
                logging.info(
                    "Evaluating (%s) validation set%s: %s=%.2f (Best %.2f)  "
                    "step=%d\tElapsed %.2fs  FromSTART %.2fs",
                    self._gen_metric.flag, name, k, v, best[k], step, elapsed,
                    elapsed_from_start)
                tf.summary.scalar(compat.GlobalKeys.TBPREFIX_VALID +
                                  f"/{tb_name}{k}",
                                  v,
                                  step=step)

        if isinstance(self._custom_dataset, MultipleDataset):
            on_average = {}
            mixed_dsnames = []
            mixed_hypos = []
            mixed_refs = []
            sample_weights = {
                name: self._custom_dataset.sample_weights[name]
                for name in self._gen_tfds
            }
            sample_weight_sum = sum(sample_weights.values()) * 1.
            sample_weights = {
                name: weight / sample_weight_sum
                for name, weight in sample_weights.items()
            }
            for name, res in results.items():
                metric_res = self._gen_metric(
                    res, self._custom_dataset.datasets[name].targets)
                self._gen_recorder[name].record(step, metric_res)
                for k, v in metric_res.items():
                    if k not in on_average:
                        on_average[k] = 0.
                    on_average[k] += sample_weights[name] * v
                _display_hypo(self._custom_dataset.datasets[name],
                              res,
                              name=name)
                _display(metric_res, self._gen_recorder[name].best, name=name)
                mixed_dsnames.append(name)
                mixed_hypos.extend(res)
                mixed_refs.extend(self._custom_dataset.datasets[name].targets)
            if len(mixed_dsnames) >= 1:
                self._avg_gen_recorder.record(step, on_average)
                if len(mixed_dsnames) > 1:
                    _display(on_average,
                             self._avg_gen_recorder.best,
                             f"on average by weights {sample_weights}",
                             tb_name="AVERAGE")
                    mixed_metric_result = self._gen_metric(
                        mixed_hypos, mixed_refs)
                    self._mixed_gen_recorder.record(step, mixed_metric_result)
                    _display(mixed_metric_result,
                             self._mixed_gen_recorder.best,
                             "mixed of {}".format(",".join(mixed_dsnames)),
                             tb_name="MIXED")
        else:
            metric_res = self._gen_metric(results,
                                          self._custom_dataset.targets)
            _display_hypo(self._custom_dataset, results)
            self._gen_recorder.record(step, metric_res)
            _display(metric_res, self._gen_recorder.best)
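
When only some of the datasets are used for generation validation (the keys of self._gen_tfds), the sample weights are renormalized over those datasets before averaging, as in the sample_weights block above. Below is a minimal stand-alone sketch of that renormalization, with renormalize_weights as a hypothetical name.

def renormalize_weights(sample_weights, active_names):
    # Sketch only: keep the datasets that are actually validated and rescale
    # their weights so that they sum to 1.
    kept = {name: sample_weights[name] for name in active_names}
    total = float(sum(kept.values()))
    return {name: weight / total for name, weight in kept.items()}


# Example: renormalize_weights({"a": 2.0, "b": 1.0, "c": 1.0}, ["a", "b"])
# -> {"a": 0.666..., "b": 0.333...}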