Example #1
    def run(self):
        """ Export savedmodel for sequence generator.

        Step 1: Build model and restore checkpoints.
        Step 2: Export.
        """
        with training_utils.get_strategy_scope(self.strategy):
            model = self._build_and_restore_model()
            keras_model = self.build_generation_model(self.task, model, self._search_layer)
            keras_model.summary()
            summary_model_variables(keras_model)

        export_path = os.path.join(self._export_path, str(self._version))
        logging.info("Saving model to {}".format(export_path))
        tf.keras.models.save_model(
            keras_model,
            export_path,
            overwrite=True,
            include_optimizer=False,
            save_format=None,
            signatures=None,
            options=None)
        loaded = tf.saved_model.load(export_path)
        logging.info("========== signatures ==========")
        for x in loaded.signatures.keys():
            logging.info(f"structured outputs for {x}:")
            logging.info("    {}".format(str(loaded.signatures[x].structured_outputs)))
Example #2
    def run(self):
        """ Evaluation on a existing model.

        Step 1: Build model.
        Step 2: Builds evaluation dataset.
        Step 3: Restore checkpoints.
        Step 4: Evaluate and reduce metric.
        """
        assert not isinstance(self.custom_dataset, MultipleDataset), (
            "SequenceEvaluator only supports single dataset.")
        with training_utils.get_strategy_scope(self.strategy):
            tfds = training_utils.build_datasets(compat.ModeKeys.EVAL, self.strategy,
                                                 self.custom_dataset, self.task)
            keras_model = self.build_evaluation_model(self.task, self.model, self._criterion)
            keras_model.summary()
            summary_model_variables(keras_model)
            # Step 3: Restore checkpoints.
            stat = restore_checkpoint_if_possible(self.model, self.model_dir)
            if not stat:
                logging.info(f"WARNING: Fail to restore checkpoint from {self.model_dir}. "
                             "We assume this was done on purpose. ")
            # Step 5: Evaluate and reduce metric.
            predict_fn = keras_model.make_predict_function()
            iterator = iter(training_utils.maybe_distribution_dataset(
                self.strategy, tfds.prefetch(tf.data.experimental.AUTOTUNE)))
            with tf.io.gfile.GFile(self._output_file, "w") as fw:
                while True:
                    try:
                        preds = predict_fn(iterator)
                        for pred in self._criterion.reduce_sample_metrics(preds):
                            fw.write(str(pred) + "\n")
                    except (StopIteration, tf.errors.OutOfRangeError):
                        break
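
The try/except loop above is the usual pattern for exhausting a (possibly distributed) dataset iterator until the data runs out. A self-contained sketch of the same pattern on a plain tf.data pipeline:

import tensorflow as tf

ds = tf.data.Dataset.range(6).batch(2)
it = iter(ds)
while True:
    try:
        batch = next(it)  # raises StopIteration (eager) or OutOfRangeError (graph) at the end
        print(batch.numpy())
    except (StopIteration, tf.errors.OutOfRangeError):
        break
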
Example #3
def run_experiment(args, remaining_argv):
    strategy = training_utils.handle_distribution_strategy(
        args["distribution_strategy"])
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    training_utils.startup_env(
        dtype=args["dtype"],
        enable_check_numerics=args["enable_check_numerics"],
        enable_xla=args["enable_xla"])

    # initialize parameters for quantization.
    if args.get("quant_params", None) is None:
        args["quant_params"] = {}
    QuantLayer.global_init(args["enable_quant"], **args["quant_params"])

    # create exps: trainer, evaluator or ...
    with training_utils.get_strategy_scope(strategy):
        task = build_task(args)
        custom_dataset = build_dataset(args)
        try:
            model = task.build_model(args)
            training_utils.validate_unique_varname(model.weights)
        except AttributeError:
            model = None
        entry = build_exp(args,
                          strategy=strategy,
                          model=model,
                          task=task,
                          model_dir=args["model_dir"],
                          custom_dataset=custom_dataset)
    entry.run()
Example #4
 def __init__(self, args, **kwargs):
     """ Initializes a util class for training neural models. """
     super(Trainer, self).__init__(**kwargs)
     self._tb_log_dir = args["tb_log_dir"]
     self._train_steps = args["train_steps"]
     self._summary_steps = args["summary_steps"]
     self._save_checkpoint_steps = args["save_checkpoint_steps"]
     self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
     self._initial_global_step = args["initial_global_step"]
     self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
     if args["pretrain_model"] and isinstance(args["pretrain_model"][0],
                                              dict):
         self._pretrain_v2 = True
         self._pretrain_model = args["pretrain_model"]
         if self._pretrain_variable_pattern:
             logging.info(
                 "Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                 f"{self._pretrain_variable_pattern}")
     else:
         self._pretrain_v2 = False
         self._pretrain_model = flatten_string_list(args["pretrain_model"])
         if self._pretrain_model and self._pretrain_variable_pattern is None:
             self._pretrain_variable_pattern = [None] * len(
                 self._pretrain_model)
         assert (
             (self._pretrain_model is None
              and self._pretrain_variable_pattern is None)
             or len(self._pretrain_model) == len(
                 self._pretrain_variable_pattern)
             or len(self._pretrain_model) == 1
         ), ("`pretrain_variable_pattern` must match with `pretrain_model`."
             )
         if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
             self._pretrain_variable_pattern = [None] * len(
                 self._pretrain_model)
     self._update_cycle = args["update_cycle"]
     self._clip_value = args["clip_value"]
     self._clip_norm = args["clip_norm"]
     self._hvd_backend = self.strategy if self.strategy in [
         "byteps", "horovod"
     ] else None
     with training_utils.get_strategy_scope(self.strategy):
         self._criterion = build_criterion(args)
         self._criterion.set_model(self.model)
         self._lr_schedule_args = args
         if compat.IS_PREV_TF_2_4_0:
             self._optimizer = build_optimizer(args)
         else:
             self._optimizer = build_optimizer(args,
                                               clipnorm=self._clip_norm,
                                               clipvalue=self._clip_value)
         assert self._optimizer is not None, "optimizer parameters must be provided for training."
     self._validator = build_validator(args)
     self._experimental_count_batch_num = args[
         "experimental_count_batch_num"]
     self._freeze_variables = args["freeze_variables"]
Example #5
    def run(self):
        """ Evaluation on a existing model.

        Step 1: Build model.
        Step 2: Builds evaluation dataset.
        Step 3: Restore checkpoints.
        Step 4: Evaluate and reduce metric.
        """

        with training_utils.get_strategy_scope(self.strategy):
            tfds = training_utils.build_datasets(compat.ModeKeys.EVAL,
                                                 self.strategy,
                                                 self.custom_dataset,
                                                 self.task,
                                                 cache=True)
            keras_model = self.build_evaluation_model(self.task, self.model,
                                                      self._criterion)
            keras_model.summary()
            summary_model_variables(keras_model)
            # Step 3: Restore checkpoints.
            stat = restore_checkpoint_if_possible(self.model, self.model_dir)
            if not stat:
                logging.info(
                    f"WARNING: Failed to restore checkpoint from {self.model_dir}. "
                    "We assume this was done on purpose. ")
        # Step 4: Evaluate and reduce metric.
        start_time = time.time()
        results, avg_res, whole_res = training_utils.reduce_eval_results(
            self._criterion, self.custom_dataset,
            training_utils.make_predictions(self.strategy, keras_model, tfds,
                                            self.custom_dataset))
        logging.info("Evaluation elapsed: %.2fs", time.time() - start_time)

        def _display(res, name=None):
            if name:
                logging.info(f"Evaluation Results ({name}):")
            for k, v in res.items():
                logging.info("   %s: %.2f", k, v)

        if not isinstance(self.custom_dataset, MultipleDataset):
            _display(results)
        else:
            for name, res in results.items():
                _display(res, name)
            _display(
                avg_res,
                f"on average by weights {self.custom_dataset.sample_weights}")
            _display(whole_res, "mixed")
Example #6
 def build(self, strategy, task, model):
     """ Initializes. """
     self._strategy = strategy
     self._criterion: Criterion = build_criterion(
         self.args["eval_criterion.class"],
         **self.args["eval_criterion.params"])
     if self._criterion is None:
         logging.info(
             "WARNING: no criterion is provided in CriterionValidator "
             "for validation process.")
         self._validate_criterion = False
         return self
     self._criterion.set_model(model)
     self._custom_dataset = build_dataset(
         self.args["eval_dataset.class"],
         **self.args["eval_dataset.params"])
     if self._custom_dataset is None:
         logging.info("WARNING: no validation dataset is provided "
                      "in CriterionValidator for validation process.")
         self._validate_criterion = False
         return self
     from neurst.exps.evaluator import Evaluator
     with training_utils.get_strategy_scope(strategy):
         self._criterion_model = Evaluator.build_evaluation_model(
             task, model, self._criterion)
         self._eval_tfds = training_utils.build_datasets(
             compat.ModeKeys.EVAL, strategy, self._custom_dataset, task,
             True, self._eval_task_args)
     self._criterion_metric = self._criterion.as_metric()
     if isinstance(self._custom_dataset, MultipleDataset):
         self._criterion_recorder = {
             name: training_utils.TrainingStatusRecorder(
                 model=model, task=task, metric=self._criterion_metric)
             for name in self._custom_dataset.datasets
         }
         self._avg_criterion_recorder = training_utils.TrainingStatusRecorder(
             model=model, task=task, metric=self._criterion_metric)
         self._mixed_criterion_recorder = training_utils.TrainingStatusRecorder(
             model=model, task=task, metric=self._criterion_metric)
     else:
         self._criterion_recorder = training_utils.TrainingStatusRecorder(
             model=model, task=task, metric=self._criterion_metric)
     self._criterion_start_time = time.time()
     return self
Example #7
 def __init__(self, args, **kwargs):
     """ Initializes a util class for training neural models. """
     super(Trainer, self).__init__(**kwargs)
     self._tb_log_dir = args["tb_log_dir"]
     self._train_steps = args["train_steps"]
     self._summary_steps = args["summary_steps"]
     self._save_checkpoint_steps = args["save_checkpoint_steps"]
     self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
     self._initial_global_step = args["initial_global_step"]
     self._pretrain_model = flatten_string_list(args["pretrain_model"])
     self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
     if self._pretrain_model and self._pretrain_variable_pattern is None:
         self._pretrain_variable_pattern = [None] * len(
             self._pretrain_model)
     assert (
         (self._pretrain_model is None
          and self._pretrain_variable_pattern is None) or len(
              self._pretrain_model) == len(self._pretrain_variable_pattern)
         or len(self._pretrain_model) == 1
     ), ("`pretrain_variable_pattern` must match with `pretrain_model`.")
     if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
         self._pretrain_variable_pattern = [None] * len(
             self._pretrain_model)
     self._update_cycle = args["update_cycle"]
     self._clip_value = args["clip_value"]
     self._clip_norm = args["clip_norm"]
     self._hvd_backend = self.strategy if self.strategy in [
         "byteps", "horovod"
     ] else None
     with training_utils.get_strategy_scope(self.strategy):
         self._criterion = build_criterion(args)
         self._criterion.set_model(self.model)
         self._lr_schedule = build_lr_schedule(args)
         optimizer = build_optimizer(args)
         assert optimizer is not None, "optimizer parameters must be provided for training."
         self._optimizer = _handle_fp16_and_distributed_optimizer(
             optimizer, self._lr_schedule, self._hvd_backend)
     self._validator = build_validator(args)
     self._experimental_count_batch_num = args[
         "experimental_count_batch_num"]
Example #8
def run_experiment(args, remaining_argv):
    strategy = training_utils.handle_distribution_strategy(
        args["distribution_strategy"])
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    training_utils.startup_env(
        dtype=args["dtype"],
        enable_check_numerics=args["enable_check_numerics"],
        enable_xla=args["enable_xla"])

    # create exps: trainer, evaluator or ...
    with training_utils.get_strategy_scope(strategy):
        task = build_task(args)
        custom_dataset = build_dataset(args)
        try:
            model = task.build_model(args)
        except AttributeError:
            model = None
        entry = build_exp(args,
                          strategy=strategy,
                          model=model,
                          task=task,
                          model_dir=args["model_dir"],
                          custom_dataset=custom_dataset)
    entry.run()
Example #9
    def run(self):
        """ Training a neural model.

        Step 1: Create the training model.
        Step 2: Restore checkpoint/pretrain model/global_step if exists.
        Step 3: Fetch training data.
        Step 4: Build training callbacks.
        Step 5: TRAIN!!!
        """
        if self._hvd_backend == "horovod":
            import horovod.tensorflow.keras as hvd
        elif self._hvd_backend == "byteps":
            import byteps.tensorflow.keras as hvd

        tfds = training_utils.build_datasets(compat.ModeKeys.TRAIN,
                                             self.strategy,
                                             self.custom_dataset, self.task)
        if isinstance(self.custom_dataset, MultipleDataset):
            _tfds = None
            for _, ds in tfds.items():
                if _tfds is None:
                    _tfds = ds
                else:
                    _tfds = _tfds.concatenate(ds)
            tfds = _tfds
        tfds = tfds.prefetch(tf.data.experimental.AUTOTUNE)
        # Step 1: create a model
        with training_utils.get_strategy_scope(self.strategy):
            inps = self.task.create_inputs(compat.ModeKeys.TRAIN)
            formatted_inps = self.task.example_to_input(
                inps, compat.ModeKeys.TRAIN)
            model_out = self.model(formatted_inps, is_training=True)
            for metric_layer in self.task.build_metric_layer():
                model_out = metric_layer([formatted_inps, model_out])
            if (LooseVersion(tf.__version__) < LooseVersion("2.3")
                    or LooseVersion(tf.__version__) >= LooseVersion("2.5")):
                logging.info(
                    f"Warning: Need further check on AccumgradKerasModel when TF version={tf.__version__}. "
                    f"Here we ignore update_cycle={self._update_cycle}, "
                    f"clip_value={self._clip_value}, clip_norm={self._clip_norm}."
                )
                keras_model = tf.keras.Model(inps, model_out)
            elif compat.IS_PREV_TF_2_4_0:
                from neurst.training.gradaccum_keras_model import TF23GradAccumKerasModel
                keras_model = TF23GradAccumKerasModel(
                    inps,
                    model_out,
                    update_cycle=self._update_cycle,
                    clip_value=self._clip_value,
                    clip_norm=self._clip_norm,
                    freeze_variables=self._freeze_variables)
            else:
                keras_model = GradAccumKerasModel(
                    inps,
                    model_out,
                    update_cycle=self._update_cycle,
                    clip_value=self._clip_value,
                    clip_norm=self._clip_norm,
                    freeze_variables=self._freeze_variables)

            loss = self._criterion.reduce_loss(formatted_inps, model_out)
            if compat.is_tf_tensor(loss) or isinstance(loss, (list, tuple)):
                keras_model.add_loss(loss)
            elif isinstance(loss, dict):
                for _name, _loss in loss.items():
                    keras_model.add_loss(_loss)
                    keras_model.add_metric(_loss,
                                           name=_name + "_mean",
                                           aggregation="mean")
            else:
                raise ValueError("criterion.reduce_loss returns "
                                 "unsupported value of type: {}".format(
                                     type(loss)))
            self._restore_ckpt_or_pretrain()
            self._lr_schedule = build_lr_schedule(self._lr_schedule_args)
            if self._pruning_schedule is not None:
                self._optimizer = create_pruning_optimizer(
                    self._optimizer,
                    self.model,
                    self._pruning_schedule,
                    pruning_variable_pattern=self._pruning_variable_pattern,
                    nopruning_variable_pattern=self.
                    _nopruning_variable_pattern,
                    keep_prune_property=True)
            self._optimizer = training_utils.handle_fp16_and_distributed_optimizer(
                self._optimizer, self._lr_schedule, self._hvd_backend)
            if self._hvd_backend is None:
                keras_model.compile(self._optimizer)
            else:
                # NOTE: we already add Horovod DistributedOptimizer in `handle_fp16_and_distributed_optimizer`.
                # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
                # uses hvd.DistributedOptimizer() to compute gradients.
                keras_model.compile(self._optimizer,
                                    experimental_run_tf_function=False)
            keras_model.summary()
            summary_model_variables(self.model, self._freeze_variables)
        # initialize the checkpoint manager
        _ = compat.get_saver_or_default(
            self.model,
            self.model_dir,
            max_to_keep=self._checkpoints_max_to_keep)
        # build training callbacks
        if not self._tb_log_dir:
            self._tb_log_dir = os.path.join(self.model_dir, "train")

        training_callbacks = [
            MetricReductionCallback(self.strategy,
                                    self._summary_steps,
                                    self._tb_log_dir,
                                    device="GPU:0",
                                    lr_schedule=self._lr_schedule)
        ]
        if self._hvd_backend is None or hvd.rank() == 0:
            training_callbacks.append(
                CustomCheckpointCallback(
                    self.task.model_configs(self.model),
                    save_checkpoint_steps=self._save_checkpoint_steps))
            if self._validator is not None:
                training_callbacks.append(
                    self._validator.build(self.strategy, self.task,
                                          self.model))
        if self._hvd_backend is not None:
            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based training.
            # NOTE!!! HERE we already integrate the metric averaging behaviour into the MetricReductionCallback.
            # training_callbacks.insert(0, hvd.callbacks.MetricAverageCallback(device="GPU:0"))

            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            training_callbacks.insert(
                0,
                hvd.callbacks.BroadcastGlobalVariablesCallback(0,
                                                               device="GPU:0"))
            if self._lr_schedule is not None:
                training_callbacks.append(
                    LearningRateScheduler(self._lr_schedule))

        if self._experimental_count_batch_num:
            logging.info("Scanning the dataset......")
            iterator = iter(
                training_utils.maybe_distribution_dataset(self.strategy, tfds))
            cnt = 0
            for _ in iterator:
                cnt += 1
            logging.info(f"Total {cnt} batches per EPOCH.")

        history = keras_model.fit(
            map_data_for_keras(tfds.repeat()),
            initial_epoch=0,
            epochs=1,
            steps_per_epoch=self._train_steps,  # * args["update_cycle"],
            verbose=2,
            callbacks=training_callbacks)
        logging.info(history.history)
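
The `update_cycle` option used above relies on gradient accumulation: gradients of several micro-batches are summed and the optimizer is applied once per cycle, so the effective batch size grows by that factor. A generic sketch of the underlying technique with a toy model (not the library's GradAccumKerasModel, just the idea):

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(0.1)
update_cycle = 4

accumulated = None
xs = tf.split(tf.random.normal([16, 3]), 16)
ys = tf.split(tf.random.normal([16, 1]), 16)
for step, (x, y) in enumerate(zip(xs, ys)):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(model(x) - y))
    grads = tape.gradient(loss, model.trainable_variables)
    accumulated = grads if accumulated is None else [a + g for a, g in zip(accumulated, grads)]
    if (step + 1) % update_cycle == 0:
        # One optimizer step per cycle, using the averaged accumulated gradients.
        optimizer.apply_gradients(zip([a / update_cycle for a in accumulated],
                                      model.trainable_variables))
        accumulated = None
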
Example #10
    def run(self):
        """ Sequence generation from an existing model checkpoint.

        Step 1: Build model and restore checkpoints.
        Step 2: Build test dataset.
        Step 3: Sequence generation.
        Step 4: Evaluation using metric.
        """
        # Steps 1 & 2: Build model, restore checkpoints and build the test dataset.
        with training_utils.get_strategy_scope(self.strategy):
            model = self._build_and_restore_model()
            keras_model = self.build_generation_model(self.task, model,
                                                      self._search_layer)
            tfds = training_utils.build_datasets(compat.ModeKeys.INFER,
                                                 self.strategy,
                                                 self.custom_dataset,
                                                 self.task)
            keras_model.summary()
            summary_model_variables(keras_model)

        # Step 3: Sequence generation.
        start_time = time.time()
        results = training_utils.make_predictions(
            self.strategy,
            keras_model,
            tfds,
            self.custom_dataset,
            map_func=lambda y: SequenceGenerator.postprocess_generation(
                self.task, y))
        logging.info("Generation elapsed: %.2fs", time.time() - start_time)

        if self._output_file:
            if isinstance(self.custom_dataset, MultipleDataset):
                if isinstance(self._output_file, dict):
                    for name in results:
                        if self._output_file.get(name, None):
                            with tf.io.gfile.GFile(self._output_file[name],
                                                   "w") as fw:
                                fw.write("\n".join(results[name]) + "\n")
                            logging.info(
                                "Saving generation results of dataset {} into {}"
                                .format(name, self._output_file[name]))
                else:
                    logging.info(
                        "Unsupported type of `output_file`={}({}) for MultipleDataset."
                        .format(self._output_file, type(self._output_file)))
            else:
                if isinstance(self._output_file, str):
                    with tf.io.gfile.GFile(self._output_file, "w") as fw:
                        fw.write("\n".join(results) + "\n")
                    logging.info("Saving generation results into {}".format(
                        self._output_file))
                else:
                    logging.info(
                        f"WARNING: No generation results are saved due to unsupported type "
                        f"of `output_file`: {self._output_file} ({type(self._output_file)})"
                    )

        # Step 4: Evaluation using metric.
        def _display(res, name=None):
            if name:
                logging.info(f"Evaluation Result ({name}):")
            else:
                logging.info("Evaluation Result:")
            for k, v in res.items():
                logging.info("   %s=%.2f", k, v)

        if self._metric is not None:
            saving_metrics = dict()
            if isinstance(self.custom_dataset, MultipleDataset):
                on_average = {}
                mixed_dsnames = []
                mixed_hypos = []
                mixed_refs = []
                for name in tfds:
                    assert isinstance(self.custom_dataset.datasets[name],
                                      TextGenDataset)
                    if self.custom_dataset.datasets[name].targets:
                        metric_result = self._metric(
                            results[name],
                            self.custom_dataset.datasets[name].targets)
                        for k, v in metric_result.items():
                            if k not in on_average:
                                on_average[k] = 0.
                            on_average[
                                k] += self.custom_dataset.sample_weights[
                                    name] * v
                        _display(metric_result, name)
                        mixed_dsnames.append(name)
                        mixed_hypos.extend(results[name])
                        mixed_refs.extend(
                            self.custom_dataset.datasets[name].targets)
                        saving_metrics[name] = metric_result
                if len(mixed_dsnames) > 1:
                    _display(
                        on_average,
                        f"on average by weights {self._custom_dataset.sample_weights}"
                    )
                    mixed_metric_result = self._metric(mixed_hypos, mixed_refs)
                    _display(mixed_metric_result,
                             "mixed of {}".format(",".join(mixed_dsnames)))
                    saving_metrics["MIXED"] = mixed_metric_result

            else:
                assert isinstance(self.custom_dataset, TextGenDataset)
                if self.custom_dataset.targets is not None:
                    metric_result = self._metric(results,
                                                 self.custom_dataset.targets)
                    _display(metric_result)
                    saving_metrics = metric_result
            if self._save_metric is not None:
                logging.info(f"Saving metric results into {self._save_metric}")
                with tf.io.gfile.GFile(self._save_metric, "w") as fw:
                    json.dump(saving_metrics, fw)
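
For intuition, a standalone sketch of the sample-weighted averaging performed above when evaluating on multiple datasets (dataset names, weights and metric values are made up):

sample_weights = {"ds_a": 0.75, "ds_b": 0.25}
metric_results = {"ds_a": {"BLEU": 30.0}, "ds_b": {"BLEU": 20.0}}

on_average = {}
for name, res in metric_results.items():
    for k, v in res.items():
        on_average[k] = on_average.get(k, 0.) + sample_weights[name] * v
print(on_average)  # {'BLEU': 27.5}
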
Example #11
 def build(self, strategy, task, model):
     super(SeqGenerationValidator, self).build(strategy, task, model)
     if self._custom_dataset is None:
         logging.info("WARNING: no validation dataset is provided "
                      "in SeqGenerationValidator for validation process.")
         self._validate_gen = False
         return self
     self._gen_metric = task.get_eval_metric(self.args,
                                             name="eval_metric",
                                             ds=self._custom_dataset)
     if self._gen_metric is None:
         logging.info("WARNING: no metric is provided "
                      "in SeqGenerationValidator for validation process.")
         self._validate_gen = False
         return self
     self._gen_metric.flag = self.args["eval_metric.class"]
     search_layer = build_search_layer(
         self.args["eval_search_method.class"],
         **self.args["eval_search_method.params"])
     if search_layer is None:
         logging.info("WARNING: no search method is provided "
                      "in SeqGenerationValidator for validation process.")
         self._validate_gen = False
         return self
     from neurst.exps.sequence_generator import SequenceGenerator
     with training_utils.get_strategy_scope(strategy):
         self._gen_model = SequenceGenerator.build_generation_model(
             task, model, search_layer)
         self._gen_tfds = training_utils.build_datasets(
             compat.ModeKeys.INFER, strategy, self._custom_dataset, task,
             True, self._eval_task_args)
         if isinstance(self._custom_dataset, MultipleDataset):
             for name in list(self._gen_tfds.keys()):
                 if self._custom_dataset.datasets[name].targets is None:
                     logging.info(
                         f"WARNING: no ground truth found for validation dataset {name}."
                     )
                     self._gen_tfds.pop(name)
             if len(self._gen_tfds) == 0:
                 logging.info(
                     "WARNING: no ground truth found for all validation datasets and "
                     "no validation will be applied.")
                 self._validate_gen = False
                 return self
         else:
             if self._custom_dataset.targets is None:
                 logging.info(
                     "WARNING: no ground truth found for validation dataset and "
                     "no validation will be applied.")
                 self._validate_gen = False
                 return self
     if isinstance(self._custom_dataset, MultipleDataset):
         self._gen_recorder = {
             name:
             training_utils.TrainingStatusRecorder(model=model,
                                                   task=task,
                                                   metric=self._gen_metric)
             for name in self._gen_tfds
         }
         self._mixed_gen_recorder = training_utils.TrainingStatusRecorder(
             model=model, task=task, metric=self._gen_metric)
         self._avg_gen_recorder = training_utils.TrainingStatusRecorder(
             model=model,
             task=task,
             metric=self._gen_metric,
             estop_patience=self.args["eval_estop_patience"],
             best_checkpoint_path=self.args["eval_best_checkpoint_path"],
             auto_average_checkpoints=self.
             args["eval_auto_average_checkpoints"],
             best_avg_checkpoint_path=self.
             args["eval_best_avg_checkpoint_path"],
             top_checkpoints_to_keep=self.
             args["eval_top_checkpoints_to_keep"])
     else:
         self._gen_recorder = training_utils.TrainingStatusRecorder(
             model=model,
             task=task,
             metric=self._gen_metric,
             estop_patience=self.args["eval_estop_patience"],
             best_checkpoint_path=self.args["eval_best_checkpoint_path"],
             auto_average_checkpoints=self.
             args["eval_auto_average_checkpoints"],
             best_avg_checkpoint_path=self.
             args["eval_best_avg_checkpoint_path"],
             top_checkpoints_to_keep=self.
             args["eval_top_checkpoints_to_keep"])
     from neurst.exps.sequence_generator import SequenceGenerator
     self._postprocess_fn = lambda y: SequenceGenerator.postprocess_generation(
         task, y)
     self._gen_start_time = time.time()
     return self
Example #12
 def __init__(self, args, **kwargs):
     """ Initializes a util class for training neural models. """
     super(Trainer, self).__init__(**kwargs)
     self._tb_log_dir = args["tb_log_dir"]
     self._train_steps = args["train_steps"]
     self._summary_steps = args["summary_steps"]
     self._save_checkpoint_steps = args["save_checkpoint_steps"]
     self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
     self._initial_global_step = args["initial_global_step"]
     self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
     if args["pretrain_model"] and isinstance(args["pretrain_model"][0],
                                              dict):
         self._pretrain_v2 = True
         self._pretrain_model = args["pretrain_model"]
         if self._pretrain_variable_pattern:
             logging.info(
                 "Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                 f"{self._pretrain_variable_pattern}")
     else:
         self._pretrain_v2 = False
         self._pretrain_model = flatten_string_list(args["pretrain_model"])
         if args["mask_dir"]:
             self.mask_dir = args["mask_dir"][0]
             # print(self.mask_dir)
             # self.load_mask = np.load(self.mask_dir, allow_pickle=True)
             with open(self.mask_dir, 'rb') as f:
                 self.load_mask = pickle.load(f)
             # i = 0
             # for weight in self.load_mask:
             #     if  i <= 1000:
             #         tf.print(weight.name, output_stream='file://./mask.txt')
             #         if weight.shape.ndims > 0:
             #             tf.print(weight[:1], output_stream='file://./mask.txt', summarize=-1, name=weight.name)
             #         else:
             #             tf.print(weight, output_stream='file://./mask.txt', summarize=-1, name=weight.name)
             #     else:
             #         i += 1
         else:
             self.mask_dir = os.path.join(self.model_dir, "mask.pkl")
             self.load_mask = None
         if self._pretrain_model:
             if self._pretrain_variable_pattern is None:
                 self._pretrain_variable_pattern = [None] * len(
                     self._pretrain_model)
             elif isinstance(self._pretrain_variable_pattern, str):
                 self._pretrain_variable_pattern = [
                     self._pretrain_variable_pattern
                 ]
         assert (
             (self._pretrain_model is None
              and self._pretrain_variable_pattern is None)
             or len(self._pretrain_model) == len(
                 self._pretrain_variable_pattern)
             or len(self._pretrain_model) == 1
         ), ("`pretrain_variable_pattern` must match with `pretrain_model`."
             )
         if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
             self._pretrain_variable_pattern = [None] * len(
                 self._pretrain_model)
     self._update_cycle = args["update_cycle"]
     self._clip_value = args["clip_value"]
     self._clip_norm = args["clip_norm"]
     self._hvd_backend = self.strategy if self.strategy in [
         "byteps", "horovod"
     ] else None
     with training_utils.get_strategy_scope(self.strategy):
         self._criterion = build_criterion(args)
         self._criterion.set_model(self.model)
         self._lr_schedule_args = args
         if compat.IS_PREV_TF_2_4_0:
             self._optimizer = build_optimizer(args)
         else:
             self._optimizer = build_optimizer(args,
                                               clipnorm=self._clip_norm,
                                               clipvalue=self._clip_value)
         assert self._optimizer is not None, "optimizer parameters must be provided for training."
     self._validator = build_validator(args)
     self._experimental_count_batch_num = args[
         "experimental_count_batch_num"]
     self._freeze_variables = args["freeze_variables"]
     self._pruning_schedule = build_pruning_schedule(args)
     self._partial_tuning = args["partial_tuning"]
     self._pruning_variable_pattern = args["pruning_variable_pattern"]
     self._nopruning_variable_pattern = args["nopruning_variable_pattern"]
Example #13
def _main(_):
    # define and parse program flags
    arg_parser = flags_core.define_flags(FLAG_LIST)
    args, remaining_argv = flags_core.parse_flags(FLAG_LIST, arg_parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    strategy = training_utils.handle_distribution_strategy(
        args["distribution_strategy"])
    training_utils.startup_env(
        dtype=args["dtype"],
        enable_xla=False,
        enable_check_numerics=args["enable_check_numerics"])

    asr_task, asr_model = _build_task_model(strategy,
                                            args["asr_model_dir"],
                                            batch_size=args["batch_size"])
    mt_task, mt_model = _build_task_model(strategy,
                                          args["mt_model_dir"],
                                          batch_size=args["batch_size"])
    audio_dataset = build_dataset(args)
    # ========= ASR ==========
    asr_output_file = args["asr_output_file"]
    if asr_output_file is None:
        asr_output_file = "ram://asr_output_file"
    logging.info("Creating ASR generator.")
    with training_utils.get_strategy_scope(strategy):
        asr_generator = build_exp(
            {
                "class": SequenceGenerator,
                "params": {
                    "output_file": asr_output_file,
                    "search_method.class": args["asr_search_method.class"],
                    "search_method.params": args["asr_search_method.params"],
                }
            },
            strategy=strategy,
            model=asr_model,
            task=asr_task,
            model_dir=args["asr_model_dir"],
            custom_dataset=audio_dataset)
    asr_generator.run()
    if hasattr(audio_dataset,
               "transcripts") and audio_dataset.transcripts is not None:
        asr_metric = asr_task.get_eval_metric(args, "asr_metric")
        with tf.io.gfile.GFile(asr_output_file, "r") as fp:
            metric_result = asr_metric([line.strip() for line in fp],
                                       audio_dataset.transcripts)
        logging.info("Evaluation Result of ASR:")
        for k, v in metric_result.items():
            logging.info("   %s=%.2f", k, v)

    logging.info("Creating MT generator.")
    mt_reference_file = "ram://mt_reference_file"
    with tf.io.gfile.GFile(mt_reference_file, "w") as fw:
        for x in audio_dataset.targets:
            fw.write(x.strip() + "\n")

    with training_utils.get_strategy_scope(strategy):
        mt_generator = build_exp(
            {
                "class": SequenceGenerator,
                "params": {
                    "output_file": args["mt_output_file"],
                    "search_method.class": args["mt_search_method.class"],
                    "search_method.params": args["mt_search_method.params"],
                    "metric.class": args["mt_metric.class"],
                    "metric.params": args["mt_metric.params"]
                }
            },
            strategy=strategy,
            model=mt_model,
            task=mt_task,
            model_dir=args["mt_model_dir"],
            custom_dataset=build_dataset({
                "class": ParallelTextDataset,
                "params": {
                    "src_file": asr_output_file,
                    "trg_file": mt_reference_file
                }
            }))
    mt_generator.run()
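
The intermediate ASR hypotheses above are written to a "ram://" path, TensorFlow's in-memory filesystem exposed through tf.io.gfile, so the cascade can hand text from the ASR stage to the MT stage without a temporary file on disk. A minimal standalone sketch of that mechanism:

import tensorflow as tf

path = "ram://tmp_hypotheses"  # process-local, in-memory
with tf.io.gfile.GFile(path, "w") as fw:
    fw.write("first hypothesis\nsecond hypothesis\n")
with tf.io.gfile.GFile(path, "r") as fp:
    print([line.strip() for line in fp])  # ['first hypothesis', 'second hypothesis']
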
Example #14
def _build_task_model(strategy, model_dir, batch_size):
    with training_utils.get_strategy_scope(strategy):
        model_configs = ModelConfigs.load(model_dir)
        task = build_task(model_configs, batch_size=batch_size)
        model = task.build_model(model_configs)
        return task, model