def run(self):
    """ Export savedmodel for sequence generator.

    Step 1: Build model and restore checkpoints.
    Step 2: Export.
    """
    with training_utils.get_strategy_scope(self.strategy):
        # Step 1: Build model and restore checkpoints.
        model = self._build_and_restore_model()
        keras_model = self.build_generation_model(self.task, model, self._search_layer)
        keras_model.summary()
        summary_model_variables(keras_model)
    # Step 2: Export.
    export_path = os.path.join(self._export_path, str(self._version))
    logging.info("Saving model to {}".format(export_path))
    tf.keras.models.save_model(
        keras_model,
        export_path,
        overwrite=True,
        include_optimizer=False,
        save_format=None,
        signatures=None,
        options=None)
    loaded = tf.saved_model.load(export_path)
    logging.info("========== signatures ==========")
    for x in loaded.signatures.keys():
        logging.info(f"structured outputs for {x}:")
        logging.info("    {}".format(str(loaded.signatures[x].structured_outputs)))
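
# A minimal, self-contained sketch (toy model, illustrative path) of the
# export-then-inspect round-trip above: save a Keras model as a SavedModel,
# reload it, and print the structured outputs of each serving signature.
import tensorflow as tf

toy = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
tf.keras.models.save_model(toy, "/tmp/toy_savedmodel", include_optimizer=False)
loaded = tf.saved_model.load("/tmp/toy_savedmodel")
for name, sig in loaded.signatures.items():
    # Each signature is a ConcreteFunction whose `structured_outputs`
    # describes the nested output spec that serving clients receive.
    print(name, sig.structured_outputs)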
def run(self):
    """ Evaluation on an existing model.

    Step 1: Build model.
    Step 2: Build evaluation dataset.
    Step 3: Restore checkpoints.
    Step 4: Evaluate and reduce metric.
    """
    assert not isinstance(self.custom_dataset, MultipleDataset), (
        "SequenceEvaluator only supports single dataset.")
    with training_utils.get_strategy_scope(self.strategy):
        # Step 1-2: Build model and evaluation dataset.
        tfds = training_utils.build_datasets(compat.ModeKeys.EVAL, self.strategy,
                                             self.custom_dataset, self.task)
        keras_model = self.build_evaluation_model(self.task, self.model, self._criterion)
        keras_model.summary()
        summary_model_variables(keras_model)
        # Step 3: Restore checkpoints.
        stat = restore_checkpoint_if_possible(self.model, self.model_dir)
        if not stat:
            logging.info(f"WARNING: Failed to restore checkpoint from {self.model_dir}. "
                         "We assume this was done on purpose.")
        # Step 4: Evaluate and reduce metric.
        predict_fn = keras_model.make_predict_function()
        iterator = iter(training_utils.maybe_distribution_dataset(
            self.strategy, tfds.prefetch(tf.data.experimental.AUTOTUNE)))
        with tf.io.gfile.GFile(self._output_file, "w") as fw:
            while True:
                try:
                    preds = predict_fn(iterator)
                    for pred in self._criterion.reduce_sample_metrics(preds):
                        fw.write(str(pred) + "\n")
                except (StopIteration, tf.errors.OutOfRangeError):
                    break
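
# A hedged standalone sketch (toy model, illustrative names) of the
# drain-until-exhausted pattern used above: Keras' `make_predict_function`
# returns a function that consumes one batch from a dataset iterator per call
# and signals exhaustion via StopIteration/OutOfRangeError.
import tensorflow as tf

toy = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
predict_fn = toy.make_predict_function()
iterator = iter(tf.data.Dataset.from_tensor_slices(tf.ones([10, 3])).batch(4))
while True:
    try:
        preds = predict_fn(iterator)  # predictions for a single batch
        print(preds.shape)            # (4, 1), (4, 1), then (2, 1)
    except (StopIteration, tf.errors.OutOfRangeError):
        break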
def run_experiment(args, remaining_argv):
    strategy = training_utils.handle_distribution_strategy(args["distribution_strategy"])
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    training_utils.startup_env(
        dtype=args["dtype"],
        enable_check_numerics=args["enable_check_numerics"],
        enable_xla=args["enable_xla"])

    # Initialize parameters for quantization.
    if args.get("quant_params", None) is None:
        args["quant_params"] = {}
    QuantLayer.global_init(args["enable_quant"], **args["quant_params"])

    # Create exps: trainer, evaluator or ...
    with training_utils.get_strategy_scope(strategy):
        task = build_task(args)
        custom_dataset = build_dataset(args)
        try:
            model = task.build_model(args)
            training_utils.validate_unique_varname(model.weights)
        except AttributeError:
            model = None
        entry = build_exp(args,
                          strategy=strategy,
                          model=model,
                          task=task,
                          model_dir=args["model_dir"],
                          custom_dataset=custom_dataset)
    entry.run()
def __init__(self, args, **kwargs):
    """ Initializes a util class for training neural models. """
    super(Trainer, self).__init__(**kwargs)
    self._tb_log_dir = args["tb_log_dir"]
    self._train_steps = args["train_steps"]
    self._summary_steps = args["summary_steps"]
    self._save_checkpoint_steps = args["save_checkpoint_steps"]
    self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
    self._initial_global_step = args["initial_global_step"]
    self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
    if args["pretrain_model"] and isinstance(args["pretrain_model"][0], dict):
        self._pretrain_v2 = True
        self._pretrain_model = args["pretrain_model"]
        if self._pretrain_variable_pattern:
            logging.info("Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                         f"{self._pretrain_variable_pattern}")
    else:
        self._pretrain_v2 = False
        self._pretrain_model = flatten_string_list(args["pretrain_model"])
        if self._pretrain_model and self._pretrain_variable_pattern is None:
            self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
        assert ((self._pretrain_model is None and self._pretrain_variable_pattern is None)
                or len(self._pretrain_model) == len(self._pretrain_variable_pattern)
                or len(self._pretrain_model) == 1), (
            "`pretrain_variable_pattern` must match with `pretrain_model`.")
        if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
            self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    self._update_cycle = args["update_cycle"]
    self._clip_value = args["clip_value"]
    self._clip_norm = args["clip_norm"]
    self._hvd_backend = self.strategy if self.strategy in ["byteps", "horovod"] else None
    with training_utils.get_strategy_scope(self.strategy):
        self._criterion = build_criterion(args)
        self._criterion.set_model(self.model)
        self._lr_schedule_args = args
        if compat.IS_PREV_TF_2_4_0:
            self._optimizer = build_optimizer(args)
        else:
            self._optimizer = build_optimizer(args, clipnorm=self._clip_norm,
                                              clipvalue=self._clip_value)
        assert self._optimizer is not None, "optimizer parameters must be provided for training."
        self._validator = build_validator(args)
    self._experimental_count_batch_num = args["experimental_count_batch_num"]
    self._freeze_variables = args["freeze_variables"]
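
# Hedged illustration of the version split above: on TF 2.4+ the clipping
# thresholds can be passed directly to a stock Keras optimizer, mirroring the
# `build_optimizer(args, clipnorm=..., clipvalue=...)` call; values here are
# illustrative.
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3,
                                     clipnorm=1.0,   # clip each gradient by its L2 norm
                                     clipvalue=0.5)  # clip each gradient element-wise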
def run(self):
    """ Evaluation on an existing model.

    Step 1: Build model.
    Step 2: Build evaluation dataset.
    Step 3: Restore checkpoints.
    Step 4: Evaluate and reduce metric.
    """
    with training_utils.get_strategy_scope(self.strategy):
        # Step 1-2: Build model and evaluation dataset.
        tfds = training_utils.build_datasets(compat.ModeKeys.EVAL, self.strategy,
                                             self.custom_dataset, self.task, cache=True)
        keras_model = self.build_evaluation_model(self.task, self.model, self._criterion)
        keras_model.summary()
        summary_model_variables(keras_model)
        # Step 3: Restore checkpoints.
        stat = restore_checkpoint_if_possible(self.model, self.model_dir)
        if not stat:
            logging.info(f"WARNING: Failed to restore checkpoint from {self.model_dir}. "
                         "We assume this was done on purpose.")
    # Step 4: Evaluate and reduce metric.
    start_time = time.time()
    results, avg_res, whole_res = training_utils.reduce_eval_results(
        self._criterion, self.custom_dataset,
        training_utils.make_predictions(self.strategy, keras_model, tfds, self.custom_dataset))
    logging.info("Evaluation elapsed: %.2fs", time.time() - start_time)

    def _display(res, name=None):
        if name:
            logging.info(f"Evaluation Results ({name}):")
        for k, v in res.items():
            logging.info("   %s: %.2f", k, v)

    if not isinstance(self.custom_dataset, MultipleDataset):
        _display(results)
    else:
        for name, res in results.items():
            _display(res, name)
        _display(avg_res, f"on average by weights {self.custom_dataset.sample_weights}")
        _display(whole_res, "mixed")
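
# Rough sketch of the weighted reduction reported by `_display(avg_res, ...)`:
# per-dataset metric dicts are averaged by dataset sample weights. NeurST's
# `reduce_eval_results` is more involved; this only illustrates the idea.
def weighted_average(per_dataset_results, sample_weights):
    avg = {}
    for name, res in per_dataset_results.items():
        for metric, value in res.items():
            avg[metric] = avg.get(metric, 0.) + sample_weights[name] * value
    return avg

print(weighted_average({"ds1": {"loss": 2.0}, "ds2": {"loss": 4.0}},
                       {"ds1": 0.25, "ds2": 0.75}))  # {'loss': 3.5}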
def build(self, strategy, task, model):
    """ Initializes. """
    self._strategy = strategy
    self._criterion: Criterion = build_criterion(self.args["eval_criterion.class"],
                                                 **self.args["eval_criterion.params"])
    if self._criterion is None:
        logging.info("WARNING: no criterion is provided in CriterionValidator "
                     "for validation process.")
        self._validate_criterion = False
        return self
    self._criterion.set_model(model)
    self._custom_dataset = build_dataset(self.args["eval_dataset.class"],
                                         **self.args["eval_dataset.params"])
    if self._custom_dataset is None:
        logging.info("WARNING: no validation dataset is provided "
                     "in CriterionValidator for validation process.")
        self._validate_criterion = False
        return self
    from neurst.exps.evaluator import Evaluator
    with training_utils.get_strategy_scope(strategy):
        self._criterion_model = Evaluator.build_evaluation_model(task, model, self._criterion)
        self._eval_tfds = training_utils.build_datasets(
            compat.ModeKeys.EVAL, strategy, self._custom_dataset, task, True,
            self._eval_task_args)
    self._criterion_metric = self._criterion.as_metric()
    if isinstance(self._custom_dataset, MultipleDataset):
        self._criterion_recorder = {
            name: training_utils.TrainingStatusRecorder(
                model=model, task=task, metric=self._criterion_metric)
            for name in self._custom_dataset.datasets
        }
        self._avg_criterion_recorder = training_utils.TrainingStatusRecorder(
            model=model, task=task, metric=self._criterion_metric)
        self._mixed_criterion_recorder = training_utils.TrainingStatusRecorder(
            model=model, task=task, metric=self._criterion_metric)
    else:
        self._criterion_recorder = training_utils.TrainingStatusRecorder(
            model=model, task=task, metric=self._criterion_metric)
    self._criterion_start_time = time.time()
    return self
def __init__(self, args, **kwargs):
    """ Initializes a util class for training neural models. """
    super(Trainer, self).__init__(**kwargs)
    self._tb_log_dir = args["tb_log_dir"]
    self._train_steps = args["train_steps"]
    self._summary_steps = args["summary_steps"]
    self._save_checkpoint_steps = args["save_checkpoint_steps"]
    self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
    self._initial_global_step = args["initial_global_step"]
    self._pretrain_model = flatten_string_list(args["pretrain_model"])
    self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
    if self._pretrain_model and self._pretrain_variable_pattern is None:
        self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    assert ((self._pretrain_model is None and self._pretrain_variable_pattern is None)
            or len(self._pretrain_model) == len(self._pretrain_variable_pattern)
            or len(self._pretrain_model) == 1), (
        "`pretrain_variable_pattern` must match with `pretrain_model`.")
    if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
        self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    self._update_cycle = args["update_cycle"]
    self._clip_value = args["clip_value"]
    self._clip_norm = args["clip_norm"]
    self._hvd_backend = self.strategy if self.strategy in ["byteps", "horovod"] else None
    with training_utils.get_strategy_scope(self.strategy):
        self._criterion = build_criterion(args)
        self._criterion.set_model(self.model)
        self._lr_schedule = build_lr_schedule(args)
        optimizer = build_optimizer(args)
        assert optimizer is not None, "optimizer parameters must be provided for training."
        self._optimizer = _handle_fp16_and_distributed_optimizer(
            optimizer, self._lr_schedule, self._hvd_backend)
    self._validator = build_validator(args)
    self._experimental_count_batch_num = args["experimental_count_batch_num"]
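
# A hedged stand-in for the fp16 part of `_handle_fp16_and_distributed_optimizer`
# (the distributed part would additionally wrap with hvd.DistributedOptimizer):
# on TF 2.4+ a Keras optimizer is typically wrapped for mixed precision like
# this. This mirrors the wrapper's intent, not NeurST's exact implementation.
import tensorflow as tf

opt = tf.keras.optimizers.Adam(1e-3)
opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)  # dynamic loss scaling by default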
def run_experiment(args, remaining_argv):
    strategy = training_utils.handle_distribution_strategy(args["distribution_strategy"])
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    training_utils.startup_env(
        dtype=args["dtype"],
        enable_check_numerics=args["enable_check_numerics"],
        enable_xla=args["enable_xla"])

    # Create exps: trainer, evaluator or ...
    with training_utils.get_strategy_scope(strategy):
        task = build_task(args)
        custom_dataset = build_dataset(args)
        try:
            model = task.build_model(args)
        except AttributeError:
            model = None
        entry = build_exp(args,
                          strategy=strategy,
                          model=model,
                          task=task,
                          model_dir=args["model_dir"],
                          custom_dataset=custom_dataset)
    entry.run()
def run(self):
    """ Training a neural model.

    Step 1: Create the training model.
    Step 2: Restore checkpoint/pretrain model/global_step if exists.
    Step 3: Fetch training data.
    Step 4: Build training callbacks.
    Step 5: TRAIN!!!
    """
    if self._hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
    elif self._hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd

    # Step 3: Fetch training data.
    tfds = training_utils.build_datasets(compat.ModeKeys.TRAIN, self.strategy,
                                         self.custom_dataset, self.task)
    if isinstance(self.custom_dataset, MultipleDataset):
        _tfds = None
        for _, ds in tfds.items():
            if _tfds is None:
                _tfds = ds
            else:
                _tfds = _tfds.concatenate(ds)
        tfds = _tfds
    tfds = tfds.prefetch(tf.data.experimental.AUTOTUNE)

    # Step 1: Create the training model.
    with training_utils.get_strategy_scope(self.strategy):
        inps = self.task.create_inputs(compat.ModeKeys.TRAIN)
        formatted_inps = self.task.example_to_input(inps, compat.ModeKeys.TRAIN)
        model_out = self.model(formatted_inps, is_training=True)
        for metric_layer in self.task.build_metric_layer():
            model_out = metric_layer([formatted_inps, model_out])
        if (LooseVersion(tf.__version__) < LooseVersion("2.3")
                or LooseVersion(tf.__version__) >= LooseVersion("2.5")):
            logging.info(f"Warning: Need further check on AccumgradKerasModel when TF version={tf.__version__}. "
                         f"Here we ignore update_cycle={self._update_cycle}, "
                         f"clip_value={self._clip_value}, clip_norm={self._clip_norm}.")
            keras_model = tf.keras.Model(inps, model_out)
        elif compat.IS_PREV_TF_2_4_0:
            from neurst.training.gradaccum_keras_model import TF23GradAccumKerasModel
            keras_model = TF23GradAccumKerasModel(inps, model_out,
                                                  update_cycle=self._update_cycle,
                                                  clip_value=self._clip_value,
                                                  clip_norm=self._clip_norm,
                                                  freeze_variables=self._freeze_variables)
        else:
            keras_model = GradAccumKerasModel(inps, model_out,
                                              update_cycle=self._update_cycle,
                                              clip_value=self._clip_value,
                                              clip_norm=self._clip_norm,
                                              freeze_variables=self._freeze_variables)

        loss = self._criterion.reduce_loss(formatted_inps, model_out)
        if compat.is_tf_tensor(loss) or isinstance(loss, (list, tuple)):
            keras_model.add_loss(loss)
        elif isinstance(loss, dict):
            for _name, _loss in loss.items():
                keras_model.add_loss(_loss)
                keras_model.add_metric(_loss, name=_name + "_mean", aggregation="mean")
        else:
            raise ValueError("criterion.reduce_loss returns "
                             "unsupported value of type: {}".format(type(loss)))

        # Step 2: Restore checkpoint/pretrain model/global_step if exists.
        self._restore_ckpt_or_pretrain()
        self._lr_schedule = build_lr_schedule(self._lr_schedule_args)
        if self._pruning_schedule is not None:
            self._optimizer = create_pruning_optimizer(
                self._optimizer, self.model, self._pruning_schedule,
                pruning_variable_pattern=self._pruning_variable_pattern,
                nopruning_variable_pattern=self._nopruning_variable_pattern,
                keep_prune_property=True)
        self._optimizer = training_utils.handle_fp16_and_distributed_optimizer(
            self._optimizer, self._lr_schedule, self._hvd_backend)
        if self._hvd_backend is None:
            keras_model.compile(self._optimizer)
        else:
            # NOTE: we already add Horovod DistributedOptimizer in
            # `handle_fp16_and_distributed_optimizer`.
            # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
            # uses hvd.DistributedOptimizer() to compute gradients.
            keras_model.compile(self._optimizer, experimental_run_tf_function=False)
        keras_model.summary()
        summary_model_variables(self.model, self._freeze_variables)

    # Initialize the checkpoint manager.
    _ = compat.get_saver_or_default(self.model, self.model_dir,
                                    max_to_keep=self._checkpoints_max_to_keep)

    # Step 4: Build training callbacks.
    if not self._tb_log_dir:
        self._tb_log_dir = os.path.join(self.model_dir, "train")
    training_callbacks = [MetricReductionCallback(self.strategy,
                                                  self._summary_steps,
                                                  self._tb_log_dir,
                                                  device="GPU:0",
                                                  lr_schedule=self._lr_schedule)]
    if self._hvd_backend is None or hvd.rank() == 0:
        training_callbacks.append(
            CustomCheckpointCallback(self.task.model_configs(self.model),
                                     save_checkpoint_steps=self._save_checkpoint_steps))
        if self._validator is not None:
            training_callbacks.append(self._validator.build(self.strategy, self.task, self.model))
    if self._hvd_backend is not None:
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based callbacks.
        # NOTE!!! Here we already integrate the metric averaging behaviour into
        # the MetricReductionCallback.
        # training_callbacks.insert(0, hvd.callbacks.MetricAverageCallback(device="GPU:0"))

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        training_callbacks.insert(
            0, hvd.callbacks.BroadcastGlobalVariablesCallback(0, device="GPU:0"))
    if self._lr_schedule is not None:
        training_callbacks.append(LearningRateScheduler(self._lr_schedule))

    if self._experimental_count_batch_num:
        logging.info("Scanning the dataset......")
        iterator = iter(training_utils.maybe_distribution_dataset(self.strategy, tfds))
        cnt = 0
        for _ in iterator:
            cnt += 1
        logging.info(f"Total {cnt} batches per EPOCH.")

    # Step 5: TRAIN!!!
    history = keras_model.fit(
        map_data_for_keras(tfds.repeat()),
        initial_epoch=0,
        epochs=1,
        steps_per_epoch=self._train_steps,  # * args["update_cycle"],
        verbose=2,
        callbacks=training_callbacks)
    logging.info(history.history)
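
# Hedged standalone sketch of the `add_loss`/`add_metric` wiring above: in the
# functional API a symbolic tensor can be registered as an extra training loss,
# and the same tensor exposed as a named running-mean metric in the logs.
# The toy model and names are illustrative only.
import tensorflow as tf

inputs = tf.keras.Input(shape=(16,))
outputs = tf.keras.layers.Dense(8)(inputs)
model = tf.keras.Model(inputs, outputs)
aux_loss = tf.reduce_mean(tf.square(outputs))
model.add_loss(aux_loss)                                              # summed into the objective
model.add_metric(aux_loss, name="aux_loss_mean", aggregation="mean")  # reported in logs
model.compile(optimizer="adam")  # no explicit loss: only add_loss terms are minimized
model.fit(tf.random.normal([32, 16]), epochs=1, verbose=2)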
def run(self):
    """ Sequence generation from an existing model checkpoint.

    Step 1: Build model and restore checkpoints.
    Step 2: Build test dataset.
    Step 3: Sequence generation.
    Step 4: Evaluation using metric.
    """
    # Step 1: Build model and restore checkpoints.
    with training_utils.get_strategy_scope(self.strategy):
        model = self._build_and_restore_model()
        keras_model = self.build_generation_model(self.task, model, self._search_layer)
        # Step 2: Build test dataset.
        tfds = training_utils.build_datasets(compat.ModeKeys.INFER, self.strategy,
                                             self.custom_dataset, self.task)
        keras_model.summary()
        summary_model_variables(keras_model)
    # Step 3: Sequence generation.
    start_time = time.time()
    results = training_utils.make_predictions(
        self.strategy, keras_model, tfds, self.custom_dataset,
        map_func=lambda y: SequenceGenerator.postprocess_generation(self.task, y))
    logging.info("Generation elapsed: %.2fs", time.time() - start_time)

    if self._output_file:
        if isinstance(self.custom_dataset, MultipleDataset):
            if isinstance(self._output_file, dict):
                for name in results:
                    if self._output_file.get(name, None):
                        with tf.io.gfile.GFile(self._output_file[name], "w") as fw:
                            fw.write("\n".join(results[name]) + "\n")
                        logging.info("Saving generation of dataset {} results into {}".format(
                            name, self._output_file[name]))
            else:
                logging.info("Unsupported type of `output_file`={}({}) for MultipleDataset.".format(
                    self._output_file, type(self._output_file)))
        else:
            if isinstance(self._output_file, str):
                with tf.io.gfile.GFile(self._output_file, "w") as fw:
                    fw.write("\n".join(results) + "\n")
                logging.info("Saving generation results into {}".format(self._output_file))
            else:
                logging.info(f"WARNING: No generation results are saved due to unsupported type "
                             f"of `output_file`: {self._output_file} ({type(self._output_file)})")

    # Step 4: Evaluation using metric.
    def _display(res, name=None):
        if name:
            logging.info(f"Evaluation Result ({name}):")
        else:
            logging.info("Evaluation Result:")
        for k, v in res.items():
            logging.info("   %s=%.2f", k, v)

    if self._metric is not None:
        saving_metrics = dict()
        if isinstance(self.custom_dataset, MultipleDataset):
            on_average = {}
            mixed_dsnames = []
            mixed_hypos = []
            mixed_refs = []
            for name in tfds:
                assert isinstance(self.custom_dataset.datasets[name], TextGenDataset)
                if self.custom_dataset.datasets[name].targets:
                    metric_result = self._metric(results[name],
                                                 self.custom_dataset.datasets[name].targets)
                    for k, v in metric_result.items():
                        if k not in on_average:
                            on_average[k] = 0.
                        on_average[k] += self.custom_dataset.sample_weights[name] * v
                    _display(metric_result, name)
                    mixed_dsnames.append(name)
                    mixed_hypos.extend(results[name])
                    mixed_refs.extend(self.custom_dataset.datasets[name].targets)
                    saving_metrics[name] = metric_result
            if len(mixed_dsnames) > 1:
                _display(on_average,
                         f"on average by weights {self.custom_dataset.sample_weights}")
                mixed_metric_result = self._metric(mixed_hypos, mixed_refs)
                _display(mixed_metric_result, "mixed of {}".format(",".join(mixed_dsnames)))
                saving_metrics["MIXED"] = mixed_metric_result
        else:
            assert isinstance(self.custom_dataset, TextGenDataset)
            if self.custom_dataset.targets is not None:
                metric_result = self._metric(results, self.custom_dataset.targets)
                _display(metric_result)
                saving_metrics = metric_result
        if self._save_metric is not None:
            logging.info(f"Saving metric results into {self._save_metric}")
            with tf.io.gfile.GFile(self._save_metric, "w") as fw:
                json.dump(saving_metrics, fw)
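
# Note on argument order, shown with a toy metric: the per-dataset calls above
# pass hypotheses first and references second, and the mixed call follows the
# same convention. The callable below is a hypothetical stand-in returning a
# dict of scores, not NeurST's Metric API.
def toy_accuracy(hypotheses, references):
    hits = sum(h == r for h, r in zip(hypotheses, references))
    return {"Accuracy": 100.0 * hits / max(len(references), 1)}

print(toy_accuracy(["a", "b"], ["a", "c"]))  # {'Accuracy': 50.0}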
def build(self, strategy, task, model):
    super(SeqGenerationValidator, self).build(strategy, task, model)
    if self._custom_dataset is None:
        logging.info("WARNING: no validation dataset is provided "
                     "in SeqGenerationValidator for validation process.")
        self._validate_gen = False
        return self
    self._gen_metric = task.get_eval_metric(self.args, name="eval_metric",
                                            ds=self._custom_dataset)
    if self._gen_metric is None:
        logging.info("WARNING: no metric is provided "
                     "in SeqGenerationValidator for validation process.")
        self._validate_gen = False
        return self
    self._gen_metric.flag = self.args["eval_metric.class"]
    search_layer = build_search_layer(self.args["eval_search_method.class"],
                                      **self.args["eval_search_method.params"])
    if search_layer is None:
        logging.info("WARNING: no search method is provided "
                     "in SeqGenerationValidator for validation process.")
        self._validate_gen = False
        return self
    from neurst.exps.sequence_generator import SequenceGenerator
    with training_utils.get_strategy_scope(strategy):
        self._gen_model = SequenceGenerator.build_generation_model(task, model, search_layer)
        self._gen_tfds = training_utils.build_datasets(
            compat.ModeKeys.INFER, strategy, self._custom_dataset, task, True,
            self._eval_task_args)
    if isinstance(self._custom_dataset, MultipleDataset):
        for name in list(self._gen_tfds.keys()):
            if self._custom_dataset.datasets[name].targets is None:
                logging.info(f"WARNING: no ground truth found for validation dataset {name}.")
                self._gen_tfds.pop(name)
        if len(self._gen_tfds) == 0:
            logging.info("WARNING: no ground truth found for all validation datasets and "
                         "no validation will be applied.")
            self._validate_gen = False
            return self
    else:
        if self._custom_dataset.targets is None:
            logging.info("WARNING: no ground truth found for validation dataset and "
                         "no validation will be applied.")
            self._validate_gen = False
            return self
    if isinstance(self._custom_dataset, MultipleDataset):
        self._gen_recorder = {
            name: training_utils.TrainingStatusRecorder(
                model=model, task=task, metric=self._gen_metric)
            for name in self._gen_tfds
        }
        self._mixed_gen_recorder = training_utils.TrainingStatusRecorder(
            model=model, task=task, metric=self._gen_metric)
        self._avg_gen_recorder = training_utils.TrainingStatusRecorder(
            model=model, task=task, metric=self._gen_metric,
            estop_patience=self.args["eval_estop_patience"],
            best_checkpoint_path=self.args["eval_best_checkpoint_path"],
            auto_average_checkpoints=self.args["eval_auto_average_checkpoints"],
            best_avg_checkpoint_path=self.args["eval_best_avg_checkpoint_path"],
            top_checkpoints_to_keep=self.args["eval_top_checkpoints_to_keep"])
    else:
        self._gen_recorder = training_utils.TrainingStatusRecorder(
            model=model, task=task, metric=self._gen_metric,
            estop_patience=self.args["eval_estop_patience"],
            best_checkpoint_path=self.args["eval_best_checkpoint_path"],
            auto_average_checkpoints=self.args["eval_auto_average_checkpoints"],
            best_avg_checkpoint_path=self.args["eval_best_avg_checkpoint_path"],
            top_checkpoints_to_keep=self.args["eval_top_checkpoints_to_keep"])
    self._postprocess_fn = lambda y: SequenceGenerator.postprocess_generation(task, y)
    self._gen_start_time = time.time()
    return self
def __init__(self, args, **kwargs):
    """ Initializes a util class for training neural models. """
    super(Trainer, self).__init__(**kwargs)
    self._tb_log_dir = args["tb_log_dir"]
    self._train_steps = args["train_steps"]
    self._summary_steps = args["summary_steps"]
    self._save_checkpoint_steps = args["save_checkpoint_steps"]
    self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
    self._initial_global_step = args["initial_global_step"]
    self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
    if args["pretrain_model"] and isinstance(args["pretrain_model"][0], dict):
        self._pretrain_v2 = True
        self._pretrain_model = args["pretrain_model"]
        if self._pretrain_variable_pattern:
            logging.info("Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                         f"{self._pretrain_variable_pattern}")
    else:
        self._pretrain_v2 = False
        self._pretrain_model = flatten_string_list(args["pretrain_model"])
    if args["mask_dir"]:
        self.mask_dir = args["mask_dir"][0]
        # Load the pruning mask (a pickled structure of binary weight masks).
        with open(self.mask_dir, "rb") as f:
            self.load_mask = pickle.load(f)
    else:
        self.mask_dir = os.path.join(self.model_dir, "mask.pkl")
        self.load_mask = None
    if self._pretrain_model:
        if self._pretrain_variable_pattern is None:
            self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
        elif isinstance(self._pretrain_variable_pattern, str):
            self._pretrain_variable_pattern = [self._pretrain_variable_pattern]
    assert ((self._pretrain_model is None and self._pretrain_variable_pattern is None)
            or len(self._pretrain_model) == len(self._pretrain_variable_pattern)
            or len(self._pretrain_model) == 1), (
        "`pretrain_variable_pattern` must match with `pretrain_model`.")
    if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
        self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    self._update_cycle = args["update_cycle"]
    self._clip_value = args["clip_value"]
    self._clip_norm = args["clip_norm"]
    self._hvd_backend = self.strategy if self.strategy in ["byteps", "horovod"] else None
    with training_utils.get_strategy_scope(self.strategy):
        self._criterion = build_criterion(args)
        self._criterion.set_model(self.model)
        self._lr_schedule_args = args
        if compat.IS_PREV_TF_2_4_0:
            self._optimizer = build_optimizer(args)
        else:
            self._optimizer = build_optimizer(args, clipnorm=self._clip_norm,
                                              clipvalue=self._clip_value)
        assert self._optimizer is not None, "optimizer parameters must be provided for training."
        self._validator = build_validator(args)
    self._experimental_count_batch_num = args["experimental_count_batch_num"]
    self._freeze_variables = args["freeze_variables"]
    self._pruning_schedule = build_pruning_schedule(args)
    self._partial_tuning = args["partial_tuning"]
    self._pruning_variable_pattern = args["pruning_variable_pattern"]
    self._nopruning_variable_pattern = args["nopruning_variable_pattern"]
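
# Hedged sketch of the mask round-trip assumed above: `mask.pkl` is a pickled
# structure (e.g. a dict of variable name -> binary numpy array) written once
# by the pruning run and reloaded at trainer construction. All names here are
# illustrative.
import os
import pickle

import numpy as np

mask = {"transformer/dense/kernel": np.random.binomial(1, 0.5, size=(4, 4))}
path = os.path.join("/tmp", "mask.pkl")
with open(path, "wb") as f:
    pickle.dump(mask, f)
with open(path, "rb") as f:
    load_mask = pickle.load(f)
assert np.array_equal(mask["transformer/dense/kernel"],
                      load_mask["transformer/dense/kernel"])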
def _main(_):
    # Define and parse program flags.
    arg_parser = flags_core.define_flags(FLAG_LIST)
    args, remaining_argv = flags_core.parse_flags(FLAG_LIST, arg_parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    strategy = training_utils.handle_distribution_strategy(args["distribution_strategy"])
    training_utils.startup_env(dtype=args["dtype"],
                               enable_xla=False,
                               enable_check_numerics=args["enable_check_numerics"])

    asr_task, asr_model = _build_task_model(strategy, args["asr_model_dir"],
                                            batch_size=args["batch_size"])
    mt_task, mt_model = _build_task_model(strategy, args["mt_model_dir"],
                                          batch_size=args["batch_size"])
    audio_dataset = build_dataset(args)

    # ========= ASR ==========
    asr_output_file = args["asr_output_file"]
    if asr_output_file is None:
        asr_output_file = "ram://asr_output_file"
    logging.info("Creating ASR generator.")
    with training_utils.get_strategy_scope(strategy):
        asr_generator = build_exp(
            {
                "class": SequenceGenerator,
                "params": {
                    "output_file": asr_output_file,
                    "search_method.class": args["asr_search_method.class"],
                    "search_method.params": args["asr_search_method.params"],
                }
            },
            strategy=strategy,
            model=asr_model,
            task=asr_task,
            model_dir=args["asr_model_dir"],
            custom_dataset=audio_dataset)
    asr_generator.run()

    if hasattr(audio_dataset, "transcripts") and audio_dataset.transcripts is not None:
        asr_metric = asr_task.get_eval_metric(args, "asr_metric")
        with tf.io.gfile.GFile(asr_output_file, "r") as fp:
            metric_result = asr_metric([line.strip() for line in fp],
                                       audio_dataset.transcripts)
        logging.info("Evaluation Result of ASR:")
        for k, v in metric_result.items():
            logging.info("   %s=%.2f", k, v)

    # ========= MT ==========
    logging.info("Creating MT generator.")
    mt_reference_file = "ram://mt_reference_file"
    with tf.io.gfile.GFile(mt_reference_file, "w") as fw:
        for x in audio_dataset.targets:
            fw.write(x.strip() + "\n")
    with training_utils.get_strategy_scope(strategy):
        mt_generator = build_exp(
            {
                "class": SequenceGenerator,
                "params": {
                    "output_file": args["mt_output_file"],
                    "search_method.class": args["mt_search_method.class"],
                    "search_method.params": args["mt_search_method.params"],
                    "metric.class": args["mt_metric.class"],
                    "metric.params": args["mt_metric.params"]
                }
            },
            strategy=strategy,
            model=mt_model,
            task=mt_task,
            model_dir=args["mt_model_dir"],
            custom_dataset=build_dataset({
                "class": ParallelTextDataset,
                "params": {
                    "src_file": asr_output_file,
                    "trg_file": mt_reference_file
                }
            }))
    mt_generator.run()
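
# The cascade above pipes ASR output into MT through in-memory files.
# TensorFlow's gfile registers a `ram://` scheme backed by an in-process RAM
# filesystem, so intermediate hypotheses never touch disk; a minimal
# round-trip (hedged, TF 2.4+ behavior) looks like:
import tensorflow as tf

with tf.io.gfile.GFile("ram://scratch_file", "w") as fw:
    fw.write("hello world\n")
with tf.io.gfile.GFile("ram://scratch_file", "r") as fp:
    print([line.strip() for line in fp])  # ['hello world']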
def _build_task_model(strategy, model_dir, batch_size):
    with training_utils.get_strategy_scope(strategy):
        model_configs = ModelConfigs.load(model_dir)
        task = build_task(model_configs, batch_size=batch_size)
        model = task.build_model(model_configs)
    return task, model