def run(self): """ Export savedmodel for sequence generator. Step 1: Build model and restore checkpoints. Step 2: Export. """ with training_utils.get_strategy_scope(self.strategy): model = self._build_and_restore_model() keras_model = self.build_generation_model(self.task, model, self._search_layer) keras_model.summary() summary_model_variables(keras_model) export_path = os.path.join(self._export_path, str(self._version)) logging.info("Saving model to {}".format(export_path)) tf.keras.models.save_model( keras_model, export_path, overwrite=True, include_optimizer=False, save_format=None, signatures=None, options=None) loaded = tf.saved_model.load(export_path) logging.info("========== signatures ==========") for x in loaded.signatures.keys(): logging.info(f"structured outputs for {x}:") logging.info(" {}".format(str(loaded.signatures["serving_default"].structured_outputs)))
def run(self): """ Evaluation on a existing model. Step 1: Build model. Step 2: Builds evaluation dataset. Step 3: Restore checkpoints. Step 4: Evaluate and reduce metric. """ assert not isinstance(self.custom_dataset, MultipleDataset), ( "SequenceEvaluator only supports single dataset.") with training_utils.get_strategy_scope(self.strategy): tfds = training_utils.build_datasets(compat.ModeKeys.EVAL, self.strategy, self.custom_dataset, self.task) keras_model = self.build_evaluation_model(self.task, self.model, self._criterion) keras_model.summary() summary_model_variables(keras_model) # Step 4: Restore checkpoints. stat = restore_checkpoint_if_possible(self.model, self.model_dir) if not stat: logging.info(f"WARNING: Fail to restore checkpoint from {self.model_dir}. " "We assume this was done on purpose. ") # Step 5: Evaluate and reduce metric. predict_fn = keras_model.make_predict_function() iterator = iter(training_utils.maybe_distribution_dataset( self.strategy, tfds.prefetch(tf.data.experimental.AUTOTUNE))) with tf.io.gfile.GFile(self._output_file, "w") as fw: while True: try: preds = predict_fn(iterator) for pred in self._criterion.reduce_sample_metrics(preds): fw.write(str(pred) + "\n") except (StopIteration, tf.errors.OutOfRangeError): break
def run(self): """ Evaluation on a existing model. Step 1: Build model. Step 2: Builds evaluation dataset. Step 3: Restore checkpoints. Step 4: Evaluate and reduce metric. """ with training_utils.get_strategy_scope(self.strategy): tfds = training_utils.build_datasets(compat.ModeKeys.EVAL, self.strategy, self.custom_dataset, self.task, cache=True) keras_model = self.build_evaluation_model(self.task, self.model, self._criterion) keras_model.summary() summary_model_variables(keras_model) # Step 4: Restore checkpoints. stat = restore_checkpoint_if_possible(self.model, self.model_dir) if not stat: logging.info( f"WARNING: Fail to restore checkpoint from {self.model_dir}. " "We assume this was done on purpose. ") # Step 5: Evaluate and reduce metric. start_time = time.time() results, avg_res, whole_res = training_utils.reduce_eval_results( self._criterion, self.custom_dataset, training_utils.make_predictions(self.strategy, keras_model, tfds, self.custom_dataset)) logging.info("Evaluation elapsed: %.2fs", time.time() - start_time) def _display(res, name=None): if name: logging.info(f"Evaluation Results ({name}):") for k, v in res.items(): logging.info(" %s: %.2f", k, v) if not isinstance(self.custom_dataset, MultipleDataset): _display(results) else: for name, res in results.items(): _display(res, name) _display( avg_res, f"on average by weights {self.custom_dataset.sample_weights}") _display(whole_res, "mixed")
def run(self): """ Training a neural model. Step 1: Create training model Step 2: Restore checkpoint/pretrain model/global_step if exists. Step 3: Fetch training data. Step 5: Fetch training training. Step 6: TRAIN!!! """ if self._hvd_backend == "horovod": import horovod.tensorflow.keras as hvd elif self._hvd_backend == "byteps": import byteps.tensorflow.keras as hvd tfds = training_utils.build_datasets(compat.ModeKeys.TRAIN, self.strategy, self.custom_dataset, self.task) if isinstance(self.custom_dataset, MultipleDataset): _tfds = None for _, ds in tfds.items(): if _tfds is None: _tfds = ds else: _tfds = _tfds.concatenate(ds) tfds = _tfds tfds = tfds.prefetch(tf.data.experimental.AUTOTUNE) # Step 1: create a model with training_utils.get_strategy_scope(self.strategy): inps = self.task.create_inputs(compat.ModeKeys.TRAIN) formatted_inps = self.task.example_to_input( inps, compat.ModeKeys.TRAIN) model_out = self.model(formatted_inps, is_training=True) for metric_layer in self.task.build_metric_layer(): model_out = metric_layer([formatted_inps, model_out]) if (LooseVersion(tf.__version__) < LooseVersion("2.3") or LooseVersion(tf.__version__) >= LooseVersion("2.5")): logging.info( f"Warning: Need further check on AccumgradKerasModel when TF version={tf.__version__}. " f"Here we ignore update_cycle={self._update_cycle}, " f"clip_value={self._clip_value}, clip_norm={self._clip_norm}." ) keras_model = tf.keras.Model(inps, model_out) elif compat.IS_PREV_TF_2_4_0: from neurst.training.gradaccum_keras_model import TF23GradAccumKerasModel keras_model = TF23GradAccumKerasModel( inps, model_out, update_cycle=self._update_cycle, clip_value=self._clip_value, clip_norm=self._clip_norm, freeze_variables=self._freeze_variables) else: keras_model = GradAccumKerasModel( inps, model_out, update_cycle=self._update_cycle, clip_value=self._clip_value, clip_norm=self._clip_norm, freeze_variables=self._freeze_variables) loss = self._criterion.reduce_loss(formatted_inps, model_out) if compat.is_tf_tensor(loss) or isinstance(loss, (list, tuple)): keras_model.add_loss(loss) elif isinstance(loss, dict): for _name, _loss in loss.items(): keras_model.add_loss(_loss) keras_model.add_metric(_loss, name=_name + "_mean", aggregation="mean") else: raise ValueError("criterion.reduce_loss returns " "unsupported value of type: {}".format( type(loss))) self._restore_ckpt_or_pretrain() self._lr_schedule = build_lr_schedule(self._lr_schedule_args) if self._pruning_schedule is not None: self._optimizer = create_pruning_optimizer( self._optimizer, self.model, self._pruning_schedule, pruning_variable_pattern=self._pruning_variable_pattern, nopruning_variable_pattern=self. _nopruning_variable_pattern, keep_prune_property=True) self._optimizer = training_utils.handle_fp16_and_distributed_optimizer( self._optimizer, self._lr_schedule, self._hvd_backend) if self._hvd_backend is None: keras_model.compile(self._optimizer) else: # NOTE: we already add Horovod DistributedOptimizer in `_handle_fp16_and_distributed_optimizer`. # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow # uses hvd.DistributedOptimizer() to compute gradients. 
            keras_model.compile(self._optimizer, experimental_run_tf_function=False)
        keras_model.summary()
        summary_model_variables(self.model, self._freeze_variables)

    # Initialize the checkpoint manager.
    _ = compat.get_saver_or_default(self.model, self.model_dir,
                                    max_to_keep=self._checkpoints_max_to_keep)

    # Step 4: Build training callbacks.
    if not self._tb_log_dir:
        self._tb_log_dir = os.path.join(self.model_dir, "train")
    training_callbacks = [MetricReductionCallback(self.strategy, self._summary_steps,
                                                  self._tb_log_dir, device="GPU:0",
                                                  lr_schedule=self._lr_schedule)]
    if self._hvd_backend is None or hvd.rank() == 0:
        training_callbacks.append(
            CustomCheckpointCallback(self.task.model_configs(self.model),
                                     save_checkpoint_steps=self._save_checkpoint_steps))
        if self._validator is not None:
            training_callbacks.append(self._validator.build(self.strategy, self.task, self.model))
    if self._hvd_backend is not None:
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based callbacks.
        # NOTE!!! HERE we already integrate the metric averaging behaviour into the MetricReductionCallback.
        # training_callbacks.insert(0, hvd.callbacks.MetricAverageCallback(device="GPU:0"))

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        training_callbacks.insert(0, hvd.callbacks.BroadcastGlobalVariablesCallback(0, device="GPU:0"))
    if self._lr_schedule is not None:
        training_callbacks.append(LearningRateScheduler(self._lr_schedule))

    if self._experimental_count_batch_num:
        logging.info("Scanning the dataset......")
        iterator = iter(training_utils.maybe_distribution_dataset(self.strategy, tfds))
        cnt = 0
        for _ in iterator:
            cnt += 1
        logging.info(f"Total {cnt} batches per EPOCH.")

    # Step 5: TRAIN!!!
    history = keras_model.fit(
        map_data_for_keras(tfds.repeat()),
        initial_epoch=0,
        epochs=1,
        steps_per_epoch=self._train_steps,  # * args["update_cycle"],
        verbose=2,
        callbacks=training_callbacks)
    logging.info(history.history)
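# Minimal, self-contained sketch of the gradient-accumulation idea behind
# `update_cycle` above: gradients from several micro-batches are accumulated
# and applied in a single optimizer step, emulating a larger effective batch.
# This is NOT the GradAccumKerasModel implementation; for instance, whether
# the accumulated gradients are averaged or summed per cycle is assumed here.
import tensorflow as tf

toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
toy_optimizer = tf.keras.optimizers.Adam(1e-3)
mse = tf.keras.losses.MeanSquaredError()
update_cycle = 4

x = tf.random.normal([32, 8])
y = tf.random.normal([32, 1])
_ = toy_model(x[:1])  # build the variables
accum = [tf.zeros_like(v) for v in toy_model.trainable_variables]

for step in range(update_cycle):
    xb = x[step * 8:(step + 1) * 8]
    yb = y[step * 8:(step + 1) * 8]
    with tf.GradientTape() as tape:
        loss = mse(yb, toy_model(xb))
    grads = tape.gradient(loss, toy_model.trainable_variables)
    accum = [a + g for a, g in zip(accum, grads)]

# One optimizer update per cycle with the averaged gradients, then reset.
toy_optimizer.apply_gradients(
    zip([a / update_cycle for a in accum], toy_model.trainable_variables))
accum = [tf.zeros_like(v) for v in toy_model.trainable_variables]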
def run(self): """ Sequence generation from an existing model checkpoint. Step 1: Build model and restore checkpoints. Step 2: Build test dataset. Step 3: Sequence generation. Step 4: Evaluation using metric. """ # Step 3: Build model. with training_utils.get_strategy_scope(self.strategy): model = self._build_and_restore_model() keras_model = self.build_generation_model(self.task, model, self._search_layer) tfds = training_utils.build_datasets(compat.ModeKeys.INFER, self.strategy, self.custom_dataset, self.task) keras_model.summary() summary_model_variables(keras_model) # Step 5: Sequence Generation. start_time = time.time() results = training_utils.make_predictions( self.strategy, keras_model, tfds, self.custom_dataset, map_func=lambda y: SequenceGenerator.postprocess_generation( self.task, y)) logging.info("Generation elapsed: %.2fs", time.time() - start_time) if self._output_file: if isinstance(self.custom_dataset, MultipleDataset): if isinstance(self._output_file, dict): for name in results: if self._output_file.get(name, None): with tf.io.gfile.GFile(self._output_file[name], "w") as fw: fw.write("\n".join(results[name]) + "\n") logging.info( "Saving generation of dataset {} results into {}" .format(name, self._output_file[name])) else: logging.info( "Unsupported type of `output_file`={}({}) for MultipleDataset." .format(self._output_file, type(self._output_file))) else: if isinstance(self._output_file, str): with tf.io.gfile.GFile(self._output_file, "w") as fw: fw.write("\n".join(results) + "\n") logging.info("Saving generation results into {}".format( self._output_file)) else: logging.info( f"WARNING: No generation results are saved due to unsupported type " f"of `output_file`: {self._output_file} ({type(self._output_file)})" ) # Step 6: evaluation using metric def _display(res, name=None): if name: logging.info(f"Evaluation Result ({name}):") else: logging.info("Evaluation Result:") for k, v in res.items(): logging.info(" %s=%.2f", k, v) if self._metric is not None: saving_metrics = dict() if isinstance(self.custom_dataset, MultipleDataset): on_average = {} mixed_dsnames = [] mixed_hypos = [] mixed_refs = [] for name in tfds: assert isinstance(self.custom_dataset.datasets[name], TextGenDataset) if self.custom_dataset.datasets[name].targets: metric_result = self._metric( results[name], self.custom_dataset.datasets[name].targets) for k, v in metric_result.items(): if k not in on_average: on_average[k] = 0. on_average[ k] += self.custom_dataset.sample_weights[ name] * v _display(metric_result, name) mixed_dsnames.append(name) mixed_hypos.extend(results[name]) mixed_refs.extend( self.custom_dataset.datasets[name].targets) saving_metrics[name] = metric_result if len(mixed_dsnames) > 1: _display( on_average, f"on average by weights {self._custom_dataset.sample_weights}" ) mixed_metric_result = self._metric(mixed_refs, mixed_hypos) _display(mixed_metric_result, "mixed of {}".format(",".join(mixed_dsnames))) saving_metrics["MIXED"] = mixed_metric_result else: assert isinstance(self.custom_dataset, TextGenDataset) if self.custom_dataset.targets is not None: metric_result = self._metric(results, self.custom_dataset.targets) _display(metric_result) saving_metrics = metric_result if self._save_metric is not None: logging.info(f"Saving metric results into {self._save_metric}") with tf.io.gfile.GFile(self._save_metric, "w") as fw: json.dump(saving_metrics, fw)