import logging
import os
import shutil

import tensorflow as tf

# NOTE: project-local helpers (Hparams, load_dataset, build_callbacks,
# average_checkpoints, evaluation, TRAIN_STAGE, build_tf_model_* factories,
# build_model_adapter, and the module-level `logger`) are assumed to be
# imported/defined elsewhere in this module.


def k_fold_experiment(hparams: Hparams):
    """Run k-fold training.

    :param hparams: experiment hyper-parameters
    :return:
    """
    logger = logging.getLogger(__name__)
    if hparams.use_mixed_float16:
        logger.info("Use auto mixed precision policy")
        # tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    strategy = tf.distribute.MirroredStrategy(
        devices=[f"/gpu:{gpu_id}" for gpu_id in hparams.gpus])

    # build dataset; one (train, dev) pair per fold
    model_saved_dirs = []
    for idx, (train_dataset, dev_dataset, dataset_info) in \
            enumerate(load_dataset(hparams, ret_test=False)):
        logger.info(f"Start {idx}th-fold training")
        with strategy.scope():
            # build model
            model, (losses, loss_weights), metrics, optimizer = build_model(hparams)

            # build callbacks
            callbacks = build_callbacks(hparams.training.callbacks)

            # compile
            model.compile(optimizer=optimizer,
                          loss=losses,
                          metrics=metrics,
                          loss_weights=loss_weights)

            # fit
            if hparams.training.do_eval:
                validation_data = dev_dataset
                validation_steps = hparams.training.validation_steps
            else:
                logger.info("Do not evaluate.")
                validation_data = None
                validation_steps = None

            model.fit(
                train_dataset,
                validation_data=validation_data,
                epochs=hparams.training.max_epochs,
                callbacks=callbacks,
                steps_per_epoch=hparams.training.steps_per_epoch,
                validation_steps=validation_steps,
            )

        # build archive dir for this fold
        k_fold_dir = os.path.join(hparams.get_workspace_dir(), "k_fold", str(idx))
        if not os.path.exists(k_fold_dir):
            os.makedirs(k_fold_dir)

        # load best model
        checkpoint_dir = os.path.join(hparams.get_workspace_dir(), "checkpoint")
        if hparams.eval_use_best and os.path.exists(checkpoint_dir):
            logger.info(f"Load best model from {checkpoint_dir}")
            average_checkpoints(model, checkpoint_dir)
            logger.info(f"Move {checkpoint_dir} to {k_fold_dir}")
            shutil.move(checkpoint_dir, k_fold_dir)

        # save best model
        logger.info(f"Save {idx}th model in {hparams.get_model_filename()}")
        model.save_weights(hparams.get_model_filename(), save_format="tf")

        # eval on test dataset and make reports
        evaluation(hparams)
        logger.info(f"Move {hparams.get_report_dir()} to {k_fold_dir}")
        shutil.move(hparams.get_report_dir(), k_fold_dir)
        logger.info(f"Move {hparams.get_saved_model_dir()} to {k_fold_dir}")
        cur_model_saved_dir = shutil.move(hparams.get_saved_model_dir(), k_fold_dir)
        logger.info(f"New model saved path for {idx}th fold: {cur_model_saved_dir}")
        model_saved_dirs.append(cur_model_saved_dir)
        logger.info(f"{idx}th-fold experiment finished!")

    # eval on test dataset after average_checkpoints
    # logger.info("Average models of all fold models.")
    checkpoints = [f"{itm}/model" for itm in model_saved_dirs]
    # average_checkpoints(model, checkpoints)
    # logger.info(f"Save averaged model in {hparams.get_model_filename()}")
    # model.save_weights(hparams.get_model_filename(), save_format="tf")
    if hparams.training.do_eval:
        evaluation(hparams, checkpoints=checkpoints)

    logger.info("Experiment finished!")
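# Illustrative sketch of the per-fold archive layout that k_fold_experiment
# assembles above (the exact directory names come from Hparams.get_workspace_dir(),
# get_report_dir(), and get_saved_model_dir(), so the basenames shown here are
# assumptions, not guaranteed names):
#
#     <workspace_dir>/k_fold/0/checkpoint/    # best-checkpoint dir, moved per fold
#     <workspace_dir>/k_fold/0/<report_dir>/  # evaluation reports, moved per fold
#     <workspace_dir>/k_fold/0/model_saved/   # saved model; "<dir>/model" entries
#     <workspace_dir>/k_fold/1/...            # feed the final cross-fold evaluation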
def build_model(hparam: Hparams,
                return_losses=True,
                return_metrics=True,
                return_optimizer=True,
                stage=TRAIN_STAGE):
    """Build a custom keras model and, optionally, its losses, metrics, and optimizer.

    :param hparam:
    :param return_losses:
    :param return_metrics:
    :param return_optimizer:
    :param stage: run stage, e.g. TRAIN_STAGE
    :return: (model, ...) tuple, extended according to the return_* flags
    """
    logger.info(f"Try to build model {hparam.model_name}")
    from aispace import models
    from aispace.models.base_model import BaseModel

    model = BaseModel.by_name(hparam.model_name)(hparam)

    # build inputs and run one forward pass so the model's variables are created
    inputs = build_tf_model_inputs(hparam.dataset)
    model(inputs, training=True)

    rets = ()
    # build losses
    if return_losses:
        losses, loss_weights = build_tf_model_losses(model, hparam.dataset)
        rets += ((losses, loss_weights),)
    # build metrics
    if return_metrics:
        metrics = build_tf_model_metrics(hparam.dataset)
        rets += (metrics,)
    # build optimizer
    if return_optimizer:
        optimizer = build_tf_model_optimizer(hparam.training)
        rets += (optimizer,)

    # if stage == TRAIN_STAGE:
    model.summary()

    # init from a pretrained model (language model, etc.)
    if stage == TRAIN_STAGE and not hparam.model_resume_path and not hparam.model_load_path \
            and "pretrained" in hparam and hparam.pretrained.init_from_pretrained:
        try:
            logger.info(f"Load weights from {hparam.pretrained.model_path}")
            if hparam.pretrained.model_path.endswith(".h5"):
                model.load_weights(hparam.pretrained.model_path, by_name=True)
            else:
                logger.info(f"Load weights using model adapter {hparam.pretrained.adapter}")
                adapter = build_model_adapter(hparam.pretrained)
                if adapter is not None:
                    adapter(model.trainable_variables, hparam.pretrained.model_path)
        except Exception as e:
            logger.error("Load weights failure!", exc_info=True)
            raise e

    # initialize model from a previously saved model
    if stage == TRAIN_STAGE and not hparam.model_resume_path and hparam.model_load_path is not None:
        model_saved = os.path.join(hparam.model_load_path, "model_saved", "model")
        logger.info(f"Initialize model from {model_saved}")
        model.load_weights(model_saved)

    # resume model from the workspace checkpoint
    if stage == TRAIN_STAGE and hparam.model_resume_path is not None:
        model_saved = os.path.join(hparam.get_workspace_dir(), "model_saved", "model")
        logger.info(f"Resume model from {model_saved}")
        model.load_weights(model_saved)

    return (model,) + rets
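# A minimal usage sketch for build_model (illustrative only): the returned
# tuple grows with the return_* flags, so callers unpack it accordingly.
# `hparams` is assumed to be a fully parsed Hparams config; DEPLOY_STAGE is a
# hypothetical stage constant used here only for contrast with TRAIN_STAGE.
#
#     # full tuple, as used by experiment() / k_fold_experiment():
#     model, (losses, loss_weights), metrics, optimizer = build_model(hparams)
#
#     # model only, e.g. for inference at a non-training stage:
#     (model,) = build_model(hparams, return_losses=False,
#                            return_metrics=False, return_optimizer=False,
#                            stage=DEPLOY_STAGE)  # DEPLOY_STAGE is hypothetical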
def experiment(hparams: Hparams):
    logger = logging.getLogger(__name__)
    if hparams.use_mixed_float16:
        logger.info("Use auto mixed precision policy")
        # tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    strategy = tf.distribute.MirroredStrategy(
        devices=[f"/gpu:{gpu_id}" for gpu_id in hparams.gpus])

    # build dataset; take only the first split
    train_dataset, dev_dataset, dataset_info = next(
        load_dataset(hparams, ret_test=False))

    with strategy.scope():
        # build model
        model, (losses, loss_weights), metrics, optimizer = build_model(hparams)

        # build callbacks
        callbacks = build_callbacks(hparams)

        # compile
        model.compile(optimizer=optimizer,
                      loss=losses,
                      metrics=metrics,
                      loss_weights=loss_weights)

        # fit
        if hparams.training.do_eval:
            validation_data = dev_dataset
            validation_steps = hparams.training.validation_steps
        else:
            logger.info("Do not evaluate.")
            validation_data = None
            validation_steps = None

        model.fit(
            train_dataset,
            validation_data=validation_data,
            epochs=hparams.training.max_epochs,
            callbacks=callbacks,
            steps_per_epoch=hparams.training.steps_per_epoch,
            validation_steps=validation_steps,
        )

    # run the lr finder if its callback is present
    lr_finder_call_back = [cb for cb in callbacks if hasattr(cb, "lr_finder_plot")]
    if len(lr_finder_call_back) != 0:
        logger.info(f"Do lr finder, and save result in {hparams.get_lr_finder_jpg_file()}")
        lr_finder_call_back[0].lr_finder_plot(hparams.get_lr_finder_jpg_file())
    else:
        # load best model
        checkpoint_dir = os.path.join(hparams.get_workspace_dir(), "checkpoint")
        if hparams.eval_use_best and os.path.exists(checkpoint_dir):
            logger.info(f"Load best model from {checkpoint_dir}")
            average_checkpoints(model, checkpoint_dir)

        # save best model
        logger.info(f"Save model in {hparams.get_model_filename()}")
        model.save_weights(hparams.get_model_filename(), save_format="tf")

        # eval on test dataset and make reports
        if hparams.training.do_eval:
            evaluation(hparams)

    logger.info("Experiment finished!")
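# A hypothetical entry point showing how the two drivers above might be
# selected from a parsed config. This is a sketch, not the project's actual
# CLI: `Hparams.from_config` and the `k_fold` field are illustrative
# assumptions.
if __name__ == "__main__":
    hparams = Hparams.from_config("configs/experiment.yml")  # hypothetical loader
    if getattr(hparams, "k_fold", 0) > 1:
        k_fold_experiment(hparams)  # one run per fold, archived under k_fold/<idx>
    else:
        experiment(hparams)  # single train/eval run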