def numerical_check(_net, _cfg: Configuration, train_data, test_data, dump_result=False):  # pragma: no cover
    """Run a quick end-to-end training loop as a numerical sanity check.

    Initializes ``_net``, builds the back-propagation loss from
    ``_cfg.loss_params``, trains for ``[begin_epoch, end_epoch)`` and prints
    an evaluation summary after every epoch.

    Parameters
    ----------
    _net:
        The (uninitialized) network to check.
    _cfg: Configuration
        Provides ctx, batch_size, loss/optimizer params and epoch range.
    train_data, test_data: Iterable
        Batched training and evaluation data.
    dump_result: bool
        When True, evaluation output is also logged to
        ``<model_dir>/result.log`` and ``_cfg.validation_result_file``.
    """
    ctx = _cfg.ctx
    batch_size = _cfg.batch_size
    _net.initialize(ctx=ctx)

    bp_loss_f = get_bp_loss(**_cfg.loss_params)
    loss_function = {}
    loss_function.update(bp_loss_f)

    from longling.ML.MxnetHelper.glue import module
    from longling.ML.toolkit import EvalFormatter as Formatter
    from longling.ML.toolkit import MovingLoss
    from tqdm import tqdm

    loss_monitor = MovingLoss(loss_function)
    progress_monitor = tqdm

    if dump_result:
        from longling import config_logging
        validation_logger = config_logging(
            filename=path_append(_cfg.model_dir, "result.log"),
            logger="%s-validation" % _cfg.model_name,
            mode="w",
            log_format="%(message)s",
        )
        evaluation_formatter = Formatter(
            logger=validation_logger,
            dump_file=_cfg.validation_result_file,
        )
    else:
        evaluation_formatter = Formatter()

    # train check
    trainer = module.Module.get_trainer(
        _net, optimizer=_cfg.optimizer,
        optimizer_params=_cfg.optimizer_params,
        select=_cfg.train_select
    )

    for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
        for batch_data in progress_monitor(train_data, "Epoch: %s" % epoch):
            fit_f(
                net=_net, batch_size=batch_size, batch_data=batch_data,
                trainer=trainer, bp_loss_f=bp_loss_f,
                loss_function=loss_function,
                loss_monitor=loss_monitor,
                ctx=ctx,
            )
        # BUG FIX: the original nested the identical `epoch % 1 == 0`
        # condition twice; a single check is equivalent.
        if epoch % 1 == 0:
            print(
                evaluation_formatter(
                    epoch=epoch,
                    loss_name_value=dict(loss_monitor.items()),
                    eval_name_value=eval_f(_net, test_data, ctx=ctx),
                    extra_info=None,
                    dump=True,
                )[0]
            )
def toolbox_init(
        self,
        evaluation_formatter_parameters=None,
        validation_logger_mode="w",
        silent=False,
):
    """Populate ``self.toolbox`` with the loss monitor, progress monitor,
    timer and evaluation formatter used during training."""
    from longling import path_append
    from longling.lib.clock import Clock
    from longling.lib.utilog import config_logging
    from longling.ML.toolkit import EvalFormatter as Formatter
    from longling.ML.toolkit import MovingLoss, ConsoleProgressMonitor as ProgressMonitor

    self.toolbox = {
        "monitor": dict(),
        "timer": None,
        "formatter": dict(),
    }

    mod = self.mod
    cfg = self.mod.cfg

    # todo: define the loss functions.
    # bp_loss_f holds the loss used for back propagation; there must be
    # exactly one, and its name must not be of the form *_\d+.
    assert self.loss_function is not None

    loss_monitor = MovingLoss(self.loss_function)

    # todo: initialize the helpers that report training progress
    stopwatch = Clock()
    progress = ProgressMonitor(
        indexes={"Loss": list(self.loss_function)},
        values={"Loss": loss_monitor.losses},
        end_epoch=cfg.end_epoch - 1,
        silent=silent
    )

    validation_logger = config_logging(
        filename=path_append(cfg.model_dir, "result.log"),
        logger="%s-validation" % cfg.model_name,
        mode=validation_logger_mode,
        log_format="%(message)s",
    )

    # set evaluation formatter
    if evaluation_formatter_parameters is None:
        evaluation_formatter_parameters = {}
    evaluation_formatter = Formatter(
        logger=validation_logger,
        dump_file=mod.cfg.validation_result_file,
        **evaluation_formatter_parameters
    )

    self.toolbox["monitor"]["loss"] = loss_monitor
    self.toolbox["monitor"]["progress"] = progress
    self.toolbox["timer"] = stopwatch
    self.toolbox["formatter"]["evaluation"] = evaluation_formatter
def test(cls, test_filename, test_epoch, dump_file=None, **kwargs):
    """Load the model saved at *test_epoch*, evaluate it on the data in
    *test_filename*, report the result (optionally dumping to *dump_file*)
    and return the evaluation result."""
    from longling.ML.toolkit.formatter import EpochEvalFMT as Formatter

    reporter = Formatter(dump_file=dump_file)
    instance = cls.load(test_epoch, **kwargs)
    dataset = instance.etl(test_filename)
    result = instance.mod.eval(instance.net, dataset)
    reporter(tips="test", eval_name_value=result)
    return result
def numerical_check(_net, _cfg: Configuration, train_data, test_data, dump_result=False,
                    reporthook=None, final_reporthook=None):  # pragma: no cover
    """Run a quick end-to-end training loop as a numerical sanity check.

    Initializes ``_net``, builds the back-propagation loss from
    ``_cfg.loss_params``, trains for ``[begin_epoch, end_epoch)`` and prints
    an evaluation summary after every epoch. Supports a one-time trainer
    reset driven by ``_cfg.lr_params["update_params"]``.

    Parameters
    ----------
    _net:
        The (uninitialized) network to check.
    _cfg: Configuration
        Provides ctx, batch_size, loss/optimizer/lr params and epoch range.
    train_data, test_data: Iterable
        Batched training and evaluation data.
    dump_result: bool
        When True, evaluation output is also logged to
        ``<model_dir>/result.log`` and ``_cfg.validation_result_file``.
    reporthook: callable or None
        Called with the evaluation data after each epoch.
    final_reporthook: callable or None
        Called once after training finishes.
    """
    ctx = _cfg.ctx
    batch_size = _cfg.batch_size
    _net.initialize(ctx=ctx)

    bp_loss_f = get_bp_loss(**_cfg.loss_params)
    loss_function = {}
    loss_function.update(bp_loss_f)

    from longling.ML.MxnetHelper.glue import module
    from longling.ML.toolkit import EpochEvalFMT as Formatter
    from longling.ML.toolkit import MovingLoss
    from tqdm import tqdm

    loss_monitor = MovingLoss(loss_function)
    progress_monitor = tqdm

    if dump_result:
        from longling import config_logging
        validation_logger = config_logging(
            filename=path_append(_cfg.model_dir, "result.log"),
            logger="%s-validation" % _cfg.model_name,
            mode="w",
            log_format="%(message)s",
        )
        evaluation_formatter = Formatter(
            logger=validation_logger,
            dump_file=_cfg.validation_result_file,
        )
    else:
        evaluation_formatter = Formatter()

    # train check
    trainer = module.Module.get_trainer(
        _net, optimizer=_cfg.optimizer,
        optimizer_params=_cfg.optimizer_params,
        select=_cfg.train_select
    )

    for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
        # BUG FIX: the original read the loop variable `i` after the batch
        # loop, which raises NameError when train_data is empty; track the
        # batch count explicitly instead.
        batch_num = 0
        for i, batch_data in enumerate(
                progress_monitor(train_data, "Epoch: %s" % epoch)):
            fit_f(
                net=_net, batch_size=batch_size, batch_data=batch_data,
                trainer=trainer, bp_loss_f=bp_loss_f,
                loss_function=loss_function,
                loss_monitor=loss_monitor,
                ctx=ctx,
            )
            batch_num = i + 1

        # one-shot trainer reset: pop() ensures this branch runs only once
        if _cfg.lr_params and "update_params" in _cfg.lr_params:
            _cfg.logger.info("reset trainer")
            lr_params = _cfg.lr_params.pop("update_params")
            lr_update_params = dict(
                batches_per_epoch=batch_num,
                lr=_cfg.optimizer_params["learning_rate"],
                update_epoch=lr_params.get(
                    "update_epoch",
                    _cfg.end_epoch - _cfg.begin_epoch - 1
                )
            )
            lr_update_params.update(lr_params)
            trainer = module.Module.get_trainer(
                _net, optimizer=_cfg.optimizer,
                optimizer_params=_cfg.optimizer_params,
                lr_params=lr_update_params,
                select=_cfg.train_select,
                logger=_cfg.logger
            )

        if epoch % 1 == 0:
            msg, data = evaluation_formatter(
                iteration=epoch,
                loss_name_value=dict(loss_monitor.items()),
                eval_name_value=eval_f(_net, test_data, ctx=ctx),
                extra_info=None,
                dump=dump_result,
                keep={"msg", "data"}
            )
            print(msg)
            if reporthook is not None:
                reporthook(data)

    if final_reporthook is not None:
        final_reporthook()
def toolbox_init(
        self,
        evaluation_formatter_parameters=None,
        validation_logger_mode="w",
        silent=False,
):
    """Populate ``self.toolbox`` with the loss monitor, epoch-style progress
    monitor, timer and evaluation formatter used during training."""
    from longling import path_append
    from longling.lib.clock import Clock
    from longling.lib.utilog import config_logging
    from longling.ML.toolkit import EpochEvalFMT as Formatter
    from longling.ML.toolkit import MovingLoss, ConsoleProgressMonitor as ProgressMonitor

    self.toolbox = {
        "monitor": dict(),
        "timer": None,
        "formatter": dict(),
    }

    mod = self.mod
    cfg = self.mod.cfg

    # exactly one back-propagation loss must have been configured
    assert self.loss_function is not None

    loss_monitor = MovingLoss(self.loss_function)
    stopwatch = Clock()
    progress = ProgressMonitor(
        indexes={"Loss": list(self.loss_function)},
        values={"Loss": loss_monitor.losses},
        silent=silent,
        player_type="epoch",
        total_epoch=cfg.end_epoch - 1
    )

    validation_logger = config_logging(
        filename=path_append(cfg.model_dir, "result.log"),
        logger="%s-validation" % cfg.model_name,
        mode=validation_logger_mode,
        log_format="%(message)s",
    )

    # set evaluation formatter
    if evaluation_formatter_parameters is None:
        evaluation_formatter_parameters = {}
    evaluation_formatter = Formatter(
        logger=validation_logger,
        dump_file=mod.cfg.validation_result_file,
        **evaluation_formatter_parameters
    )

    self.toolbox["monitor"]["loss"] = loss_monitor
    self.toolbox["monitor"]["progress"] = progress
    self.toolbox["timer"] = stopwatch
    self.toolbox["formatter"]["evaluation"] = evaluation_formatter
def epoch_loop(self, net, begin_epoch, end_epoch, batch_size, train_data, trainer,
               loss_function, test_data=None, ctx=mx.cpu(), toolbox=None, save_model=True,
               eval_every_n_epoch=1, **kwargs):
    """
    Wrap the batch-level training procedure into an epoch-level loop.

    Parameters
    ----------
    net: HybridBlock
        The network which has been initialized or loaded from an existing model
    begin_epoch: int
        The first epoch of this training run
    end_epoch: int
        The end epoch (exclusive) of this training run
    batch_size: int
        The size of each batch
    train_data: Iterable
        The training data; NOTICE: should already be divided into batches
    trainer:
        The trainer used to update the parameters of the net
    loss_function: dict of function
        The functions computing the loss for back propagation
    test_data: Iterable
        The data evaluated at the end of each epoch; should already be divided
        into batches. Defaults to ``None``
    ctx: Context or list of Context
        Defaults to ``mx.cpu()``
    toolbox: Toolbox
        Defaults to ``None``
    save_model: bool
        Whether to save the model
    eval_every_n_epoch: int
    kwargs
    """
    # NOTE: keep these parameters in sync with those of the fit function.
    # Epoch-level training procedure.
    formatters = toolbox.get('formatter') if toolbox is not None else None

    for epoch in range(begin_epoch, end_epoch):
        batch_num, loss_values = self.batch_loop(
            net=net,
            epoch=epoch,
            batch_size=batch_size,
            train_data=train_data,
            trainer=trainer,
            loss_function=loss_function,
            ctx=ctx,
            toolbox=toolbox,
        )

        # one-shot trainer reset driven by cfg.lr_params["update_params"];
        # pop() guarantees this branch runs at most once
        if hasattr(self.cfg, "lr_params") and self.cfg.lr_params \
                and "update_params" in self.cfg.lr_params \
                and self.cfg.end_epoch - self.cfg.begin_epoch - 1 > 0:
            self.cfg.logger.info("reset trainer")
            update_params = self.cfg.lr_params.pop("update_params")
            lr_update_params = dict(
                batches_per_epoch=batch_num,
                lr=self.cfg.optimizer_params["learning_rate"],
                update_epoch=update_params.get(
                    "update_epoch",
                    self.cfg.end_epoch - self.cfg.begin_epoch - 1
                )
            )
            lr_update_params.update(update_params)
            trainer = module.Module.get_trainer(
                net, optimizer=self.cfg.optimizer,
                optimizer_params=self.cfg.optimizer_params,
                lr_params=lr_update_params,
                select=self.cfg.train_select,
                logger=self.cfg.logger
            )

        try:
            train_time = toolbox["monitor"]["progress"].iteration_time
        except (KeyError, TypeError):
            train_time = None

        if (epoch - 1) % eval_every_n_epoch == 0 or epoch == end_epoch - 1:
            # todo: define the per-epoch model evaluation method
            eval_result = self.eval(net, test_data, ctx=ctx)
            evaluation_formatter = formatters.get(
                'evaluation', Formatter()) if formatters else Formatter()
            print(
                evaluation_formatter(
                    iteration=epoch,
                    train_time=train_time,
                    loss_name_value=loss_values,
                    eval_name_value=eval_result,
                    extra_info=None,
                    dump=True,
                    keep="msg",
                )
            )

        # todo: define the model saving scheme
        if save_model and epoch % kwargs.get('save_epoch', 1) == 0:
            self.save_params(self.epoch_params_filepath(epoch), net)
def train(net, cfg, loss_function, trainer, train_data, test_data=None, params_save=False,
          dump_result=False, progress_monitor=None, *, fit_f, eval_f=None, net_init=None,
          get_net=None, get_loss=None, get_trainer=None, save_params=None,
          enable_hyper_search=False, reporthook=None, final_reporthook=None, primary_key=None,
          eval_epoch=1, loss_dict2tmt_loss=None, epoch_lr_scheduler=None,
          batch_lr_scheduler=None, loss_as_dict=False, verbose=None, dump_cfg=None,
          **cfg_kwargs):
    """Generic training driver.

    Optionally builds the network/loss/trainer via the supplied factory
    callables, runs the epoch loop with per-batch ``fit_f`` and per-epoch
    evaluation via ``eval_f``, and supports hyper-parameter search,
    lr schedulers, result dumping and parameter saving.

    Parameters (selection)
    ----------------------
    net, loss_function, trainer:
        Used directly unless overridden by get_net / get_loss / get_trainer.
    fit_f: callable (keyword-only, required)
        Trains on one batch.
    eval_f: callable
        Evaluates the net on test_data; required when test_data is not None.
    reporthook / final_reporthook:
        Per-evaluation and end-of-training callbacks.
    **cfg_kwargs:
        Extra configuration overrides, also consumed by hyper search.
    """
    if enable_hyper_search:
        assert get_net is not None
        cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
            cfg_kwargs, reporthook, final_reporthook,
            primary_key=primary_key, with_keys="Epoch", dump=params_save)
        dump_result = tag
        verbose = tag if verbose is None else verbose
        cfg.update(**cfg_kwargs)
        print("hyper search enabled")
        print(cfg)

    verbose = True if verbose is None else verbose

    dump_cfg = dump_cfg if dump_cfg is not None else params_save
    if dump_cfg:
        cfg.dump()

    net = net if get_net is None else get_net(**cfg.hyper_params)

    if net_init is not None:
        net_init(net, cfg=cfg, initializer_kwargs=cfg.init_params)

    train_ctx = cfg.ctx if cfg.train_ctx is None else cfg.train_ctx
    eval_ctx = cfg.ctx if cfg.eval_ctx is None else cfg.eval_ctx
    batch_size = cfg.batch_size

    loss_function = get_loss(
        **cfg.loss_params) if get_loss is not None else loss_function

    if isinstance(loss_function, dict):
        _loss_function = loss_function
    else:
        # wrap a bare callable into a single-entry dict keyed by its name
        if hasattr(loss_function, "__name__"):
            loss_name = loss_function.__name__
        elif hasattr(loss_function, "__class__"):
            loss_name = loss_function.__class__.__name__
        else:  # pragma: no cover
            loss_name = "loss"
        loss_function = {loss_name: loss_function}
        if loss_dict2tmt_loss is not None:
            loss_function = loss_dict2tmt_loss(loss_function)
        _loss_function = list(loss_function.values()
                              )[0] if loss_as_dict is False else loss_function

    loss_monitor = MovingLoss(loss_function)

    if progress_monitor is None and loss_dict2tmt_loss is not None:
        progress_monitor = ConsoleProgressMonitor(
            indexes={"Loss": [name for name in loss_function]},
            values={"Loss": loss_monitor.losses},
            player_type="epoch",
            total_epoch=cfg.end_epoch - 1,
            silent=not verbose)
    elif progress_monitor is None or progress_monitor == "tqdm":
        def progress_monitor(x, e):
            return tqdm(x, "Epoch: %s" % e, disable=not verbose)

    if dump_result:
        from longling import config_logging
        validation_logger = config_logging(
            filename=path_append(cfg.model_dir, cfg.get("result_log", RESULT_LOG)),
            logger="%s-validation" % cfg.model_name,
            mode="w",
            log_format="%(message)s",
        )
        evaluation_formatter = Formatter(
            logger=validation_logger,
            dump_file=cfg.validation_result_file,
        )
    else:
        evaluation_formatter = Formatter()

    # train check
    if get_trainer is not None:
        trainer = get_trainer(net, optimizer=cfg.optimizer,
                              optimizer_params=cfg.optimizer_params,
                              select=cfg.train_select,
                              lr_params=cfg.lr_params)
        if batch_lr_scheduler is True:
            trainer, batch_lr_scheduler = trainer
        elif epoch_lr_scheduler is True:
            trainer, epoch_lr_scheduler = trainer

    for epoch in range(cfg.begin_epoch, cfg.end_epoch):
        # BUG FIX: the original read the loop variable `i` after the batch
        # loop (batches_per_epoch=i + 1), which raises NameError when
        # train_data is empty; track the batch count explicitly instead.
        batch_num = 0
        for i, batch_data in enumerate(progress_monitor(train_data, epoch)):
            fit_f(
                net, batch_size=batch_size, batch_data=batch_data,
                trainer=trainer, loss_function=_loss_function,
                loss_monitor=loss_monitor,
                ctx=train_ctx,
            )
            batch_num = i + 1
            if batch_lr_scheduler is not None:
                batch_lr_scheduler.step()

        # one-shot trainer reset: pop() ensures this branch runs only once
        if cfg.lr_params and "update_params" in cfg.lr_params \
                and cfg.end_epoch - cfg.begin_epoch - 1 > 0:
            cfg.logger.info("reset trainer")
            lr_params = cfg.lr_params.pop("update_params")
            lr_update_params = dict(
                batches_per_epoch=batch_num,
                lr=cfg.optimizer_params["learning_rate"],
                update_epoch=lr_params.get(
                    "update_epoch",
                    cfg.end_epoch - cfg.begin_epoch - 1
                )
            )
            lr_update_params.update(lr_params)
            assert get_trainer is not None
            trainer = get_trainer(net, optimizer=cfg.optimizer,
                                  optimizer_params=cfg.optimizer_params,
                                  lr_params=lr_update_params,
                                  select=cfg.train_select,
                                  logger=cfg.logger)

        if test_data is not None and epoch % eval_epoch == 0:
            msg, data = evaluation_formatter(
                iteration=epoch,
                loss_name_value=dict(loss_monitor.items()),
                eval_name_value=eval_f(net, test_data, ctx=eval_ctx, verbose=verbose,
                                       **cfg.get("eval_params", {})),
                extra_info=None,
                dump=dump_result,
                keep={"msg", "data"})
            print(msg)
            if reporthook is not None:
                reporthook(data)

        # optional
        loss_monitor.reset()

        if params_save and (epoch % cfg.save_epoch == 0 or epoch == cfg.end_epoch - 1):
            assert save_params is not None
            params_path = get_epoch_params_filepath(cfg.model_name, epoch, cfg.model_dir)
            cfg.logger.info("save model params to %s, with select='%s'"
                            % (params_path, cfg.save_select))
            save_params(params_path, net, select=cfg.save_select)

        if epoch_lr_scheduler is not None:
            epoch_lr_scheduler.step()

    if final_reporthook is not None:
        final_reporthook()
def numerical_check(_net, _cfg: Configuration, train_data, test_data, dump_result=False,
                    reporthook=None, final_reporthook=None):  # pragma: no cover
    """Run a quick end-to-end PyTorch training loop as a numerical sanity
    check: move the net to the configured device, train for
    ``[begin_epoch, end_epoch)`` and print an evaluation summary per epoch."""
    ctx = _cfg.ctx
    _net = set_device(_net, ctx)

    bp_loss_f = get_bp_loss(ctx, **_cfg.loss_params)
    loss_function = dict(bp_loss_f)

    from longling.ML.toolkit import EpochEvalFMT as Formatter
    from longling.ML.toolkit import MovingLoss
    from tqdm import tqdm

    loss_monitor = MovingLoss(loss_function)
    progress_monitor = tqdm

    if not dump_result:
        evaluation_formatter = Formatter()
    else:
        from longling import config_logging
        validation_logger = config_logging(
            filename=path_append(_cfg.model_dir, "result.log"),
            logger="%s-validation" % _cfg.model_name,
            mode="w",
            log_format="%(message)s",
        )
        evaluation_formatter = Formatter(
            logger=validation_logger,
            dump_file=_cfg.validation_result_file,
        )

    # train check
    from longling.ML.PytorchHelper.toolkit.optimizer import get_trainer
    trainer = get_trainer(
        _net,
        optimizer=_cfg.optimizer,
        optimizer_params=_cfg.optimizer_params,
        select=_cfg.train_select
    )

    for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
        for batch_data in progress_monitor(train_data, "Epoch: %s" % epoch):
            fit_f(
                net=_net,
                batch_data=batch_data,
                trainer=trainer,
                bp_loss_f=bp_loss_f,
                loss_function=loss_function,
                loss_monitor=loss_monitor,
            )

        if epoch % 1 == 0:
            msg, data = evaluation_formatter(
                epoch=epoch,
                loss_name_value=dict(loss_monitor.items()),
                eval_name_value=eval_f(_net, test_data, ctx=ctx),
                extra_info=None,
                dump=dump_result,
            )
            print(msg)
            if reporthook is not None:
                reporthook(data)

        # optional, whether reset the loss at the end of each epoch
        loss_monitor.reset()

    if final_reporthook is not None:
        final_reporthook()
def get_default_toolbox(loss_function=None,
                        evaluation_formatter_parameters=None,
                        progress_monitor_parameters=None,
                        validation_logger_mode="w",
                        silent=False,
                        configuration=None):  # pragma: no cover
    """Build a default toolbox (loss monitor, progress monitor, timer and
    evaluation formatter) from an optional loss dict and configuration.

    New in version 1.3.16

    todo: consider whether to keep it

    Notice
    ------
    The developer who modifies this document should simultaneously modify
    the related function in glue.
    """
    from longling import path_append
    from longling.lib.clock import Clock
    from longling.lib.utilog import config_logging
    from longling.ML.toolkit import EpochEvalFMT as Formatter
    from longling.ML.toolkit import MovingLoss, ConsoleProgressMonitor as ProgressMonitor

    cfg = configuration
    toolbox = {
        "monitor": dict(),
        "timer": None,
        "formatter": dict(),
    }

    loss_monitor = MovingLoss(loss_function) if loss_function else None
    stopwatch = Clock()

    pm_kwargs = {} if progress_monitor_parameters is None else progress_monitor_parameters
    progress = ProgressMonitor(
        indexes={"Loss": list(loss_function)} if loss_function else {},
        values={"Loss": loss_monitor.losses} if loss_monitor else {},
        silent=silent,
        **pm_kwargs
    )

    validation_logger = config_logging(
        filename=path_append(cfg, "result.log".join([])) if False else (
            path_append(cfg.model_dir, "result.log") if hasattr(cfg, "model_dir") else None),
        logger="%s-validation" % cfg.model_name if hasattr(cfg, "model_name") else "model",
        mode=validation_logger_mode,
        log_format="%(message)s",
    )

    # set evaluation formatter
    fmt_kwargs = {} if evaluation_formatter_parameters is None else evaluation_formatter_parameters
    evaluation_formatter = Formatter(
        logger=validation_logger,
        dump_file=getattr(cfg, "validation_result_file", False),
        **fmt_kwargs
    )

    toolbox["monitor"]["loss"] = loss_monitor
    toolbox["monitor"]["progress"] = progress
    toolbox["timer"] = stopwatch
    toolbox["formatter"]["evaluation"] = evaluation_formatter

    return toolbox