    Docs.
    """
    num_features = int(1e1)

    # model, criterion, optimizer, scheduler
    model = torch.nn.Linear(num_features, 1)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [3, 6])

    runner = SupervisedRunner()
    runner.train(
        model=model,
        datasets={
            "batch_size": 32,
            "num_workers": 1,
            "get_datasets_fn": datasets_fn,
            "num_features": num_features,
        },
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        logdir="./logs/example_3",
        num_epochs=8,
        verbose=True,
        distributed=False,
        check=True,
    )


utils.distributed_cmd_run(train)
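The snippet above relies on a ``datasets_fn`` helper that is not defined in this excerpt. A minimal sketch of such a helper, assuming it simply builds random-tensor train/valid datasets keyed by loader name (the tensor shapes here are illustrative), might look like:

import torch
from torch.utils.data import TensorDataset


def datasets_fn(num_features: int):
    # Hypothetical helper: build identical random train/valid datasets,
    # keyed by loader name, for the example above.
    X = torch.rand(int(1e4), num_features)
    y = torch.rand(X.shape[0], 1)
    dataset = TensorDataset(X, y)
    return {"train": dataset, "valid": dataset}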
    def train(
        self,
        *,
        model: Model,
        criterion: Criterion = None,
        optimizer: Optimizer = None,
        scheduler: Scheduler = None,
        datasets: "OrderedDict[str, Union[Dataset, Dict, Any]]" = None,
        loaders: "OrderedDict[str, DataLoader]" = None,
        callbacks: "Union[List[Callback], OrderedDict[str, Callback]]" = None,
        logdir: str = None,
        resume: str = None,
        num_epochs: int = 1,
        valid_loader: str = "valid",
        main_metric: str = "loss",
        minimize_metric: bool = True,
        verbose: bool = False,
        state_kwargs: Dict = None,
        checkpoint_data: Dict = None,
        fp16: Union[Dict, bool] = None,
        distributed: bool = False,
        check: bool = False,
        timeit: bool = False,
        load_best_on_end: bool = False,
        initial_seed: int = 42,
    ) -> None:
        """
        Starts the train stage of the model.

        Args:
            model (Model): model to train
            criterion (Criterion): criterion function for training
            optimizer (Optimizer): optimizer for training
            scheduler (Scheduler): scheduler for training
            datasets (OrderedDict[str, Union[Dataset, Dict, Any]]): dictionary
                with one or several ``torch.utils.data.Dataset`` for training,
                validation or inference, used for automatic creation of
                loaders; the preferred way to set up distributed training
            loaders (OrderedDict[str, DataLoader]): dictionary with one or
                several ``torch.utils.data.DataLoader`` for training,
                validation or inference
            callbacks (Union[List[Callback], OrderedDict[str, Callback]]):
                list or dictionary with Catalyst callbacks
            logdir (str): path to output directory
            resume (str): path to checkpoint for the model
            num_epochs (int): number of training epochs
            valid_loader (str): loader name used to calculate the metrics
                and save the checkpoints. For example, you can pass ``train``
                and then the metrics will be taken from the ``train`` loader.
            main_metric (str): the key to the name of the metric by which
                the checkpoints will be selected
            minimize_metric (bool): flag to indicate whether
                the ``main_metric`` should be minimized
            verbose (bool): if ``True``, displays the status of the training
                to the console
            state_kwargs (dict): additional state params for ``State``
            checkpoint_data (dict): additional data to save in the checkpoint,
                for example: ``class_names``, ``date_of_training``, etc.
            fp16 (Union[Dict, bool]): if not ``None``, sets training to FP16.
                See https://nvidia.github.io/apex/amp.html#properties
                If ``fp16=True``, params by default will be
                ``{"opt_level": "O1"}``.
            distributed (bool): if ``True``, starts training in distributed
                mode. Note: works only with python scripts; no jupyter
                support.
            check (bool): if ``True``, only checks that the pipeline is
                working (3 epochs only)
            timeit (bool): if ``True``, computes the execution time of the
                training process and displays it to the console
            load_best_on_end (bool): if ``True``, the Runner will load the
                best checkpoint state (model, optimizer, etc.) according to
                validation metrics. Requires a specified ``logdir``.
            initial_seed (int): experiment's initial seed value
        """
        # if ``fp16=True`` was passed, expand it to the default apex params
        if isinstance(fp16, bool) and fp16:
            fp16 = {"opt_level": "O1"}

        # attach a CheckpointCallback when resuming from a checkpoint
        # or when the best checkpoint should be loaded at the end
        if resume is not None or load_best_on_end:
            load_on_stage_end = None
            if load_best_on_end:
                load_on_stage_end = "best_full"
                assert logdir is not None, (
                    "For ``load_best_on_end`` feature "
                    "you need to specify ``logdir``"
                )

            callbacks = utils.sort_callbacks_by_order(callbacks)
            checkpoint_callback_flag = any(
                isinstance(x, CheckpointCallback) for x in callbacks.values()
            )
            if not checkpoint_callback_flag:
                callbacks["loader"] = CheckpointCallback(
                    resume=resume, load_on_stage_end=load_on_stage_end,
                )
            else:
                raise NotImplementedError("CheckpointCallback already exist")

        experiment = self._experiment_fn(
            stage="train",
            model=model,
            datasets=datasets,
            loaders=loaders,
            callbacks=callbacks,
            logdir=logdir,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            num_epochs=num_epochs,
            valid_loader=valid_loader,
            main_metric=main_metric,
            minimize_metric=minimize_metric,
            verbose=verbose,
            check_time=timeit,
            check_run=check,
            state_kwargs=state_kwargs,
            checkpoint_data=checkpoint_data,
            distributed_params=fp16,
            initial_seed=initial_seed,
        )
        self.experiment = experiment
        utils.distributed_cmd_run(self.run_experiment, distributed)
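For contrast with the ``datasets``-based distributed setup shown earlier, here is a minimal single-process sketch that passes prebuilt ``loaders`` directly to ``train``; the random-tensor data and the logdir name are illustrative, not part of the original example:

import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst.dl import SupervisedRunner

num_features = 10
X = torch.rand(int(1e4), num_features)
y = torch.rand(X.shape[0], 1)
loader = DataLoader(TensorDataset(X, y), batch_size=32)

model = torch.nn.Linear(num_features, 1)
runner = SupervisedRunner()
runner.train(
    model=model,
    criterion=torch.nn.MSELoss(),
    optimizer=torch.optim.Adam(model.parameters()),
    loaders={"train": loader, "valid": loader},  # loaders keyed by name
    logdir="./logs/example_loaders",  # illustrative path
    num_epochs=2,
    verbose=True,
)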
def main(args, unknown_args):
    """Runs the ``catalyst-dl run`` script."""
    utils.distributed_cmd_run(
        main_worker, args.distributed, args, unknown_args
    )