def _get_callbacks(self, stage: str):
    callbacks = self.experiment.get_callbacks(stage)

    # distributed run setting
    rank = utils.get_rank()
    if rank == 0:  # master node
        # remove worker-only callbacks on master node
        for k in list(
            filter(
                lambda c: callbacks[c].node == CallbackNode.Worker,
                callbacks
            )
        ):
            del callbacks[k]
    elif rank > 0:  # worker node
        # remove master-only callbacks on worker nodes
        for k in list(
            filter(
                lambda c: callbacks[c].node == CallbackNode.Master,
                callbacks
            )
        ):
            del callbacks[k]

    callbacks = utils.process_callbacks(callbacks)
    return callbacks

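# A minimal, self-contained sketch of the node-filtering idea above.
# ``Node`` and ``filter_by_node`` are hypothetical stand-ins for
# ``CallbackNode`` and the loop in ``_get_callbacks``, not Catalyst API:
from enum import IntEnum


class Node(IntEnum):  # stand-in for ``CallbackNode``
    All = 0
    Master = 1
    Worker = 2


def filter_by_node(callbacks: dict, rank: int) -> dict:
    """Keep only the callbacks that should run on this process's node."""
    if rank < 0:  # non-distributed run: keep everything
        return dict(callbacks)
    # master (rank 0) drops worker-only callbacks; workers drop master-only
    drop = Node.Worker if rank == 0 else Node.Master
    return {k: cb for k, cb in callbacks.items() if cb.node != drop}
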
def validate_loaders(loaders: Dict[str, DataLoader]) -> Dict[str, DataLoader]:
    """
    Checks pytorch dataloaders for distributed setup.
    Transfers them to distributed mode if necessary.
    (Experimental feature)

    Args:
        loaders (Dict[str, DataLoader]): dictionary with pytorch dataloaders

    Returns:
        Dict[str, DataLoader]: dictionary with pytorch dataloaders
            (with distributed samplers if necessary)
    """
    rank = get_rank()
    if rank >= 0:
        for key, value in loaders.items():
            if not isinstance(
                value.sampler, (DistributedSampler, DistributedSamplerWrapper)
            ):
                warnings.warn(
                    "With distributed training setup, "
                    "you need ``DistributedSampler`` for your ``DataLoader``. "
                    "Transferring to distributed mode. (Experimental feature)"
                )
                loaders[key] = _force_make_distributed_loader(value)
    return loaders

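# ``_force_make_distributed_loader`` is defined elsewhere; as a rough,
# hypothetical sketch (not the actual implementation), such a conversion
# could rebuild the loader around a ``DistributedSampler`` like this:
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def _force_make_distributed_loader_sketch(loader: DataLoader) -> DataLoader:
    """Rebuild ``loader`` with a ``DistributedSampler`` (illustrative only)."""
    sampler = DistributedSampler(dataset=loader.dataset)
    return DataLoader(
        dataset=loader.dataset,
        batch_size=loader.batch_size,
        num_workers=loader.num_workers,
        sampler=sampler,  # replaces the original, non-distributed sampler
        drop_last=loader.drop_last,
    )
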
def _get_logdir(self, config: Dict) -> str:
    timestamp = utils.get_utcnow_time()
    config_hash = utils.get_short_hash(config)
    logdir = f"{timestamp}.{config_hash}"
    distributed_rank = get_rank()
    if distributed_rank > -1:
        logdir = f"{logdir}.rank{distributed_rank:02d}"
    return logdir

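# A small illustration of the naming scheme above; the timestamp and hash
# values are made up (the exact format of ``utils.get_utcnow_time`` is an
# assumption here):
for rank in (-1, 0, 1):
    logdir = "20200101-120000.a1b2c3"
    if rank > -1:
        logdir = f"{logdir}.rank{rank:02d}"
    print(logdir)
# -> 20200101-120000.a1b2c3           (non-distributed run)
# -> 20200101-120000.a1b2c3.rank00    (master)
# -> 20200101-120000.a1b2c3.rank01    (worker)
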
def get_callbacks(self, stage: str) -> "OrderedDict[str, Callback]":
    """Returns the callbacks for a given stage."""
    callbacks_params = (
        self.stages_config[stage].get("callbacks_params", {})
    )

    callbacks = OrderedDict()
    for key, callback_params in callbacks_params.items():
        callback = self._get_callback(**callback_params)
        callbacks[key] = callback

    # ! For compatibility with previous versions.
    default_callbacks = []
    if self._verbose:
        default_callbacks.append(("verbose", VerboseLogger))
    if not stage.startswith("infer"):
        default_callbacks.append(("_criterion", CriterionCallback))
        default_callbacks.append(("_optimizer", OptimizerCallback))
        if self.stages_config[stage].get("scheduler_params", {}):
            default_callbacks.append(("_scheduler", SchedulerCallback))
        default_callbacks.append(("_saver", CheckpointCallback))
    default_callbacks.append(("console", ConsoleLogger))
    default_callbacks.append(("tensorboard", TensorboardLogger))
    default_callbacks.append(("exception", RaiseExceptionCallback))

    for callback_name, callback_fn in default_callbacks:
        is_already_present = False
        for x in callbacks.values():
            if isinstance(x, PhaseWrapperCallback):
                x = x.callback
            if isinstance(x, callback_fn):
                is_already_present = True
                break
        if not is_already_present:
            callbacks[callback_name] = callback_fn()

    # remove LoggerCallback and ConfusionMatrixCallback on worker nodes
    if get_rank() > 0:
        to_del = (LoggerCallback, ConfusionMatrixCallback)
        for k in list(
            filter(lambda c: isinstance(callbacks[c], to_del), callbacks)
        ):
            del callbacks[k]

    return callbacks

def main_worker(args, unknown_args):
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.get("runner_params", {})
    experiment = Experiment(config)
    runner = Runner(**runner_params)

    # dump environment and code only on the master node
    # (``get_rank()`` is -1 in non-distributed runs, 0 on the master)
    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)

def _get_callbacks(self, stage: str):
    callbacks = self.experiment.get_callbacks(stage)

    # remove master-only callbacks on worker nodes
    if utils.get_rank() > 0:
        for k in list(
            filter(
                lambda c: issubclass(
                    callbacks[c].__class__, MasterOnlyCallback
                ),
                callbacks
            )
        ):
            del callbacks[k]

    # split callbacks into loggers and everything else
    loggers = utils.process_callbacks(
        OrderedDict([
            (k, v) for k, v in callbacks.items()
            if issubclass(v.__class__, LoggerCallback)
        ])
    )
    callbacks = utils.process_callbacks(
        OrderedDict([
            (k, v) for k, v in callbacks.items()
            if not issubclass(v.__class__, LoggerCallback)
        ])
    )
    return callbacks, loggers

def on_epoch_end(self, state: _State):
    # checkpoints are saved neither during inference nor on worker nodes
    if state.stage.startswith("infer") or get_rank() > 0:
        return

    valid_metrics = dict(state.metric_manager.valid_values)
    epoch_metrics = dict(state.metric_manager.epoch_values)

    checkpoint = utils.pack_checkpoint(
        model=state.model,
        criterion=state.criterion,
        optimizer=state.optimizer,
        scheduler=state.scheduler,
        epoch_metrics=epoch_metrics,
        valid_metrics=valid_metrics,
        stage=state.stage,
        stage_epoch=state.stage_epoch_log,
        epoch=state.epoch_log,
        checkpoint_data=state.checkpoint_data,
    )
    self.process_checkpoint(
        logdir=state.logdir,
        checkpoint=checkpoint,
        is_best=state.metric_manager.is_best,
        main_metric=state.main_metric,
        minimize_metric=state.minimize_metric,
    )

def __init__(
    self,
    save_n_best: int = 1,
    resume: str = None,
    resume_dir: str = None,
    metric_filename: str = "_metrics.json",
):
    """
    Args:
        save_n_best (int): number of best checkpoints to keep
        resume (str): path to checkpoint to load
            and initialize runner state
        resume_dir (str): directory with checkpoint
            to load and initialize runner state
        metric_filename (str): filename to save metrics
            in checkpoint folder.
            Must end with ``.json`` or ``.yml``
    """
    super().__init__(metric_filename)
    self.save_n_best = save_n_best
    self.resume = resume
    self.resume_dir = resume_dir
    self.is_distributed_worker = utils.get_rank() > 0

    self.top_best_metrics = []
    self.epochs_metrics = []
    self._keys_from_state = ["resume", "resume_dir"]

def main_worker(args, unknown_args):
    """@TODO: Docs. Contribution is welcome."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    experiment_fn, runner_fn = utils.import_experiment_and_runner(
        Path(args.expdir)
    )
    if experiment_fn is None:
        # fall back to a registered experiment from the config
        experiment_params = config.get("experiment_params", {})
        experiment = experiment_params.get("experiment", "Experiment")
        experiment_fn = EXPERIMENTS.get(experiment)

    runner_params = config.get("runner_params", {})
    experiment = experiment_fn(config)
    runner = runner_fn(**runner_params)

    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)

def __init__(
    self,
    *,
    device: Device = None,
    model: StateModel = None,
    criterion: StateCriterion = None,
    optimizer: StateOptimizer = None,
    scheduler: StateScheduler = None,
    callbacks: Dict[str, "Callback"] = None,
    logdir: str = None,
    stage: str = STAGE_INFER_PREFIX,
    num_epochs: int = None,
    main_metric: str = STATE_MAIN_METRIC,
    minimize_metric: bool = True,
    valid_loader: str = LOADER_VALID_PREFIX,
    checkpoint_data: Dict = None,
    is_check_run: bool = False,
    **kwargs,
):
    # main part
    # data
    self.loaders: OrderedDict[str, DataLoader] = None
    # components
    self.model: StateModel = model
    self.criterion: StateCriterion = criterion
    self.optimizer: StateOptimizer = optimizer
    self.scheduler: StateScheduler = scheduler
    # extra components - PyTorch device
    self.device: Device = device
    # extra components - Catalyst callbacks
    self.callbacks: Dict[str, "Callback"] = callbacks

    # dataflow - model input, model output, metrics
    self.batch_in = None
    self.batch_out = None
    # let's use flatten storage for batch metrics
    # batch_metrics = {'loss': ..., 'accuracy': ..., 'iou': ...}
    self.batch_metrics = defaultdict(None)
    # just aggregated (aka mean over all batches)
    # batch statistics for loader
    # and global loader metrics, like AUC
    # loader_metrics = {'loss': ..., 'accuracy': ..., 'auc': ...}
    self.loader_metrics = defaultdict(None)
    # summarized metrics for different loaders
    # and global epoch metrics, like lr, momentum
    # epoch_metrics = {
    #     'train_loss': ..., 'train_auc': ..., 'valid_loss': ...,
    #     'lr': ..., 'momentum': ...,
    # }
    self.epoch_metrics = defaultdict(None)

    # validation
    self.is_best_valid = False
    self.valid_metrics = defaultdict(None)
    self.best_valid_metrics = defaultdict(None)

    # pipeline info
    self.distributed_rank = utils.get_rank()
    self.is_distributed_worker = self.distributed_rank > 0
    self.stage_name: str = stage
    self.epoch: int = 1
    self.num_epochs: int = num_epochs or np.iinfo(np.int32).max

    self.loader_name: str = None
    self.loader_step: int = 0
    self.loader_len: int = 0
    self.batch_size: int = 0

    self.global_step: int = 0
    self.global_epoch: int = 1

    # metrics & validation
    self.main_metric: str = main_metric
    self.minimize_metric: bool = minimize_metric
    self.valid_loader: str = valid_loader

    # logging
    self.logdir: Path = Path(logdir) if logdir is not None else None
    # extra checkpoint data for saving in checkpoint files
    self.checkpoint_data: Dict = checkpoint_data or {}

    # other
    self.is_check_run: bool = is_check_run
    self.is_train_loader: bool = False
    self.is_infer_stage: bool = \
        self.stage_name.startswith(STAGE_INFER_PREFIX)
    self.need_early_stop: bool = False
    self.need_exception_reraise: bool = True
    self.exception: Optional[Exception] = None

    # kwargs
    for k, v in kwargs.items():
        setattr(self, k, v)

    self._freeze()

def get_loaders_from_params(
    batch_size: int = 1,
    num_workers: int = 0,
    drop_last: bool = False,
    per_gpu_scaling: bool = False,
    loaders_params: Dict[str, Any] = None,
    samplers_params: Dict[str, Any] = None,
    initial_seed: int = 42,
    get_datasets_fn: Callable = None,
    **data_params,
) -> "OrderedDict[str, DataLoader]":
    """
    Creates pytorch dataloaders from datasets and additional parameters.

    Args:
        batch_size (int): ``batch_size`` parameter
            from ``torch.utils.data.DataLoader``
        num_workers (int): ``num_workers`` parameter
            from ``torch.utils.data.DataLoader``
        drop_last (bool): ``drop_last`` parameter
            from ``torch.utils.data.DataLoader``
        per_gpu_scaling (bool): boolean flag,
            if ``True``, uses ``batch_size=batch_size*num_available_gpus``
        loaders_params (Dict[str, Any]): additional loaders parameters
        samplers_params (Dict[str, Any]): additional sampler parameters
        initial_seed (int): initial seed for ``torch.utils.data.DataLoader``
            workers
        get_datasets_fn (Callable): callable function to get dictionary with
            ``torch.utils.data.Datasets``
        **data_params: additional data parameters
            or dictionary with ``torch.utils.data.Datasets`` to use for
            pytorch dataloaders creation

    Returns:
        OrderedDict[str, DataLoader]: dictionary with
            ``torch.utils.data.DataLoader``

    Raises:
        NotImplementedError: if datasource is out of ``Dataset`` or dict
        ValueError: if batch_sampler option is mutually
            exclusive with distributed
    """
    default_batch_size = batch_size
    default_num_workers = num_workers
    loaders_params = loaders_params or {}
    assert isinstance(loaders_params, dict), (
        f"`loaders_params` should be a Dict. Got: {loaders_params}"
    )
    samplers_params = samplers_params or {}
    assert isinstance(samplers_params, dict), (
        f"`samplers_params` should be a Dict. Got: {samplers_params}"
    )

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    if get_datasets_fn is not None:
        datasets = get_datasets_fn(**data_params)
    else:
        datasets = dict(**data_params)

    loaders = OrderedDict()
    for name, datasource in datasets.items():  # noqa: WPS426
        assert isinstance(datasource, (Dataset, dict)), (
            f"{datasource} should be Dataset or Dict. Got: {datasource}"
        )

        loader_params = loaders_params.pop(name, {})
        assert isinstance(loader_params, dict), (
            f"{loader_params} should be Dict"
        )

        sampler_params = samplers_params.pop(name, None)
        if sampler_params is None:
            if isinstance(datasource, dict) and "sampler" in datasource:
                sampler = datasource.pop("sampler", None)
            else:
                sampler = None
        else:
            sampler = SAMPLER.get_from_params(**sampler_params)
            if isinstance(datasource, dict) and "sampler" in datasource:
                datasource.pop("sampler", None)

        batch_size = loader_params.pop("batch_size", default_batch_size)
        num_workers = loader_params.pop("num_workers", default_num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **loader_params,
        }

        if isinstance(datasource, Dataset):
            loader_params["dataset"] = datasource
        elif isinstance(datasource, dict):
            assert "dataset" in datasource, (
                "You need to specify dataset for dataloader"
            )
            loader_params = merge_dicts(datasource, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None
        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError(
                    "batch_sampler option is mutually "
                    "exclusive with distributed"
                )
            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = lambda x: set_global_seed(
                initial_seed + x
            )

        loaders[name] = DataLoader(**loader_params)

    return loaders

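# A minimal usage sketch for ``get_loaders_from_params``; the datasets here
# are synthetic and purely illustrative:
import torch
from torch.utils.data import TensorDataset

train_ds = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))
valid_ds = TensorDataset(torch.randn(16, 3), torch.randint(0, 2, (16,)))

loaders = get_loaders_from_params(
    batch_size=8,
    num_workers=0,
    train=train_ds,  # ``**data_params`` entries become loader names
    valid=valid_ds,
)
# in a non-distributed run the "train" loader is shuffled, while names
# not starting with "train" (here, "valid") keep the dataset order
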
def get_loaders(
    self,
    stage: str,
    epoch: int = None,
) -> "OrderedDict[str, DataLoader]":
    """Returns the loaders for a given stage."""
    data_params = dict(self.stages_config[stage]["data_params"])

    batch_size = data_params.pop("batch_size", 1)
    num_workers = data_params.pop("num_workers")
    drop_last = data_params.pop("drop_last", False)
    per_gpu_scaling = data_params.pop("per_gpu_scaling", False)
    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    datasets = self.get_datasets(stage=stage, **data_params)

    overridden_loaders_params = data_params.pop("loaders_params", {})
    assert isinstance(overridden_loaders_params, dict), (
        f"`overridden_loaders_params` should be a Dict. "
        f"Got: {overridden_loaders_params}"
    )

    samplers_params = data_params.pop("samplers_params", {})
    assert isinstance(samplers_params, dict), \
        f"`samplers_params` should be a Dict. Got: {samplers_params}"

    loaders = OrderedDict()
    for name, ds_ in datasets.items():
        assert isinstance(ds_, (Dataset, dict)), \
            f"{ds_} should be Dataset or Dict"

        overridden_loader_params = overridden_loaders_params.pop(name, {})
        assert isinstance(overridden_loader_params, dict), \
            f"{overridden_loader_params} should be Dict"

        sampler_params = samplers_params.pop(name, None)
        if sampler_params is None:
            if isinstance(ds_, dict) and "sampler" in ds_:
                sampler = ds_.pop("sampler", None)
            else:
                sampler = None
        else:
            sampler = SAMPLERS.get_from_params(**sampler_params)
            if isinstance(ds_, dict) and "sampler" in ds_:
                ds_.pop("sampler", None)

        batch_size = overridden_loader_params.pop("batch_size", batch_size)
        num_workers = overridden_loader_params.pop("num_workers", num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **overridden_loader_params,
        }

        if isinstance(ds_, Dataset):
            loader_params["dataset"] = ds_
        elif isinstance(ds_, dict):
            assert "dataset" in ds_, \
                "You need to specify dataset for dataloader"
            loader_params = utils.merge_dicts(ds_, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    # wrap the sampler here so that the wrapper
                    # (not the original sampler) is passed below
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(
                    dataset=loader_params["dataset"]
                )

        loader_params["shuffle"] = (
            name.startswith("train") and sampler is None
        )
        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError(
                    "batch_sampler option is mutually "
                    "exclusive with distributed"
                )
            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = \
                lambda x: utils.set_global_seed(self.initial_seed + x)

        loaders[name] = DataLoader(**loader_params)

    return loaders

def _get_optimizer(
    self,
    stage: str,
    model: Union[Model, Dict[str, Model]],
    **params,
) -> Optimizer:
    # @TODO 1: refactoring; this method is too long
    # @TODO 2: load state dicts for schedulers & criterion
    layerwise_params = params.pop("layerwise_params", OrderedDict())
    no_bias_weight_decay = params.pop("no_bias_weight_decay", True)

    # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf
    lr_scaling_params = params.pop("lr_linear_scaling", None)
    if lr_scaling_params:
        data_params = dict(self.stages_config[stage]["data_params"])
        batch_size = data_params.get("batch_size")
        per_gpu_scaling = data_params.get("per_gpu_scaling", False)
        distributed_rank = get_rank()
        distributed = distributed_rank > -1
        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus

        base_lr = lr_scaling_params.get("lr")
        base_batch_size = lr_scaling_params.get("base_batch_size", 256)
        lr_scaling = batch_size / base_batch_size
        params["lr"] = base_lr * lr_scaling  # scale default lr
    else:
        lr_scaling = 1.0

    # getting model parameters
    model_key = params.pop("_model", None)
    if model_key is None:
        assert isinstance(model, nn.Module), \
            "model is key-value, but optimizer has no specified model"
        model_params = utils.process_model_params(
            model, layerwise_params, no_bias_weight_decay, lr_scaling
        )
    elif isinstance(model_key, str):
        model_params = utils.process_model_params(
            model[model_key],
            layerwise_params,
            no_bias_weight_decay,
            lr_scaling,
        )
    elif isinstance(model_key, (list, tuple)):
        model_params = []
        for model_key_ in model_key:
            model_params_ = utils.process_model_params(
                model[model_key_],
                layerwise_params,
                no_bias_weight_decay,
                lr_scaling,
            )
            model_params.extend(model_params_)
    else:
        raise ValueError("unknown type of model_params")

    load_from_previous_stage = params.pop("load_from_previous_stage", False)
    optimizer_key = params.pop("optimizer_key", None)
    optimizer = OPTIMIZERS.get_from_params(**params, params=model_params)

    if load_from_previous_stage and self.stages.index(stage) != 0:
        checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth"
        checkpoint = utils.load_checkpoint(checkpoint_path)

        dict2load = optimizer
        if optimizer_key is not None:
            dict2load = {optimizer_key: optimizer}
        utils.unpack_checkpoint(checkpoint, optimizer=dict2load)

        # move optimizer to device
        device = utils.get_device()
        for param in model_params:
            param = param["params"][0]
            state = optimizer.state[param]
            for key, value in state.items():
                state[key] = utils.any2device(value, device)

        # update optimizer params
        for key, value in params.items():
            for pg in optimizer.param_groups:
                pg[key] = value

    return optimizer

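# A small worked example of the linear LR scaling rule used above
# (https://arxiv.org/pdf/1706.02677.pdf); the numbers are illustrative:
base_lr = 0.1          # ``lr`` from ``lr_linear_scaling``
base_batch_size = 256  # the default reference batch size
batch_size = 1024      # effective batch size of the run

lr = base_lr * (batch_size / base_batch_size)
print(lr)  # 0.4 -- a 4x larger batch yields a 4x larger learning rate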