def set_requires_grad(model: Model, requires_grad: Union[bool, Dict[str, bool]]):
    """Sets the ``requires_grad`` value for all model parameters.

    Example::

        >>> model = SimpleModel()
        >>> set_requires_grad(model, requires_grad=True)
        >>> # or, per-parameter
        >>> model = SimpleModel()
        >>> set_requires_grad(model, requires_grad={"conv1.weight": True})

    Args:
        model (torch.nn.Module): model
        requires_grad (Union[bool, Dict[str, bool]]): value
    """
    if isinstance(requires_grad, dict):
        for name, param in model.named_parameters():
            assert (
                name in requires_grad
            ), f"Parameter `{name}` does not exist in requires_grad"
            param.requires_grad = requires_grad[name]
    else:
        requires_grad = bool(requires_grad)
        for param in model.parameters():
            param.requires_grad = requires_grad
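# Usage sketch (not from the original source): per-parameter freezing with the
# dict form of `set_requires_grad`. The two-layer `nn.Sequential` model below
# is a made-up stand-in for `SimpleModel`.
def _example_set_requires_grad_per_parameter():
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
    # Freeze the first linear layer, keep the second one trainable;
    # the dict must contain an entry for every named parameter.
    mask = {
        name: not name.startswith("0.") for name, _ in model.named_parameters()
    }
    set_requires_grad(model, requires_grad=mask)
    assert not model[0].weight.requires_grad
    assert model[1].weight.requires_grad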
def grad_norm(*, model: Model, prefix: str, norm_type: int) -> Dict:
    """Computes gradient norms for a given model.

    Args:
        model (Model): model whose gradient norms will be computed.
        prefix (str): prefix for keys in the resulting dictionary.
        norm_type (int): order of the norm used for the gradients.

    Returns:
        Dict: dictionary in which gradient norms are stored.
    """
    if isinstance(model, (DataParallel, DistributedDataParallel)):
        model = model.module

    total_norm = 0.0
    grad_norm = {}
    for tag, value in model.named_parameters():
        tag = tag.replace(".", "/")
        metrics_tag = f"{prefix}/{tag}"
        param_norm = value.grad.data.norm(norm_type).item()
        total_norm += param_norm ** norm_type
        grad_norm[metrics_tag] = param_norm

    total_norm = total_norm ** (1.0 / norm_type)
    metrics_tag = f"{prefix}/total"
    grad_norm[metrics_tag] = total_norm

    return grad_norm
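# Usage sketch (assumption): logging per-layer L2 gradient norms right after
# `loss.backward()`. The tiny model, random data and "train" prefix are
# illustrative only.
def _example_grad_norm():
    import torch
    import torch.nn as nn

    model = nn.Linear(10, 1)
    x, y = torch.randn(8, 10), torch.randn(8, 1)

    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()  # gradients must exist before calling grad_norm

    norms = grad_norm(model=model, prefix="train", norm_type=2)
    # e.g. {"train/weight": ..., "train/bias": ..., "train/total": ...}
    print(norms)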
def run_inference(
    config: Dict,
    *,
    model: Model,
    checkpoint: str,
    loader: torch.utils.data.DataLoader,
) -> np.ndarray:
    """Runs inference for ``model`` over ``loader`` (restoring weights from
    ``checkpoint``) and returns the concatenated outputs as a numpy array.
    ``config`` is accepted but not used here.
    """
    model.eval()
    runner = CustomRunner()
    embeddings = []
    with tqdm(desc="Running inference", total=len(loader.dataset)) as tq:
        for batch in runner.predict_loader(
            model=model, loader=loader, resume=checkpoint
        ):
            embeddings.append(batch)
            tq.update(batch.size(0))
    return torch.cat(embeddings).cpu().numpy()
def trace_model(
    model: Model,
    predict_fn: Callable,
    batch=None,
    method_name: str = "forward",
    mode: str = "eval",
    requires_grad: bool = False,
    opt_level: str = None,
    device: Device = "cpu",
    predict_params: dict = None,
) -> jit.ScriptModule:
    """Traces model using ``predict_fn`` and batch.

    Args:
        model: Model to trace
        predict_fn: Function to run prediction with the model provided,
            takes model and inputs as parameters
        batch: Batch to trace the model
        method_name (str): Model's method name that will be used
            as entrypoint during tracing
        mode (str): Mode for model to trace (``train`` or ``eval``)
        requires_grad (bool): Flag to use grads
        opt_level (str): Apex FP16 init level, optional
        device (str): Torch device
        predict_params (dict): additional parameters for model forward

    Returns:
        jit.ScriptModule: Traced model

    Raises:
        ValueError: if ``batch`` or ``predict_fn`` is not specified,
            or if ``mode`` is not ``'eval'`` or ``'train'``
    """
    if batch is None or predict_fn is None:
        raise ValueError("Both batch and predict_fn must be specified.")
    if mode not in ["train", "eval"]:
        raise ValueError(f"Unknown mode '{mode}'. Must be 'eval' or 'train'")

    predict_params = predict_params or {}

    tracer = _TracingModelWrapper(model, method_name)
    if opt_level is not None:
        assert_fp16_available()
        # If traced in AMP we need to initialize the model before calling
        # the jit
        # https://github.com/NVIDIA/apex/issues/303#issuecomment-493142950
        from apex import amp

        model = model.to(device)
        model = amp.initialize(model, optimizers=None, opt_level=opt_level)

    getattr(model, mode)()
    set_requires_grad(model, requires_grad=requires_grad)

    predict_fn(tracer, batch, **predict_params)

    return tracer.tracing_result
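# Usage sketch (assumption): tracing a small model with a `predict_fn` that
# simply calls the (wrapped) model on the batch. `trace_model` passes its
# internal tracing wrapper as the first argument to `predict_fn`; the
# `nn.Linear` model and batch shape are illustrative.
def _example_trace_model():
    import torch
    import torch.nn as nn

    model = nn.Linear(10, 2)
    batch = torch.randn(4, 10)

    def predict_fn(wrapped_model, inputs):
        return wrapped_model(inputs)

    traced = trace_model(model=model, predict_fn=predict_fn, batch=batch)
    # The traced module can then be saved without its Python class, e.g.
    # torch.jit.save(traced, "traced_model.pt")
    return traced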
def process_model_params(
    model: Model,
    layerwise_params: Dict[str, dict] = None,
    no_bias_weight_decay: bool = True,
    lr_scaling: float = 1.0,
) -> List[Union[torch.nn.Parameter, dict]]:
    """Gathers model parameters for ``torch.optim.Optimizer``.

    Args:
        model (torch.nn.Module): Model to process
        layerwise_params (Dict): Order-sensitive dict where
            each key is a regex pattern and each value holds the
            layer-wise options for layers matching that pattern
        no_bias_weight_decay (bool): If true, removes weight_decay
            for all ``bias`` parameters in the model
        lr_scaling (float): layer-wise learning rate scaling,
            if 1.0, learning rates will not be scaled

    Returns:
        iterable: parameters for an optimizer

    Example::

        >>> model = catalyst.contrib.models.segmentation.ResnetUnet()
        >>> layerwise_params = collections.OrderedDict([
        >>>     ("conv1.*", dict(lr=0.001, weight_decay=0.0003)),
        >>>     ("conv.*", dict(lr=0.002))
        >>> ])
        >>> params = process_model_params(model, layerwise_params)
        >>> optimizer = torch.optim.Adam(params, lr=0.0003)
    """
    params = list(model.named_parameters())
    layerwise_params = layerwise_params or collections.OrderedDict()

    model_params = []
    for name, parameters in params:
        options = {}
        for pattern, options_ in layerwise_params.items():
            if re.match(pattern, name) is not None:
                # all new LR rules write on top of the old ones
                options = merge_dicts(options, options_)

        # no bias decay from https://arxiv.org/abs/1812.01187
        if no_bias_weight_decay and name.endswith("bias"):
            options["weight_decay"] = 0.0

        # lr linear scaling from https://arxiv.org/pdf/1706.02677.pdf
        if "lr" in options:
            options["lr"] *= lr_scaling

        model_params.append({"params": parameters, **options})

    return model_params
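# Usage sketch (assumption): building optimizer parameter groups where biases
# get no weight decay and matched layers get a scaled learning rate. The small
# model and the "0.*" regex (matching the first Sequential layer) are made up.
def _example_process_model_params():
    import collections

    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
    layerwise_params = collections.OrderedDict(
        [("0.*", dict(lr=1e-3, weight_decay=3e-4))]
    )
    params = process_model_params(model, layerwise_params, lr_scaling=2.0)
    # Parameters of layer "0" now carry lr=2e-3; every bias has weight_decay=0.
    optimizer = torch.optim.Adam(params, lr=3e-4)
    return optimizer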
def set_requires_grad(model: Model, requires_grad: bool):
    """Sets the ``requires_grad`` value for all model parameters.

    Example::

        >>> model = SimpleModel()
        >>> set_requires_grad(model, requires_grad=True)

    Args:
        model (torch.nn.Module): model
        requires_grad (bool): value
    """
    requires_grad = bool(requires_grad)
    for param in model.parameters():
        param.requires_grad = requires_grad
def get_requires_grad(model: Model):
    """Gets the ``requires_grad`` value for all model parameters.

    Example::

        >>> model = SimpleModel()
        >>> requires_grad = get_requires_grad(model)

    Args:
        model (torch.nn.Module): model

    Returns:
        requires_grad (Dict[str, bool]): value
    """
    requires_grad = {}
    for name, param in model.named_parameters():
        requires_grad[name] = param.requires_grad
    return requires_grad
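# Usage sketch (assumption): snapshotting parameter trainability before a
# temporary full freeze and restoring it afterwards. The small model is
# illustrative; restoring is done manually so the example does not depend on
# which `set_requires_grad` variant is in scope.
def _example_requires_grad_roundtrip():
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
    snapshot = get_requires_grad(model)  # e.g. {"0.weight": True, ...}

    set_requires_grad(model, requires_grad=False)  # temporary freeze
    # ... grad-free evaluation or feature extraction would go here ...

    # Restore the original flags from the snapshot.
    for name, param in model.named_parameters():
        param.requires_grad = snapshot[name]
    assert get_requires_grad(model) == snapshot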
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device

    Returns:
        tuple with processed model, criterion, optimizer, scheduler
        and device.

    Raises:
        NotImplementedError: if model is not nn.Module or dict for multi-gpu,
            nn.ModuleDict for DataParallel not implemented yet
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())

    if device is None:
        device = get_device()
    elif isinstance(device, str):
        device = torch.device(device)

    is_apex_available = (
        distributed_params.pop("apex", True) and check_apex_available()
    )

    model: Model = maybe_recursive_call(model, "to", device=device)

    if check_ddp_wrapped(model):
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        assert isinstance(
            model, nn.Module
        ), "Distributed training is not available for KV model"

        local_rank = distributed_params.pop("local_rank", 0) or 0
        device = f"cuda:{local_rank}"
        model = maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if is_apex_available:
            import apex

            model, optimizer = initialize_apex(
                model, optimizer, **distributed_params
            )
            model = apex.parallel.DistributedDataParallel(model)

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)
        else:
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank
            )
    # data parallel run (dp) (with apex support)
    else:
        # apex issue https://github.com/deepset-ai/FARM/issues/210
        use_apex = (is_apex_available and torch.cuda.device_count() == 1) or (
            is_apex_available
            and torch.cuda.device_count() > 1
            and distributed_params.get("opt_level", "O0") == "O1"
        )

        if use_apex:
            assert isinstance(
                model, nn.Module
            ), "Apex training is not available for KV model"
            model, optimizer = initialize_apex(
                model, optimizer, **distributed_params
            )

        if (
            torch.cuda.device_count() > 1
            and device.type != "cpu"
            and device.index is None
        ):
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}
            else:
                raise NotImplementedError()

    model: Model = maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
def trace_model(
    model: Model,
    runner: "Runner",
    batch=None,
    method_name: str = "forward",
    mode: str = "eval",
    requires_grad: bool = False,
    opt_level: str = None,
    device: Device = "cpu",
    predict_params: dict = None,
) -> ScriptModule:
    """Traces model using runner and batch.

    Args:
        model: Model to trace
        runner: Model's native runner that was used to train model
        batch: Batch to trace the model
        method_name (str): Model's method name that will be used
            as entrypoint during tracing
        mode (str): Mode for model to trace (``train`` or ``eval``)
        requires_grad (bool): Flag to use grads
        opt_level (str): Apex FP16 init level, optional
        device (str): Torch device
        predict_params (dict): additional parameters for model forward

    Returns:
        ScriptModule: Traced model

    Raises:
        ValueError: if ``batch`` or ``runner`` is not specified,
            or if ``mode`` is not ``'eval'`` or ``'train'``
    """
    if batch is None or runner is None:
        raise ValueError("Both batch and runner must be specified.")
    if mode not in ["train", "eval"]:
        raise ValueError(f"Unknown mode '{mode}'. Must be 'eval' or 'train'")

    predict_params = predict_params or {}

    tracer = _TracingModelWrapper(model, method_name)
    if opt_level is not None:
        assert_fp16_available()
        # If traced in AMP we need to initialize the model before calling
        # the jit
        # https://github.com/NVIDIA/apex/issues/303#issuecomment-493142950
        from apex import amp

        model = model.to(device)
        model = amp.initialize(model, optimizers=None, opt_level=opt_level)
        # TODO: remove `check_trace=False`
        # after fixing this bug https://github.com/pytorch/pytorch/issues/23993
        params = {**predict_params, "check_trace": False}
    else:
        params = predict_params

    getattr(model, mode)()
    set_requires_grad(model, requires_grad=requires_grad)

    _runner_model, _runner_device = runner.model, runner.device

    runner.model, runner.device = tracer, device
    runner.predict_batch(batch, **params)
    result: ScriptModule = tracer.tracing_result

    runner.model, runner.device = _runner_model, _runner_device
    return result
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Args:
        model: torch model
        criterion: criterion function
        optimizer: optimizer
        scheduler: scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device

    Returns:
        tuple with processed model, criterion, optimizer, scheduler
        and device.

    Raises:
        ValueError: if device is None and TPU is available; to use a TPU
            you need to manually move model/optimizer/scheduler to the TPU
            device and pass the device to this function.
        NotImplementedError: if model is not nn.Module or dict for multi-gpu,
            nn.ModuleDict for DataParallel not implemented yet
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())

    if device is None and IS_XLA_AVAILABLE:
        raise ValueError(
            "TPU device is available. "
            "Please move model, optimizer and scheduler (if present) "
            "to TPU device manually and specify a device or "
            "use CPU device."
        )

    if device is None:
        device = get_device()
    elif isinstance(device, str):
        device = torch.device(device)

    is_apex_enabled = (
        distributed_params.pop("apex", False) and check_apex_available()
    )

    is_amp_enabled = (
        distributed_params.get("amp", False) and check_amp_available()
    )

    if is_apex_enabled and is_amp_enabled:
        raise ValueError(
            "Both NVidia Apex and Torch.Amp are enabled. "
            "You must choose only one mixed precision backend"
        )

    model: Model = maybe_recursive_call(model, "to", device=device)

    if check_ddp_wrapped(model):
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        assert isinstance(
            model, nn.Module
        ), "Distributed training is not available for KV model"

        local_rank = distributed_params.pop("local_rank", 0) or 0
        device = f"cuda:{local_rank}"
        model = maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if is_apex_enabled:
            import apex

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)

            model, optimizer = initialize_apex(
                model, optimizer, **distributed_params
            )
            model = apex.parallel.DistributedDataParallel(model)
        else:
            if syncbn:
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank
            )
    # data parallel run (dp) (with apex support)
    else:
        # apex issue https://github.com/deepset-ai/FARM/issues/210
        use_apex = (is_apex_enabled and torch.cuda.device_count() == 1) or (
            is_apex_enabled
            and torch.cuda.device_count() > 1
            and distributed_params.get("opt_level", "O0") == "O1"
        )

        if use_apex:
            assert isinstance(
                model, nn.Module
            ), "Apex training is not available for KV model"
            model, optimizer = initialize_apex(
                model, optimizer, **distributed_params
            )

        if (
            torch.cuda.device_count() > 1
            and device.type != "cpu"
            and device.index is None
        ):
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}
            else:
                raise NotImplementedError()

    model: Model = maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
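# Usage sketch (assumption): single-process CPU/GPU setup where
# `process_components` picks the device and wraps the model when multiple GPUs
# are visible. The model, criterion and optimizer below are illustrative.
def _example_process_components():
    import torch
    import torch.nn as nn

    model = nn.Linear(10, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

    model, criterion, optimizer, scheduler, device = process_components(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        distributed_params={"apex": False},  # stay on plain fp32
    )
    # `model` is now on `device` (and DataParallel-wrapped on multi-GPU hosts).
    return model, criterion, optimizer, scheduler, device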