def _load_states_from_file_map(*, runner: "IRunner", load_map: Dict[str, str]) -> None:
    """
    Load state of a model, criterion, optimizer, scheduler
    from files specified in ``load_map``.

    Arguments:
        runner: current runner
        load_map (Dict[str, str]): dict with mappings to load.
            Expected keys - ``'model'``, ``'criterion'``,
            ``'optimizer'``, ``'scheduler'``; other keys will be ignored.
            Expected values - checkpoint states
            (``'best'``, ``'best_full'``, ``'last'``, ``'last_full'``)
            or a path to a checkpoint.
            **NOTE:** loading criterion, optimizer, and scheduler states
            requires a full checkpoint.

    Raises:
        FileNotFoundError: when a file/state specified in ``load_map``
            does not exist.
    """
    required_files = _required_files(runner.logdir, load_map)

    for filename in required_files.keys():
        if not os.path.isfile(filename):
            raise FileNotFoundError(f"No checkpoint found at {filename}!")

    # extracting parts from files
    for filename, parts_to_load in required_files.items():
        print(f"=> Loading {', '.join(parts_to_load)} from {filename}")
        checkpoint = load_checkpoint(filename)
        to_unpack = {part: getattr(runner, part) for part in parts_to_load}
        unpack_checkpoint(checkpoint, **to_unpack)
        print(f" loaded: {', '.join(parts_to_load)}")
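# Example of a ``load_map`` accepted by the helper above (a sketch based on the
# docstring; the explicit path is hypothetical). String values are checkpoint
# states resolved inside ``runner.logdir``; anything else is treated as a path.
# Criterion/optimizer/scheduler entries need a ``*_full`` checkpoint.
example_load_map = {
    "model": "best",
    "criterion": "best_full",
    "optimizer": "last_full",
    "scheduler": "/path/to/some_checkpoint_full.pth",  # hypothetical explicit path
}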
def load_model_from_path(model: Module, path: Path) -> None:
    """Load model weights from a checkpoint file or a checkpoint directory."""
    if os.path.isdir(path):
        # a directory was passed - fall back to the "best" checkpoint inside it
        load_model_from_path(model, path / "best.pth")
        return
    model_sd = load_checkpoint(path)
    try:
        # checkpoint produced by catalyst - contains "model_state_dict"
        unpack_checkpoint(model_sd, model=model)
    except KeyError:
        # plain state dict - load it directly
        model.load_state_dict(model_sd)
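# Usage sketch for the helper above (the model and the path are illustrative;
# assumes a Catalyst-style ``best.pth`` exists under the given directory):
from pathlib import Path

import torchvision

model = torchvision.models.resnet18(num_classes=10)
load_model_from_path(model, Path("logs/checkpoints"))  # resolves to logs/checkpoints/best.pth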
def predict_loader(
    self,
    *,
    loader: DataLoader,
    model: Model = None,
    resume: str = None,
    fp16: Union[Dict, bool] = None,
    initial_seed: int = 42,
) -> Generator:
    """
    Runs model inference on a PyTorch DataLoader and returns a python
    generator with model predictions from `runner.predict_batch`.

    Cleans up the experiment info to avoid possible collisions.
    Sets `is_train_loader` and `is_valid_loader` to `False`
    while keeping `is_infer_loader` as `True`.
    Moves the model to evaluation mode.

    Args:
        loader: loader to predict
        model: model to use for prediction
        resume: path to checkpoint to resume from
        fp16 (Union[Dict, bool]): fp16 usage flag
        initial_seed: seed to use before prediction

    Yields:
        batches with model predictions
    """
    if isinstance(fp16, bool) and fp16:
        fp16 = {"opt_level": "O1"}

    if model is not None:
        self.model = model
    assert self.model is not None

    if resume is not None:
        checkpoint = load_checkpoint(resume)
        unpack_checkpoint(checkpoint, model=self.model)

    self.experiment = None
    set_global_seed(initial_seed)
    (model, _, _, _, device) = process_components(  # noqa: WPS122
        model=self.model,
        distributed_params=fp16,
        device=self.device,
    )
    self._prepare_inner_state(
        stage="infer",
        model=model,
        device=device,
        is_train_loader=False,
        is_valid_loader=False,
        is_infer_loader=True,
    )
    maybe_recursive_call(self.model, "train", mode=False)

    set_global_seed(initial_seed)
    for batch in loader:
        yield self.predict_batch(batch)
def _load_checkpoint(*, filename, runner: "IRunner", load_full: bool = True) -> None:
    """
    Load checkpoint from a file.

    Arguments:
        filename: path to checkpoint
        runner: current runner
        load_full: if True (default), the criterion, optimizer, and scheduler
            states are loaded as well.
            The file should contain the keys required for loading the model
            (``'model_state_dict'``), criterion (``'criterion_state_dict'``,
            only for a full load), optimizer (``'optimizer_state_dict'``),
            scheduler (``'scheduler_state_dict'``).

    Raises:
        FileNotFoundError: when the file specified in ``filename``
            does not exist.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(f"No checkpoint found at {filename}!")

    print(f"=> Loading checkpoint {filename}")
    checkpoint = load_checkpoint(filename)

    if not runner.stage.startswith("infer") and load_full:
        runner.stage = checkpoint["stage"]
        runner.epoch = checkpoint["epoch"]
        runner.global_epoch = checkpoint["global_epoch"]
        # @TODO: should we also load
        # checkpoint_data, main_metric, minimize_metric, valid_loader?
        # epoch_metrics, valid_metrics?

    if load_full:
        unpack_checkpoint(
            checkpoint,
            model=runner.model,
            criterion=runner.criterion,
            optimizer=runner.optimizer,
            scheduler=runner.scheduler,
        )
        print(
            f"loaded state checkpoint {filename} "
            f"(global epoch {checkpoint['global_epoch']}, "
            f"epoch {checkpoint['epoch']}, "
            f"stage {checkpoint['stage']})"
        )
    else:
        unpack_checkpoint(checkpoint, model=runner.model)
        print(f"loaded model checkpoint {filename}")
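# For reference, the checkpoint keys this loader relies on (per the docstring
# and the code above); a non-full load only needs "model_state_dict":
full_checkpoint_keys = (
    "model_state_dict",
    "criterion_state_dict",
    "optimizer_state_dict",
    "scheduler_state_dict",
    "stage",
    "epoch",
    "global_epoch",
)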
def load_optimizer_from_checkpoint(
    optimizer: Optimizer,
    checkpoint_path: str,
    checkpoint_optimizer_key: str,
    model_parameters,
    optimizer_params,
) -> Optimizer:
    """
    Loads optimizer state from a checkpoint.

    Args:
        optimizer: optimizer
        checkpoint_path: path to checkpoint file
        checkpoint_optimizer_key: key of the optimizer state
            in the checkpoint state dict
        model_parameters: model parameters
        optimizer_params: optimizer config parameters

    Returns:
        optimizer loaded from checkpoint
    """
    checkpoint = load_checkpoint(checkpoint_path)
    dict2load = optimizer
    if checkpoint_optimizer_key is not None:
        dict2load = {checkpoint_optimizer_key: optimizer}
    unpack_checkpoint(checkpoint, optimizer=dict2load)
    # move optimizer to device
    device = get_device()
    for param in model_parameters:
        param = param["params"][0]
        optimizer_state = optimizer.state[param]
        for state_key, state_value in optimizer_state.items():
            optimizer_state[state_key] = any2device(state_value, device)
    # update optimizer params
    for key, value in optimizer_params.items():
        for optimizer_param_group in optimizer.param_groups:
            optimizer_param_group[key] = value
    return optimizer
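# Usage sketch for the function above (a minimal example with made-up paths and
# hyperparameters; assumes the checkpoint at ``checkpoint_path`` was saved by
# catalyst and contains an "optimizer_state_dict" entry):
import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# same structure as ``optimizer.param_groups`` / ``process_model_params`` output
model_parameters = [{"params": list(model.parameters())}]

optimizer = load_optimizer_from_checkpoint(
    optimizer,
    checkpoint_path="logs/checkpoints/best_full.pth",  # hypothetical path
    checkpoint_optimizer_key=None,  # single-optimizer checkpoint
    model_parameters=model_parameters,
    optimizer_params={"lr": 3e-4},  # applied to every param group after loading
)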
def _get_optimizer(
    self, stage: str, model: Union[Model, Dict[str, Model]], **params
) -> Optimizer:
    # @TODO 1: refactoring; this method is too long
    # @TODO 2: load state dicts for schedulers & criterion
    layerwise_params = params.pop("layerwise_params", OrderedDict())
    no_bias_weight_decay = params.pop("no_bias_weight_decay", True)

    # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf
    lr_scaling_params = params.pop("lr_linear_scaling", None)
    if lr_scaling_params:
        data_params = dict(self.stages_config[stage]["data_params"])
        batch_size = data_params.get("batch_size")
        per_gpu_scaling = data_params.get("per_gpu_scaling", False)
        distributed_rank = get_rank()
        distributed = distributed_rank > -1
        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus

        base_lr = lr_scaling_params.get("lr")
        base_batch_size = lr_scaling_params.get("base_batch_size", 256)
        lr_scaling = batch_size / base_batch_size
        params["lr"] = base_lr * lr_scaling  # scale default lr
    else:
        lr_scaling = 1.0

    # getting model parameters
    model_key = params.pop("_model", None)
    if model_key is None:
        assert isinstance(
            model, nn.Module
        ), "model is key-value, but optimizer has no specified model"
        model_params = process_model_params(
            model, layerwise_params, no_bias_weight_decay, lr_scaling
        )
    elif isinstance(model_key, str):
        model_params = process_model_params(
            model[model_key],
            layerwise_params,
            no_bias_weight_decay,
            lr_scaling,
        )
    elif isinstance(model_key, (list, tuple)):
        model_params = []
        for model_key_el in model_key:
            model_params_el = process_model_params(
                model[model_key_el],
                layerwise_params,
                no_bias_weight_decay,
                lr_scaling,
            )
            model_params.extend(model_params_el)
    else:
        raise ValueError("unknown type of model_params")

    load_from_previous_stage = params.pop("load_from_previous_stage", False)
    optimizer_key = params.pop("optimizer_key", None)
    optimizer = OPTIMIZERS.get_from_params(**params, params=model_params)

    if load_from_previous_stage and self.stages.index(stage) != 0:
        checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth"
        checkpoint = load_checkpoint(checkpoint_path)

        dict2load = optimizer
        if optimizer_key is not None:
            dict2load = {optimizer_key: optimizer}
        unpack_checkpoint(checkpoint, optimizer=dict2load)

        # move optimizer to device
        device = get_device()
        for param in model_params:
            param = param["params"][0]
            optimizer_state = optimizer.state[param]
            for state_key, state_value in optimizer_state.items():
                optimizer_state[state_key] = any2device(state_value, device)

        # update optimizer params
        for key, value in params.items():
            for optimizer_param_group in optimizer.param_groups:
                optimizer_param_group[key] = value

    return optimizer
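# Illustrative ``params`` dict as this method might receive it from the config
# (a sketch only - the key names mirror the ones popped above, all values are
# made up, and the exact registry/config schema depends on the catalyst version):
example_optimizer_params = {
    "optimizer": "Adam",  # name resolved through the OPTIMIZERS registry
    "lr": 1e-3,
    "layerwise_params": {"encoder*": {"lr": 1e-4}},  # per-layer overrides by name pattern
    "no_bias_weight_decay": True,
    "lr_linear_scaling": {"lr": 1e-3, "base_batch_size": 256},
    "load_from_previous_stage": True,
    "optimizer_key": None,  # key inside the checkpoint for multi-optimizer setups
}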
    criterion=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    logdir=LOGDIR,
    num_epochs=EPOCHS,
    fp16=tu.fp16_params,
    callbacks=callbacks,
    verbose=True,
    load_best_on_end=False,
    resume=RESUME,
)

model = make_model()
cp = load_checkpoint(f"{LOGDIR}/checkpoints/best.pth")
unpack_checkpoint(cp, model=model)
model = model.eval()

labels_fold = []
for b in tqdm(
    runner.predict_loader(model=model, loader=test_loader, fp16=tu.fp16_params),
    total=len(test_loader),
):
    labels_batch = nn.functional.softmax(b["logits"], dim=1).data.cpu().numpy()
    labels_fold.extend(list(labels_batch))

if labels_blend is None:
    labels_blend = np.array(labels_fold)
else:
    labels_blend += np.array(labels_fold)
def trace_model_from_runner(
    runner: "IRunner",
    checkpoint_name: str = None,
    method_name: str = "forward",
    mode: str = "eval",
    requires_grad: bool = False,
    opt_level: str = None,
    device: Device = "cpu",
) -> jit.ScriptModule:
    """
    Traces model using created experiment and runner.

    Args:
        runner: current runner
        checkpoint_name: name of model checkpoint to use;
            if None, traces the current model from the runner
        method_name: model's method name that will be used
            as entrypoint during tracing
        mode: mode for model to trace (``train`` or ``eval``)
        requires_grad: flag to trace with gradients
        opt_level: AMP FP16 init level
        device: torch device

    Returns:
        ScriptModule: traced model
    """
    logdir = runner.logdir
    model = get_nn_from_ddp_module(runner.model)

    if checkpoint_name is not None:
        dumped_checkpoint = pack_checkpoint(model=model)
        checkpoint_path = logdir / "checkpoints" / f"{checkpoint_name}.pth"
        checkpoint = load_checkpoint(filepath=checkpoint_path)
        unpack_checkpoint(checkpoint=checkpoint, model=model)

    # getting input names of args for method since we don't have Runner
    # and we don't know input_key to preprocess batch for method call
    fn = getattr(model, method_name)
    method_argnames = _get_input_argnames(fn=fn, exclude=["self"])

    batch = {}
    for name in method_argnames:
        # TODO: We don't know input_keys without runner
        assert name in runner.input, (
            "Input batch should contain the same keys as input argument "
            "names of `forward` function to be traced correctly"
        )
        batch[name] = runner.input[name]

    batch = any2device(batch, device)

    # dumping the previous state of the model, we will need it to restore
    device_dump, is_training_dump, requires_grad_dump = (
        runner.device,
        model.training,
        get_requires_grad(model),
    )

    model.to(device)

    # function to run prediction on batch
    def predict_fn(model: Model, inputs, **kwargs):  # noqa: WPS442
        return model(**inputs, **kwargs)

    traced_model = trace_model(
        model=model,
        predict_fn=predict_fn,
        batch=batch,
        method_name=method_name,
        mode=mode,
        requires_grad=requires_grad,
        opt_level=opt_level,
        device=device,
    )

    if checkpoint_name is not None:
        unpack_checkpoint(checkpoint=dumped_checkpoint, model=model)

    # restore the previous state of the model
    getattr(model, "train" if is_training_dump else "eval")()
    set_requires_grad(model, requires_grad_dump)
    model.to(device_dump)

    return traced_model
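# Usage sketch (assumes ``runner`` has just finished a ``train()`` call, so that
# ``runner.logdir`` and ``runner.input`` are populated; "best" is the name of a
# saved checkpoint under ``<logdir>/checkpoints``):
traced = trace_model_from_runner(runner=runner, checkpoint_name="best", mode="eval")
traced.save("traced_model.pth")  # standard torch.jit save for a ScriptModule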
def trace_model_from_checkpoint(
    logdir: Path,
    method_name: str,
    checkpoint_name: str,
    stage: str = None,
    loader: Union[str, int] = None,
    mode: str = "eval",
    requires_grad: bool = False,
    opt_level: str = None,
    device: Device = "cpu",
):
    """
    Traces model using the experiment and runner created from a logdir.

    Args:
        logdir (Union[str, Path]): path to Catalyst logdir with the model
        checkpoint_name: name of model checkpoint to use
        stage: experiment's stage name
        loader (Union[str, int]): experiment's loader name or its index
        method_name: model's method name that will be used
            as entrypoint during tracing
        mode: mode for model to trace (``train`` or ``eval``)
        requires_grad: flag to trace with gradients
        opt_level: AMP FP16 init level
        device: torch device

    Returns:
        the traced model
    """
    config_path = logdir / "configs" / "_config.json"
    checkpoint_path = logdir / "checkpoints" / f"{checkpoint_name}.pth"

    logger.info("Load config")
    config: Dict[str, dict] = load_config(config_path)

    # get expdir name
    config_expdir = Path(config["args"]["expdir"])
    # use the copy of expdir from the logs for reproducibility
    expdir = Path(logdir) / "code" / config_expdir.name

    logger.info("Import experiment and runner from logdir")
    experiment: ConfigExperiment = None
    experiment, runner, _ = prepare_config_api_components(expdir=expdir, config=config)

    logger.info(f"Load model state from checkpoints/{checkpoint_name}.pth")
    if stage is None:
        stage = list(experiment.stages)[0]

    model = experiment.get_model(stage)
    checkpoint = load_checkpoint(checkpoint_path)
    unpack_checkpoint(checkpoint, model=model)

    runner.model, runner.device = model, device

    if loader is None:
        loader = 0
    batch = get_native_batch_from_loaders(
        loaders=experiment.get_loaders(stage), loader=loader
    )

    # function to run prediction on batch
    def predict_fn(model, inputs, **kwargs):  # noqa: WPS442
        model_dump = runner.model
        runner.model = model
        result = runner.predict_batch(inputs, **kwargs)
        runner.model = model_dump
        return result

    logger.info("Tracing is running...")
    traced_model = trace_model(
        model=model,
        predict_fn=predict_fn,
        batch=batch,
        method_name=method_name,
        mode=mode,
        requires_grad=requires_grad,
        opt_level=opt_level,
        device=device,
    )
    logger.info("Done")

    return traced_model
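# Usage sketch (the logdir layout is the standard Catalyst config-API layout:
# ``<logdir>/configs/_config.json``, ``<logdir>/checkpoints/<name>.pth``,
# ``<logdir>/code/<expdir>``; the path below is hypothetical):
from pathlib import Path

traced = trace_model_from_checkpoint(
    logdir=Path("logs/experiment"),
    method_name="forward",
    checkpoint_name="best",
    loader="train",  # or an integer index into the stage loaders
)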
def main(args, _=None):
    """Run the ``catalyst-contrib text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")
    bert_level = args.bert_level

    if bert_level is not None:
        assert (
            args.output_hidden_states
        ), "You need hidden states output for level specification"

    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    if getattr(args, "in_huggingface", False):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface, config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if getattr(args, "in_model", None) is not None:
        checkpoint = load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = get_loader(
        df, open_fn, batch_size=batch_size, num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch_input in enumerate(dataloader):
            batch_input = any2device(batch_input, device)
            batch_output = model(**batch_input)
            mask = (
                batch_input["attention_mask"].unsqueeze(-1)
                if args.mask_for_max_length
                else None
            )

            if check_ddp_wrapped(model):
                # using several gpu
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states
            else:
                # using cpu or one gpu
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            batch_features = process_bert_output(
                bert_output=batch_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for layer_name, layer_value in batch_features.items():
                    if bert_level is not None and bert_level != layer_name:
                        continue
                    layer_name = (
                        layer_name
                        if isinstance(layer_name, str)
                        else f"{layer_name:02d}"
                    )
                    _, embedding_size = layer_value.shape
                    features[layer_name] = np.memmap(
                        f"{args.out_prefix}.{layer_name}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(
                idx * batch_size, min((idx + 1) * batch_size, num_samples)
            )
            for layer_name2, layer_value2 in batch_features.items():
                if bert_level is not None and bert_level != layer_name2:
                    continue
                layer_name2 = (
                    layer_name2
                    if isinstance(layer_name2, str)
                    else f"{layer_name2:02d}"
                )
                features[layer_name2][indices] = _detach(layer_value2)

    if args.force_save:
        for key, mmap in features.items():
            mmap.flush()
            np.save(f"{args.out_prefix}.{key}.force.npy", mmap, allow_pickle=False)