def _get_experiment_components(
    experiment: IExperiment, stage: str = None, device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Inner method for `Experiment` components preparation.

    Check available torch device, takes model from the experiment
    and creates stage-specified criterion, optimizer, scheduler for it.

    Args:
        experiment: experiment instance to take the components from
        stage: experiment stage name of interest
            like "pretrain" / "train" / "finetune" / etc
        device: torch device to place the components on;
            when ``None`` it is resolved by ``process_components``

    Returns:
        tuple: model, criterion, optimizer, scheduler and device
        for a given stage and model
    """
    # components are created in dependency order:
    # optimizer needs the model, scheduler needs the optimizer
    model = experiment.get_model(stage)
    criterion = experiment.get_criterion(stage)
    optimizer = experiment.get_optimizer(stage, model)
    scheduler = experiment.get_scheduler(stage, optimizer)
    # device placement and (optional) distributed/fp16 wrapping
    # are delegated to ``process_components``
    model, criterion, optimizer, scheduler, device = process_components(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        distributed_params=experiment.distributed_params,
        device=device,
    )

    return model, criterion, optimizer, scheduler, device
def predict_loader(
    self,
    *,
    loader: DataLoader,
    model: Model = None,
    resume: str = None,
    fp16: Union[Dict, bool] = None,
    initial_seed: int = 42,
) -> Generator:
    """
    Runs model inference on PyTorch Dataloader and returns
    python generator with model predictions from `runner.predict_batch`.

    Cleans up the experiment info to avoid possible collisions.
    Sets `is_train_loader` and `is_valid_loader` to `False` while
    keeping `is_infer_loader` as True. Moves model to evaluation mode.

    Args:
        loader: loader to predict
        model: model to use for prediction
        resume: path to checkpoint to resume
        fp16 (Union[Dict, bool]): fp16 usage flag
        initial_seed: seed to use before prediction

    Yields:
        batches with model predictions
    """
    # ``fp16=True`` shorthand expands to the default apex opt level
    if isinstance(fp16, bool) and fp16:
        fp16 = {"opt_level": "O1"}

    if model is not None:
        self.model = model
    assert self.model is not None

    # optionally restore model weights from a checkpoint file
    if resume is not None:
        checkpoint = load_checkpoint(resume)
        unpack_checkpoint(checkpoint, model=self.model)

    # drop experiment info to avoid collisions with a previous run
    self.experiment = None
    set_global_seed(initial_seed)
    # device placement / distributed(fp16) wrapping of the model
    (model, _, _, _, device) = process_components(  # noqa: WPS122
        model=self.model, distributed_params=fp16, device=self.device,
    )
    self._prepare_inner_state(
        stage="infer",
        model=model,
        device=device,
        is_train_loader=False,
        is_valid_loader=False,
        is_infer_loader=True,
    )
    # switch to evaluation mode (recursively for model containers)
    maybe_recursive_call(self.model, "train", mode=False)

    # re-seed right before iteration so prediction is reproducible
    set_global_seed(initial_seed)
    for batch in loader:
        yield self.predict_batch(batch)
def main(args, _=None):
    """Run the ``catalyst-contrib image2embeddings`` script.

    Loads a (possibly traced) image encoder, runs it over the images
    listed in ``args.in_csv`` and saves the stacked feature matrix
    to ``args.out_npy``.

    Args:
        args: parsed script arguments (seed, model, csv, loader
            and output settings)
        _: unused positional placeholder kept for the script API
    """
    global IMG_SIZE

    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    # either load a pre-traced TorchScript model or build an encoder
    if args.traced_model is not None:
        device = get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)

    model = model.eval()
    model, _, _, _, device = process_components(model=model)

    df = pd.read_csv(args.in_csv)
    # idiomatic replacement for ``reset_index().drop("index", axis=1)``
    df = df.reset_index(drop=True)
    df = list(df.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", rootpath=args.rootpath,
    )

    dataloader = get_loader(
        df,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    # inference only — disable autograd bookkeeping
    with torch.no_grad():
        for batch in dataloader:
            batch_features = model(batch["image"].to(device))
            batch_features = batch_features.cpu().detach().numpy()
            features.append(batch_features)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
def on_stage_start(self, runner: "IRunner") -> None:
    """Event handler for stage start.

    For the `IStageBasedRunner` case:

    - prepares loaders - our datasources
    - prepares model components - model, criterion, optimizer, scheduler
    - prepares callbacks for the current stage

    Args:
        runner: IRunner instance.
    """
    super().on_stage_start(runner)
    # seed before each preparation step so that loader/model/callback
    # creation is reproducible independently of each other
    set_global_seed(self.experiment.initial_seed)
    loaders = self.experiment.get_loaders(stage=self.stage)
    loaders = validate_loaders(loaders)
    # self.loaders = loaders

    set_global_seed(self.experiment.initial_seed)
    model = self.experiment.get_model(self.stage)
    criterion = self.experiment.get_criterion(self.stage)
    optimizer = self.experiment.get_optimizer(self.stage, model)
    scheduler = self.experiment.get_scheduler(self.stage, optimizer)
    # device placement and (optional) distributed/fp16 wrapping
    model, criterion, optimizer, scheduler, device = process_components(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        distributed_params=self.experiment.distributed_params,
        device=self.device,
    )

    set_global_seed(self.experiment.initial_seed)
    callbacks = self.experiment.get_callbacks(self.stage)
    callbacks = filter_callbacks_by_node(callbacks)
    callbacks = sort_callbacks_by_order(callbacks)

    migrating_params = dict(**self.experiment.get_stage_params(self.stage))
    # NOTE(review): the "migrate_from_previous_stage" key stays inside
    # ``migrating_params`` and is later forwarded to
    # ``_prepare_inner_state`` via **kwargs — presumably intentional
    migrate_from_previous_stage = migrating_params.get(
        "migrate_from_previous_stage", True)
    # carry experiment-scoped callbacks over from the previous stage
    if (migrate_from_previous_stage
            and getattr(self, "callbacks", None) is not None):
        for key, value in self.callbacks.items():
            if value.scope == CallbackScope.experiment:
                callbacks[key] = value
    # re-sort after the migration may have added callbacks
    callbacks = sort_callbacks_by_order(callbacks)

    if migrate_from_previous_stage:
        # keep global counters and resume path across stage boundaries
        migrating_params.update({
            "global_epoch": getattr(self, "global_epoch", 1),
            "global_batch_step": getattr(self, "global_batch_step", 0),
            "global_sample_step": getattr(self, "global_sample_step", 0),
            "resume": getattr(self, "resume", None),
        })

    self._prepare_inner_state(
        stage=self.stage,
        model=model,
        device=device,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        callbacks=callbacks,
        loaders=loaders,
        **migrating_params,
    )
def main(args, _=None):
    """Run the ``catalyst-contrib text2embeddings`` script.

    Tokenizes texts from ``args.in_csv`` with a BERT tokenizer, runs
    them through a BERT model and stores pooled per-layer embeddings
    into ``.npy`` memmap files prefixed with ``args.out_prefix``.

    Args:
        args: parsed script arguments (model/tokenizer source, csv,
            pooling, batching and output settings)
        _: unused positional placeholder kept for the script API
    """
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")
    bert_level = args.bert_level

    # selecting a single BERT layer requires all hidden states as output
    if bert_level is not None:
        assert (args.output_hidden_states
                ), "You need hidden states output for level specification"

    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    # model + tokenizer either from the HuggingFace hub
    # or from local config/vocab files
    if getattr(args, "in_huggingface", False):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(
            args.in_huggingface, config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    # optionally restore fine-tuned weights from a checkpoint
    if getattr(args, "in_model", None) is not None:
        checkpoint = load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    # device placement / (optional) distributed wrapping
    model, _, _, _, device = process_components(model=model)

    df = pd.read_csv(args.in_csv)
    # drop rows without text and persist the filtered frame so row
    # indices match the saved embedding matrices
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch_input in enumerate(dataloader):
            batch_input = any2device(batch_input, device)
            batch_output = model(**batch_input)
            # optional attention mask for max-length-aware pooling
            mask = (batch_input["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            # config lives under ``.module`` when the model is DDP-wrapped
            if check_ddp_wrapped(model):
                # using several gpu
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states
            else:
                # using cpu or one gpu
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            batch_features = process_bert_output(
                bert_output=batch_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            # (memmaps are sized once, from the first batch's shapes)
            if idx == 0:
                for layer_name, layer_value in batch_features.items():
                    if bert_level is not None and bert_level != layer_name:
                        continue
                    layer_name = (layer_name
                                  if isinstance(layer_name, str)
                                  else f"{layer_name:02d}")
                    _, embedding_size = layer_value.shape
                    features[layer_name] = np.memmap(
                        f"{args.out_prefix}.{layer_name}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            # rows of the memmaps covered by this batch
            # (the last batch may be smaller than ``batch_size``)
            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for layer_name2, layer_value2 in batch_features.items():
                if bert_level is not None and bert_level != layer_name2:
                    continue
                layer_name2 = (layer_name2
                               if isinstance(layer_name2, str)
                               else f"{layer_name2:02d}")
                features[layer_name2][indices] = _detach(layer_value2)

    # optionally flush memmaps and additionally save plain .npy copies
    if args.force_save:
        for key, mmap in features.items():
            mmap.flush()
            np.save(f"{args.out_prefix}.{key}.force.npy",
                    mmap, allow_pickle=False)