def run(logdir_suffix: str = '', device: str = None, check: bool = False) -> dict:
    """Seed everything, repack the parquet data as image archives, and train.

    Args:
        logdir_suffix: suffix appended to the ``./logs`` directory
        device: torch device string; autodetected when ``None``
        check: if ``True``, use the plain runner for a quick sanity pass

    Returns:
        dict with the ``runner`` and ``experiment`` that were used
    """
    device = device or utils.get_device()
    print(f"device: {device}")
    utils.set_global_seed(SEED)

    # repack the parquet tables into zipped image archives
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # wandb-logging runner for real runs, plain runner for sanity checks
    runner_cls = SupervisedWandbRunner if not check else SupervisedRunner
    class_names = list(output_classes.keys())
    runner = runner_cls(
        device=device,
        input_key="images",
        output_key=["features"] + ["logit_" + name for name in class_names],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(logdir='./logs' + logdir_suffix)
    runner.run_experiment(experiment, check=check)
    return {
        'runner': runner,
        'experiment': experiment,
    }
def run(name: str = None,
        config: dict = None,
        device: str = None,
        check: bool = False) -> dict:
    """Seed, convert the parquet data, and run a supervised experiment.

    Args:
        name: experiment name; defaults to ``experiment_name`` + timestamp
        config: experiment config; defaults to module-level ``experiment_config``
        device: torch device string; autodetected when ``None``
        check: if ``True``, run a quick sanity-check pass

    Returns:
        dict with ``runner``, ``experiment`` and the ``config`` used
    """
    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")
    utils.set_global_seed(SEED)
    # initialize weights & biases run name
    # NOTE(review): `name` is computed but never used below — presumably it
    # was meant to be injected into the monitoring config; confirm intent.
    # NOTE(review): the timestamp format `%Y-%m-%d-%S` skips hours/minutes;
    # looks like `%H-%M-%S` was intended — verify.
    name = name or '_'.join(
        filter(None, [experiment_name,
                      f"{datetime.datetime.now():%Y-%m-%d-%S}"]))
    # convert parquet to zip archives of images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)
    # run experiment
    runner = SupervisedRunner(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config)
    runner.run_experiment(experiment, check=check)
    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
def _run_epoch(self, loaders):
    """Run one epoch over every loader in ``loaders``.

    For training stages the valid loader must be present; for inference
    stages no train loader may be passed.  Each loader toggles train/eval
    mode and gradient computation according to its name prefix.
    """
    # @TODO: better solution with train/inference handling ?
    if not self.state.stage.startswith("infer"):
        assert self.state.valid_loader in loaders.keys(), \
            f"'{self.state.valid_loader}' " \
            f"should be in provided loaders: {list(loaders.keys())}"
    else:
        assert not any(x.startswith("train") for x in loaders.keys()), \
            "for inference no train loader should be passed"

    for loader_name, loader in loaders.items():
        self.state.loader_name = loader_name
        self.state.loader_len = len(loader)
        # gradients (and model train mode) only for "train*" loaders
        self.state.need_backward = loader_name.startswith("train")
        utils.maybe_recursive_call(
            self.model, "train", mode=self.state.need_backward)

        # DistributedSampler must be told the epoch to reshuffle correctly
        if isinstance(loader.sampler, DistributedSampler) \
                and loader_name.startswith("train"):
            loader.sampler.set_epoch(self.state.stage_epoch)

        # re-seed per epoch: reproducible, yet different shuffling each epoch
        utils.set_global_seed(self.experiment.initial_seed
                              + self.state.epoch + 1)
        self._run_event("loader_start")
        with torch.set_grad_enabled(self.state.need_backward):
            self._run_loader(loader)
        self._run_event("loader_end")
def run(config: dict = None,
        logdir_suffix: str = '',
        device: str = None,
        check: bool = False) -> dict:
    """Patch the config, convert the parquet data, and run training.

    Args:
        config: experiment config; defaults to module-level ``experiment_config``
        logdir_suffix: suffix appended to ``config['args']['logdir']``
        device: torch device string; autodetected when ``None``
        check: if ``True``, use the plain runner for a quick sanity pass

    Returns:
        dict with ``runner``, ``experiment`` and the ``config`` used
    """
    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")
    utils.set_global_seed(SEED)

    # inject run metadata into the config
    config['monitoring_params']['name'] = EXPERIMENT_NAME
    config['stages']['state_params']['checkpoint_data']['image_size'] = SIZE
    config['args']['logdir'] += logdir_suffix

    # repack the parquet tables into zipped image archives
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # wandb-logging runner for real runs, plain runner for sanity checks
    runner_cls = SupervisedWandbRunner if not check else SupervisedRunner
    class_names = list(output_classes.keys())
    runner = runner_cls(
        device=device,
        input_key="images",
        output_key=["logit_" + name for name in class_names],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config)
    runner.run_experiment(experiment, check=check)
    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
def _get_experiment_components(
    self, stage: str = None
) -> Tuple[_Model, _Criterion, _Optimizer, _Scheduler, torch.device]:
    """
    Inner method for children's classes for model specific initialization.
    As baseline, checks device support and puts model on it.

    Args:
        stage: stage name to build components for

    Returns:
        tuple of ``(model, criterion, optimizer, scheduler, device)``
    """
    # seed before model creation so weight init is reproducible
    utils.set_global_seed(self.experiment.initial_seed)
    model = self.experiment.get_model(stage)
    criterion, optimizer, scheduler = \
        self.experiment.get_experiment_components(model, stage)
    # move/wrap components for the available device and distributed setup
    model, criterion, optimizer, scheduler, device = \
        utils.process_components(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            distributed_params=self.experiment.distributed_params
        )
    return model, criterion, optimizer, scheduler, device
def run(name: str = None,
        config: dict = None,
        device: str = None,
        check: bool = False) -> dict:
    """Seed, convert the parquet data, and run a supervised experiment.

    Args:
        name: experiment name (currently only kept for interface parity)
        config: experiment config; defaults to module-level ``experiment_config``
        device: torch device string; autodetected when ``None``
        check: if ``True``, run a quick sanity-check pass

    Returns:
        dict with ``runner``, ``experiment`` and the ``config`` used
    """
    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")
    utils.set_global_seed(SEED)

    config['monitoring_params']['name'] = EXPERIMENT_NAME

    # repack the parquet tables into zipped image archives
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # run experiment
    class_names = list(output_classes.keys())
    runner = SupervisedRunner(
        device=device,
        input_key="images",
        output_key=["logit_" + name_ for name_ in class_names],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config)
    runner.run_experiment(experiment, check=check)
    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
def _prepare_for_stage(self, stage: str):
    """Run the parent stage preparation, then re-seed and attach loaders."""
    super()._prepare_for_stage(stage=stage)
    # @TODO: remove this trick — re-seed so loader creation is reproducible
    utils.set_global_seed(self.experiment.initial_seed)
    self.loaders = self.experiment.get_loaders(stage=stage)
def run(max_lr: float = 1e-1,
        steps_per_epoch: int = 1413,
        device: str = None,
        check: bool = False) -> tuple:
    """Train with a OneCycleLR schedule injected into the experiment config.

    Args:
        max_lr: peak learning rate for OneCycleLR
        steps_per_epoch: optimizer steps per epoch (dataset-size dependent)
        device: torch device string; autodetected when ``None``
        check: if ``True``, run a quick sanity-check pass

    Returns:
        tuple of ``(experiment, runner)``
    """
    # deepcopy so the shared module-level config is not mutated
    config = copy.deepcopy(experiment_config)
    device = device or utils.get_device()
    print(f"device: {device}")
    utils.set_global_seed(SEED)

    # convert parquet to zip archives of images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    config['monitoring_params']['name'] = EXPERIMENT_NAME
    config['stages']['state_params']['checkpoint_data']['image_size'] = SIZE

    # add scheduler to config
    config["stages"]["scheduler_params"] = {
        "scheduler": "OneCycleLR",
        "max_lr": max_lr,
        "epochs": config["stages"]["state_params"]["num_epochs"],
        "steps_per_epoch": steps_per_epoch,
        "div_factor": 200,
        "final_div_factor": 1e5,
    }

    experiment = Experiment(config)
    # run experiment
    runner = SupervisedWandbRunner(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )
    runner.run_experiment(experiment, check=check)
    # FIX: the annotation previously said ``-> dict`` while the function has
    # always returned this tuple; annotation corrected, behavior unchanged.
    return experiment, runner
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)
    # propagate mixed-precision flags into the config
    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp

    expdir = Path(args.expdir)

    # optuna objective: one trial == one full experiment run
    def objective(trial: optuna.trial):
        trial, trial_config = _process_trial_config(trial, config.copy())
        experiment, runner, trial_config = \
            utils.prepare_config_api_components(
                expdir=expdir, config=trial_config)
        # @TODO: here we need better solution.
        experiment._trial = trial  # noqa: WPS437
        # only the master process dumps environment/code
        if experiment.logdir is not None and utils.get_rank() <= 0:
            utils.dump_environment(trial_config, experiment.logdir,
                                   args.configs)
            utils.dump_code(args.expdir, experiment.logdir)
        runner.run_experiment(experiment)
        return runner.best_valid_metrics[runner.main_metric]

    # optuna direction follows the experiment's minimize_metric setting
    direction = ("minimize" if config.get("stages", {}).get(
        "stage_params", {}).get("minimize_metric", True) else "maximize")

    # optuna sampler, looked up by name in optuna.samplers
    sampler_params = config.pop("optuna_sampler_params", {})
    optuna_sampler_type = sampler_params.pop("sampler", None)
    optuna_sampler = (optuna.samplers.__dict__[optuna_sampler_type](
        **sampler_params) if optuna_sampler_type is not None else None)

    # optuna pruner, looked up by name in optuna.pruners
    pruner_params = config.pop("optuna_pruner_params", {})
    optuna_pruner_type = pruner_params.pop("pruner", None)
    optuna_pruner = (optuna.pruners.__dict__[optuna_pruner_type](
        **pruner_params) if optuna_pruner_type is not None else None)

    study = optuna.create_study(
        direction=direction,
        storage=args.storage,
        study_name=args.study_name,
        sampler=optuna_sampler,
        pruner=optuna_pruner,
    )
    study.optimize(
        objective,
        n_trials=args.n_trials,
        timeout=args.timeout,
        n_jobs=args.n_jobs or 1,
        gc_after_trial=args.gc_after_trial,
        show_progress_bar=args.show_progress_bar,
    )
def predict_loader(
    self,
    *,
    loader: DataLoader,
    model: Model = None,
    resume: str = None,
    fp16: Union[Dict, bool] = None,
    initial_seed: int = 42,
) -> Generator:
    """
    Runs model inference on PyTorch Dataloader and returns
    python generator with model predictions from `runner.predict_batch`.
    Cleans up the experiment info to avoid possible collisions.
    Sets `is_train_loader` and `is_valid_loader` to `False` while
    keeping `is_infer_loader` as True. Moves model to evaluation mode.

    Args:
        loader: loader to predict
        model: model to use for prediction
        resume: path to checkpoint to resume
        fp16 (Union[Dict, bool]): fp16 usage flag
        initial_seed: seed to use before prediction

    Yields:
        batches with model predictions
    """
    # ``fp16=True`` is shorthand for the default apex O1 opt level
    if isinstance(fp16, bool) and fp16:
        fp16 = {"opt_level": "O1"}

    if model is not None:
        self.model = model
    assert self.model is not None

    if resume is not None:
        checkpoint = utils.load_checkpoint(resume)
        utils.unpack_checkpoint(checkpoint, model=self.model)

    # drop stale experiment info to avoid collisions with a previous run
    self.experiment = None
    utils.set_global_seed(initial_seed)
    (model, _, _, _, device) = utils.process_components(  # noqa: WPS122
        model=self.model,
        distributed_params=fp16,
        device=self.device,
    )
    self._prepare_inner_state(
        stage="infer",
        model=model,
        device=device,
        is_train_loader=False,
        is_valid_loader=False,
        is_infer_loader=True,
    )
    # eval mode for the whole (possibly nested) model
    utils.maybe_recursive_call(self.model, "train", mode=False)

    # re-seed right before iteration so predictions are reproducible
    utils.set_global_seed(initial_seed)
    for batch in loader:
        yield self.predict_batch(batch)
def predict_loader(
    self,
    *,
    loader: DataLoader,
    model: Model = None,
    resume: str = None,
    fp16: Union[Dict, bool] = None,
    initial_seed: int = 42,
) -> Generator:
    """
    Runs model inference on PyTorch Dataloader and returns
    python Generator with model predictions from `runner.predict_batch`

    Args:
        loader (DataLoader): loader to predict
        model (Model): model to use for prediction
        resume (str): path to checkpoint to resume
        fp16 (Union[Dict, bool]): fp16 usage flag
        initial_seed (int): seed to use before prediction

    Yields:
        batches with model predictions
    """
    # ``fp16=True`` is shorthand for the default apex O1 opt level
    if isinstance(fp16, bool) and fp16:
        fp16 = {"opt_level": "O1"}

    if model is not None:
        self.model = model
    assert self.model is not None

    if resume is not None:
        checkpoint = utils.load_checkpoint(resume)
        utils.unpack_checkpoint(checkpoint, model=self.model)

    # wrap model/device for the requested fp16 / distributed setup
    (  # noqa: WPS122
        self.model,
        _,
        _,
        _,
        self.device,
    ) = utils.process_components(
        model=self.model,
        distributed_params=fp16,
        device=self.device,
    )

    # seed right before iteration so predictions are reproducible
    utils.set_global_seed(initial_seed)
    for batch in loader:
        yield self.predict_batch(batch)
def run(config: dict = None,
        model_filepath: str = None,
        logdir_suffix: str = '_' + EXPERIMENT_NAME,
        max_lr: float = 1e-1,
        steps_per_epoch: int = 1413,
        device: str = None,
        check: bool = False) -> dict:
    """Patch the config with a OneCycleLR schedule and run training.

    Args:
        config: experiment config; defaults to module-level ``experiment_config``
        model_filepath: optional model checkpoint path passed to ``Experiment``
        logdir_suffix: suffix appended to ``config['args']['logdir']``
        max_lr: peak learning rate for OneCycleLR
        steps_per_epoch: optimizer steps per epoch
        device: torch device string; autodetected when ``None``
        check: if ``True``, use the plain runner for a quick sanity pass

    Returns:
        dict with ``runner``, ``experiment`` and the ``config`` used
    """
    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")
    utils.set_global_seed(SEED)

    config['monitoring_params']['name'] = EXPERIMENT_NAME
    config['stages']['state_params']['checkpoint_data']['image_size'] = SIZE
    config['args']['logdir'] += logdir_suffix

    # convert parquet to zip archives of images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # add scheduler to config
    config["stages"]["scheduler_params"] = {
        "scheduler": "OneCycleLR",
        "max_lr": max_lr,
        "epochs": config["stages"]["state_params"]["num_epochs"],
        "steps_per_epoch": steps_per_epoch,
        "div_factor": 500,
        "final_div_factor": 1e5,
        "max_momentum": 0.999
    }

    # run experiment: wandb runner unless this is a sanity check
    RunnerClass = SupervisedRunner if check else SupervisedWandbRunner
    runner = RunnerClass(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config, model_filepath)
    runner.run_experiment(experiment, check=check)
    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
def main(args, unknown_args):
    """Run the ``catalyst-dl run`` script"""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    # dynamically import the user's Experiment/Runner classes from expdir
    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.pop("runner_params", {}) or {}

    experiment = Experiment(config)
    runner = Runner(**runner_params)

    if experiment.logdir is not None:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    # honour the optional args.check flag in the config
    check_run = safitty.get(config, "args", "check", default=False)
    runner.run_experiment(experiment, check=check_run)
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    # propagate mixed-precision flags into the config
    distributed_params = config.setdefault("distributed_params", {})
    distributed_params["apex"] = args.apex
    distributed_params["amp"] = args.amp

    experiment, runner, config = utils.prepare_config_api_components(
        expdir=Path(args.expdir), config=config
    )

    # only the master process dumps environment/code
    if experiment.logdir is not None and utils.get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
def main_worker(args, unknown_args):
    """Parse args/config, seed everything, and run the training experiment."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)
    # propagate the apex mixed-precision flag into the config
    config.setdefault("distributed_params", {})["apex"] = args.apex

    # dynamically import the user's Experiment/Runner classes from expdir
    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.get("runner_params", {})
    experiment = Experiment(config)
    runner = Runner(**runner_params)

    # only the master process dumps environment/code
    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
def main(args, _=None):
    """Run the ``catalyst-data image2embeddings`` script."""
    global IMG_SIZE

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    # module-level image size, consumed by the transform pipeline
    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    if args.traced_model is not None:
        # a traced (TorchScript) model is loaded as-is onto the device
        device = utils.get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
        model = model.eval()
        model, _, _, _, device = utils.process_components(model=model)

    # rows of the CSV become a list of dicts for the reader
    df = pd.read_csv(args.in_csv)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())

    open_fn = ImageReader(input_key=args.img_col,
                          output_key="image",
                          rootpath=args.rootpath)

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            batch_features = model(batch["image"].to(device))
            batch_features = batch_features.cpu().detach().numpy()
            features.append(batch_features)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
def _prepare_for_stage(self, stage: str):
    """Build components and a fresh ``RunnerState`` for ``stage``,
    migrating the global step/epoch counters from the previous stage."""
    utils.set_global_seed(self.experiment.initial_seed)
    migrating_params = {}
    if self.state is not None:
        # carry step/epoch over so counters keep increasing across stages
        migrating_params.update({
            "step": self.state.step,
            "epoch": self.state.epoch + 1
        })

    self.model, criterion, optimizer, scheduler, self.device = \
        self._get_experiment_components(stage)

    self.state = RunnerState(
        stage=stage,
        model=self.model,
        device=self.device,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        **self.experiment.get_state_params(stage),
        **migrating_params)

    # re-seed after component creation so the stage itself is reproducible
    utils.set_global_seed(self.experiment.initial_seed)
def main_worker(args, unknown_args):
    """@TODO: Docs. Contribution is welcome."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)
    # propagate the apex mixed-precision flag into the config
    config.setdefault("distributed_params", {})["apex"] = args.apex

    experiment_fn, runner_fn = utils.import_experiment_and_runner(
        Path(args.expdir))
    if experiment_fn is None:
        # fall back to the registry when expdir provides no Experiment class
        experiment_params = config.get("experiment_params", {})
        experiment = experiment_params.get("experiment", "Experiment")
        experiment_fn = EXPERIMENTS.get(experiment)

    runner_params = config.get("runner_params", {})
    experiment = experiment_fn(config)
    runner = runner_fn(**runner_params)

    # only the master process dumps environment/code
    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
parser.add_argument('--sample', default=0, type=int)
params = parser.parse_args()

import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from catalyst.dl import SupervisedRunner
from catalyst.dl.utils import set_global_seed, prepare_cudnn
from catalyst.dl.callbacks import AccuracyCallback, AUCCallback, PrecisionRecallF1ScoreCallback
from .dataset import BIOMETRY
from .model import *
from .transform import Normalize, ToTensor

# Seed & CUDA deterministic
set_global_seed(params.seed)
prepare_cudnn(deterministic=params.deterministic)

# Init custom transforms
transform = transforms.Compose([
    Normalize(params.sample == 0),
    ToTensor(),
])

# Init custom dataset
data_dir = DIR_DATA_PROCESSED.joinpath('BIOMETRY')
traindir = data_dir.joinpath('train').as_posix()
validdir = data_dir.joinpath('valid').as_posix()
train_dataset = BIOMETRY(traindir, transform=transform)
# NOTE(review): valid_dataset is built from ``traindir`` while ``validdir``
# is computed but never used — looks like a copy-paste bug; confirm intent.
valid_dataset = BIOMETRY(traindir, transform=transform)
def post_transforms(): # we use ImageNet image normalization # and convert it to torch.Tensor return [A.Normalize(p=1.0), ToTensorV2(p=1.0), ] if __name__ == "__main__": warnings.simplefilter("ignore", UserWarning) warnings.simplefilter("ignore", DeprecationWarning) warnings.filterwarnings('ignore') os.environ["PYTHONWARNINGS"] = "ignore" config = ConfigExperiment() config.size = EfficientNet.get_image_size(config.model_name) os.environ["CUDA_VISIBLE_DEVICES"] = "0" utils.set_global_seed(config.seed) utils.prepare_cudnn(deterministic=True) train_transforms = plant.compose([ pre_transforms(config.size), hard_transforms(), post_transforms() ]) valid_transforms = plant.compose([ pre_transforms(config.size), post_transforms() ]) show_transforms = plant.compose([ pre_transforms(config.size),
def _worker_init_fn(self, x):
    """Seed one DataLoader worker deterministically.

    A named method (not a lambda) so it stays picklable when
    ``num_workers > 0`` on Windows.
    """
    worker_seed = self.initial_seed + x
    set_global_seed(worker_seed)
def main():
    """Train DenseNet121 on ChestX-ray14 with Catalyst, tracking AUC/accuracy."""
    args = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    SEED = 42
    utils.set_global_seed(SEED)
    utils.prepare_cudnn(deterministic=True)
    num_classes = 14

    # define datasets
    train_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.train_list,
        transform=transforms_train,
    )
    val_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.val_list,
        transform=transforms_val,
    )

    loaders = {
        'train': DataLoader(train_dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=args.num_workers),
        'valid': DataLoader(val_dataset,
                            batch_size=2,
                            shuffle=False,
                            num_workers=args.num_workers)
    }

    logdir = args.log_dir  # where model weights and logs are stored

    # define model
    model = DenseNet121(num_classes)
    if len(args.gpus) > 1:
        model = nn.DataParallel(model)
    device = utils.get_device()
    runner = SupervisedRunner(device=device)

    optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0003)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=2)
    # per-class positive weights to counter heavy label imbalance
    weights = torch.Tensor(
        [10, 100, 30, 8, 40, 40, 330, 140, 35, 155, 110, 250, 155,
         200]).to(device)
    criterion = BCEWithLogitsLoss(pos_weight=weights)

    class_names = [
        'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
        'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
        'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia'
    ]

    runner.train(
        model=model,
        logdir=logdir,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        num_epochs=args.epochs,
        # We can specify the callbacks list for the experiment;
        # For this task, we will check AUC and accuracy
        callbacks=[
            AUCCallback(
                input_key="targets",
                output_key='logits',
                prefix='auc',
                class_names=class_names,
                num_classes=num_classes,
                activation='Sigmoid',
            ),
            AccuracyCallback(
                input_key="targets",
                output_key="logits",
                prefix="accuracy",
                accuracy_args=[1],
                num_classes=14,
                threshold=0.5,
                activation='Sigmoid',
            ),
        ],
        main_metric='auc/_mean',
        minimize_metric=False,
        verbose=True,
    )
def _prepare_for_stage(self, stage: str):
    """Run the parent stage preparation, then re-seed and attach loaders
    onto the runner state."""
    super()._prepare_for_stage(stage=stage)
    # re-seed so loader creation is deterministic per stage
    utils.set_global_seed(self.experiment.initial_seed)
    self.state.loaders = self.experiment.get_loaders(stage=stage)
def get_loaders(self, stage: str) -> "OrderedDict[str, DataLoader]":
    """Build and return the ``DataLoader``s for a given stage.

    Reads ``data_params`` from the stage config; supports per-loader
    overrides via ``loaders_params``, optional per-GPU batch/worker
    scaling, and distributed sampling.

    Args:
        stage: stage name to build loaders for

    Returns:
        OrderedDict mapping loader name to its DataLoader

    Raises:
        ValueError: if ``batch_sampler`` is combined with distributed mode
        NotImplementedError: if a dataset spec is neither Dataset nor dict
    """
    data_params = dict(self.stages_config[stage]["data_params"])
    batch_size = data_params.pop("batch_size", 1)
    num_workers = data_params.pop("num_workers")
    drop_last = data_params.pop("drop_last", False)
    per_gpu_scaling = data_params.pop("per_gpu_scaling", False)
    # BUGFIX: pop ``loaders_params`` *before* forwarding the remaining
    # data_params to ``get_datasets`` — previously the loader overrides
    # leaked into the dataset factory as an unexpected keyword argument.
    overridden_loaders_params = data_params.pop("loaders_params", {})
    assert isinstance(overridden_loaders_params, dict), \
        f"{overridden_loaders_params} should be Dict"

    distributed_rank = self.distributed_params.get("rank", -1)
    distributed = distributed_rank > -1

    datasets = self.get_datasets(stage=stage, **data_params)

    loaders = OrderedDict()
    for name, ds_ in datasets.items():
        assert isinstance(ds_, (Dataset, dict)), \
            f"{ds_} should be Dataset or Dict"
        # per-loader overrides take precedence over stage-wide values
        overridden_loader_params = overridden_loaders_params.pop(name, {})
        assert isinstance(overridden_loader_params, dict), \
            f"{overridden_loader_params} should be Dict"
        batch_size = overridden_loader_params.pop("batch_size", batch_size)
        num_workers = overridden_loader_params.\
            pop("num_workers", num_workers)

        # in non-distributed mode, scale batch/workers by visible GPU count
        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **overridden_loader_params
        }

        if isinstance(ds_, Dataset):
            loader_params["dataset"] = ds_
        elif isinstance(ds_, dict):
            assert "dataset" in ds_, \
                "You need to specify dataset for dataloader"
            loader_params = utils.merge_dicts(ds_, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            sampler = loader_params.get("sampler")
            if sampler is not None:
                assert isinstance(sampler, DistributedSampler)
            else:
                loader_params["sampler"] = DistributedSampler(
                    dataset=loader_params["dataset"])

        # shuffle only the train loader, and only without a custom sampler
        loader_params["shuffle"] = (
            name.startswith("train")
            and loader_params.get("sampler") is None)

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError("batch_sampler option is mutually "
                                 "exclusive with distributed")
            # these options conflict with an explicit batch_sampler
            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            # seed every worker deterministically
            loader_params["worker_init_fn"] = \
                lambda x: utils.set_global_seed(self.initial_seed + x)

        loaders[name] = DataLoader(**loader_params)

    return loaders
from catalyst.dl import utils SEED = 42 utils.set_global_seed(SEED) utils.prepare_cudnn(deterministic=True) import its_training_utils as tu import numpy as np import pandas as pd from datetime import datetime import torch from torch import nn import os import json from sklearn.model_selection import train_test_split import cv2 from collections import OrderedDict from catalyst import dl from catalyst.core import Callback, CallbackOrder from catalyst.dl.callbacks import AccuracyCallback, CheckpointCallback, AUCCallback, CriterionCallback, MetricAggregationCallback, MeterMetricsCallback, VerboseLogger, SchedulerCallback, OptimizerCallback, MixupCallback from catalyst.dl.callbacks.metrics.iou import IouCallback from catalyst.utils.checkpoint import load_checkpoint, unpack_checkpoint from albumentations.pytorch.transforms import ToTensor import base64 from tqdm import tqdm import torchvision.models as models from catalyst.contrib.nn.optimizers.radam import RAdam from catalyst.contrib.nn.optimizers.lookahead import Lookahead from sklearn.model_selection import KFold, StratifiedKFold from torch.utils.data import Dataset, DataLoader import albumentations as albu from efficientnet_pytorch import EfficientNet
def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    if hasattr(args, "in_huggingface"):
        # load pretrained weights + tokenizer straight from the hub name
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface,
                                          config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        # build from a local config/vocab; weights come from in_model below
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if hasattr(args, "in_model"):
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    # drop rows without text, persist the filtered frame, then list-of-dicts
    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            mask = (batch["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            if utils.check_ddp_wrapped(model):
                # using several gpu: config lives on the wrapped module
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states
            else:
                # using cpu or one gpu
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            features_ = process_bert_output(
                bert_output=bert_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for key, value in features_.items():
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            # write this batch's rows into the memmapped arrays
            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
def main(args, _=None):
    """Extract BERT-based text embeddings for a CSV of texts and store them
    as ``np.memmap`` arrays, one file per embedding kind.

    Args:
        args: parsed CLI arguments (paths, model/vocab, batching, pooling)
        _: unused placeholder kept for the script-runner interface
    """
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    # drop rows without text, persist the filtered frame, then list-of-dicts
    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    poolings = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)

            # create storage based on network output
            if idx == 0:
                # [CLS]-pooled output
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )
                if args.output_hidden_states:
                    # one storage + pooling per hidden layer
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # last hidden state only
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            # write this batch's rows into the memmapped arrays
            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            features["class"][indices] = _detach(features_[1])
            if args.output_hidden_states:
                # all hidden-layer embeddings
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                # BUGFIX: use the "last" pooling explicitly — ``name_`` was
                # undefined (or stale) on this branch and raised NameError
                # or silently pooled with the wrong layer's module.
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)