def main_worker(args, unknown_args): """Runs main worker thread from model training.""" args, config = utils.parse_args_uargs(args, unknown_args) utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) config.setdefault("distributed_params", {})["apex"] = args.apex config.setdefault("distributed_params", {})["amp"] = args.amp expdir = Path(args.expdir) # optuna objective def objective(trial: optuna.trial): trial, trial_config = _process_trial_config(trial, config.copy()) experiment, runner, trial_config = utils.prepare_config_api_components( expdir=expdir, config=trial_config) # @TODO: here we need better solution. experiment._trial = trial # noqa: WPS437 if experiment.logdir is not None and utils.get_rank() <= 0: utils.dump_environment(trial_config, experiment.logdir, args.configs) utils.dump_code(args.expdir, experiment.logdir) runner.run_experiment(experiment) return runner.best_valid_metrics[runner.main_metric] # optuna direction direction = ("minimize" if config.get("stages", {}).get( "stage_params", {}).get("minimize_metric", True) else "maximize") # optuna sampler sampler_params = config.pop("optuna_sampler_params", {}) optuna_sampler_type = sampler_params.pop("sampler", None) optuna_sampler = (optuna.samplers.__dict__[optuna_sampler_type]( **sampler_params) if optuna_sampler_type is not None else None) # optuna pruner pruner_params = config.pop("optuna_pruner_params", {}) optuna_pruner_type = pruner_params.pop("pruner", None) optuna_pruner = (optuna.pruners.__dict__[optuna_pruner_type]( **pruner_params) if optuna_pruner_type is not None else None) study = optuna.create_study( direction=direction, storage=args.storage, study_name=args.study_name, sampler=optuna_sampler, pruner=optuna_pruner, ) study.optimize( objective, n_trials=args.n_trials, timeout=args.timeout, n_jobs=args.n_jobs or 1, gc_after_trial=args.gc_after_trial, show_progress_bar=args.show_progress_bar, )
def main_worker(args, unknown_args):
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.get("runner_params", {})
    experiment = Experiment(config)
    runner = Runner(**runner_params)

    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)

def main(args, unknown_args):
    """Run the ``catalyst-dl run`` script."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.pop("runner_params", {}) or {}
    experiment = Experiment(config)
    runner = Runner(**runner_params)

    if experiment.logdir is not None:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    check_run = safitty.get(config, "args", "check", default=False)
    runner.run_experiment(experiment, check=check_run)

def main_worker(args, unknown_args): """Runs main worker thread from model training.""" args, config = utils.parse_args_uargs(args, unknown_args) utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) config.setdefault("distributed_params", {})["apex"] = args.apex config.setdefault("distributed_params", {})["amp"] = args.amp experiment, runner, config = utils.prepare_config_api_components( expdir=Path(args.expdir), config=config ) if experiment.logdir is not None and utils.get_rank() <= 0: utils.dump_environment(config, experiment.logdir, args.configs) utils.dump_code(args.expdir, experiment.logdir) runner.run_experiment(experiment)
def main(args, _=None): """Run the ``catalyst-data image2embeddings`` script.""" global IMG_SIZE utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) IMG_SIZE = (args.img_size, args.img_size) # noqa: WPS442 if args.traced_model is not None: device = utils.get_device() model = torch.jit.load(str(args.traced_model), map_location=device) else: model = ResnetEncoder(arch=args.arch, pooling=args.pooling) model = model.eval() model, _, _, _, device = utils.process_components(model=model) df = pd.read_csv(args.in_csv) df = df.reset_index().drop("index", axis=1) df = list(df.to_dict("index").values()) open_fn = ImageReader(input_key=args.img_col, output_key="image", rootpath=args.rootpath) dataloader = utils.get_loader( df, open_fn, batch_size=args.batch_size, num_workers=args.num_workers, dict_transform=dict_transformer, ) features = [] dataloader = tqdm(dataloader) if args.verbose else dataloader with torch.no_grad(): for batch in dataloader: batch_features = model(batch["image"].to(device)) batch_features = batch_features.cpu().detach().numpy() features.append(batch_features) features = np.concatenate(features, axis=0) np.save(args.out_npy, features)
def main_worker(args, unknown_args): """@TODO: Docs. Contribution is welcome.""" args, config = utils.parse_args_uargs(args, unknown_args) utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) config.setdefault("distributed_params", {})["apex"] = args.apex experiment_fn, runner_fn = utils.import_experiment_and_runner( Path(args.expdir)) if experiment_fn is None: experiment_params = config.get("experiment_params", {}) experiment = experiment_params.get("experiment", "Experiment") experiment_fn = EXPERIMENTS.get(experiment) runner_params = config.get("runner_params", {}) experiment = experiment_fn(config) runner = runner_fn(**runner_params) if experiment.logdir is not None and get_rank() <= 0: utils.dump_environment(config, experiment.logdir, args.configs) utils.dump_code(args.expdir, experiment.logdir) runner.run_experiment(experiment)
def post_transforms():
    # we use ImageNet image normalization
    # and convert it to torch.Tensor
    return [A.Normalize(p=1.0), ToTensorV2(p=1.0)]


if __name__ == "__main__":
    warnings.simplefilter("ignore", UserWarning)
    warnings.simplefilter("ignore", DeprecationWarning)
    warnings.filterwarnings("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

    config = ConfigExperiment()
    config.size = EfficientNet.get_image_size(config.model_name)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    utils.set_global_seed(config.seed)
    utils.prepare_cudnn(deterministic=True)

    train_transforms = plant.compose(
        [pre_transforms(config.size), hard_transforms(), post_transforms()]
    )
    valid_transforms = plant.compose(
        [pre_transforms(config.size), post_transforms()]
    )
    show_transforms = plant.compose(
        [pre_transforms(config.size), hard_transforms()]
    )

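`plant.compose` is not defined in this fragment; a minimal sketch of the usual helper from the Catalyst tutorials (an assumption about this particular codebase), which flattens the nested lists of transforms into one albumentations pipeline:

def compose(transforms_to_compose):
    # merge lists of augmentations into a single albumentations pipeline
    return A.Compose(
        [item for sublist in transforms_to_compose for item in sublist]
    )
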
def main(args, _=None): """Run the ``catalyst-data text2embeddings`` script.""" batch_size = args.batch_size num_workers = args.num_workers max_length = args.max_length pooling_groups = args.pooling.split(",") utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) if hasattr(args, "in_huggingface"): model_config = BertConfig.from_pretrained(args.in_huggingface) model_config.output_hidden_states = args.output_hidden_states model = BertModel.from_pretrained(args.in_huggingface, config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_huggingface) else: model_config = BertConfig.from_pretrained(args.in_config) model_config.output_hidden_states = args.output_hidden_states model = BertModel(config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_vocab) if hasattr(args, "in_model"): checkpoint = utils.load_checkpoint(args.in_model) checkpoint = {"model_state_dict": checkpoint} utils.unpack_checkpoint(checkpoint=checkpoint, model=model) model = model.eval() model, _, _, _, device = utils.process_components(model=model) df = pd.read_csv(args.in_csv) df = df.dropna(subset=[args.txt_col]) df.to_csv(f"{args.out_prefix}.df.csv", index=False) df = df.reset_index().drop("index", axis=1) df = list(df.to_dict("index").values()) num_samples = len(df) open_fn = LambdaReader( input_key=args.txt_col, output_key=None, lambda_fn=partial( tokenize_text, strip=args.strip, lowercase=args.lowercase, remove_punctuation=args.remove_punctuation, ), tokenizer=tokenizer, max_length=max_length, ) dataloader = utils.get_loader( df, open_fn, batch_size=batch_size, num_workers=num_workers, ) features = {} dataloader = tqdm(dataloader) if args.verbose else dataloader with torch.no_grad(): for idx, batch in enumerate(dataloader): batch = utils.any2device(batch, device) bert_output = model(**batch) mask = (batch["attention_mask"].unsqueeze(-1) if args.mask_for_max_length else None) if utils.check_ddp_wrapped(model): # using several gpu hidden_size = model.module.config.hidden_size hidden_states = model.module.config.output_hidden_states else: # using cpu or one gpu hidden_size = model.config.hidden_size hidden_states = model.config.output_hidden_states features_ = process_bert_output( bert_output=bert_output, hidden_size=hidden_size, output_hidden_states=hidden_states, pooling_groups=pooling_groups, mask=mask, ) # create storage based on network output if idx == 0: for key, value in features_.items(): name_ = key if isinstance(key, str) else f"{key:02d}" _, embedding_size = value.shape features[name_] = np.memmap( f"{args.out_prefix}.{name_}.npy", dtype=np.float32, mode="w+", shape=(num_samples, embedding_size), ) indices = np.arange(idx * batch_size, min((idx + 1) * batch_size, num_samples)) for key, value in features_.items(): name_ = key if isinstance(key, str) else f"{key:02d}" features[name_][indices] = _detach(value)
def main(args, _=None):
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df, open_fn, batch_size=batch_size, num_workers=num_workers
    )

    features = {}
    poolings = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)

            # create storage based on network output
            if idx == 0:
                # pooled [CLS] output
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )
                if args.output_hidden_states:
                    # all hidden layers
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # last hidden layer only
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(
                idx * batch_size, min((idx + 1) * batch_size, num_samples)
            )
            features["class"][indices] = _detach(features_[1])
            if args.output_hidden_states:
                # all hidden layers
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                # pool the last hidden layer
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)

def main():
    args = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    SEED = 42
    utils.set_global_seed(SEED)
    utils.prepare_cudnn(deterministic=True)
    num_classes = 14

    # define datasets
    train_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.train_list,
        transform=transforms_train,
    )
    val_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.val_list,
        transform=transforms_val,
    )

    loaders = {
        "train": DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
        ),
        "valid": DataLoader(
            val_dataset,
            batch_size=2,
            shuffle=False,
            num_workers=args.num_workers,
        ),
    }

    logdir = args.log_dir  # where model weights and logs are stored

    # define model
    model = DenseNet121(num_classes)
    if len(args.gpus) > 1:
        model = nn.DataParallel(model)
    device = utils.get_device()
    runner = SupervisedRunner(device=device)

    optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0003)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.25, patience=2
    )

    # per-class positive weights to counter class imbalance
    weights = torch.Tensor(
        [10, 100, 30, 8, 40, 40, 330, 140, 35, 155, 110, 250, 155, 200]
    ).to(device)
    criterion = BCEWithLogitsLoss(pos_weight=weights)

    class_names = [
        "Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass",
        "Nodule", "Pneumonia", "Pneumothorax", "Consolidation", "Edema",
        "Emphysema", "Fibrosis", "Pleural_Thickening", "Hernia",
    ]

    runner.train(
        model=model,
        logdir=logdir,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        num_epochs=args.epochs,
        # We can specify the callbacks list for the experiment;
        # for this task, we will check AUC and accuracy
        callbacks=[
            AUCCallback(
                input_key="targets",
                output_key="logits",
                prefix="auc",
                class_names=class_names,
                num_classes=num_classes,
                activation="Sigmoid",
            ),
            AccuracyCallback(
                input_key="targets",
                output_key="logits",
                prefix="accuracy",
                accuracy_args=[1],
                num_classes=num_classes,
                threshold=0.5,
                activation="Sigmoid",
            ),
        ],
        main_metric="auc/_mean",
        minimize_metric=False,
        verbose=True,
    )

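The `pos_weight` tensor scales only the positive term of the per-class binary cross-entropy, which is how the rare findings (e.g. Pneumonia at 330) are up-weighted; a minimal demonstration:

import torch
from torch.nn import BCEWithLogitsLoss

logits = torch.zeros(1, 2)            # two classes, raw scores
targets = torch.tensor([[1.0, 1.0]])  # both classes positive
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor([1.0, 10.0]), reduction="none")
print(criterion(logits, targets))     # the second class's loss is 10x the first
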
params = parser.parse_args()

import torch
from torch.utils.data import DataLoader
from torchvision import transforms

from catalyst.dl import SupervisedRunner
from catalyst.dl.utils import set_global_seed, prepare_cudnn
from catalyst.dl.callbacks import (
    AccuracyCallback,
    AUCCallback,
    PrecisionRecallF1ScoreCallback,
)

from .dataset import BIOMETRY
from .model import *
from .transform import Normalize, ToTensor

# Seed & cuDNN determinism
set_global_seed(params.seed)
prepare_cudnn(deterministic=params.deterministic)

# Init custom transforms
transform = transforms.Compose([
    Normalize(params.sample == 0),
    ToTensor(),
])

# Init custom dataset
data_dir = DIR_DATA_PROCESSED.joinpath("BIOMETRY")
traindir = data_dir.joinpath("train").as_posix()
validdir = data_dir.joinpath("valid").as_posix()
train_dataset = BIOMETRY(traindir, transform=transform)
valid_dataset = BIOMETRY(validdir, transform=transform)

# Init data loaders
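The fragment breaks off at `# Init data loaders`; a minimal sketch of the step that comment announces, assuming hypothetical `params.batch_size` and `params.num_workers` attributes:

loaders = {
    "train": DataLoader(
        train_dataset,
        batch_size=params.batch_size,    # assumed CLI argument
        shuffle=True,
        num_workers=params.num_workers,  # assumed CLI argument
    ),
    "valid": DataLoader(
        valid_dataset,
        batch_size=params.batch_size,
        shuffle=False,
        num_workers=params.num_workers,
    ),
}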