Beispiel #1
0
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    distributed_params = config.setdefault("distributed_params", {})
    distributed_params["apex"] = args.apex
    distributed_params["amp"] = args.amp
    expdir = Path(args.expdir)

    def objective(trial: optuna.trial):
        """Optuna objective: run one trial's experiment, return its metric."""
        trial, trial_config = _process_trial_config(trial, config.copy())
        experiment, runner, trial_config = utils.prepare_config_api_components(
            expdir=expdir, config=trial_config)
        # @TODO: here we need better solution.
        experiment._trial = trial  # noqa: WPS437

        # only the master process dumps environment/code snapshots
        if experiment.logdir is not None and utils.get_rank() <= 0:
            utils.dump_environment(trial_config, experiment.logdir,
                                   args.configs)
            utils.dump_code(args.expdir, experiment.logdir)

        runner.run_experiment(experiment)

        return runner.best_valid_metrics[runner.main_metric]

    # optimization direction comes from the stage config
    minimize_metric = (config.get("stages", {})
                       .get("stage_params", {})
                       .get("minimize_metric", True))
    direction = "minimize" if minimize_metric else "maximize"

    # build the optuna sampler from config, when one is specified
    sampler_params = config.pop("optuna_sampler_params", {})
    sampler_name = sampler_params.pop("sampler", None)
    optuna_sampler = None
    if sampler_name is not None:
        optuna_sampler = optuna.samplers.__dict__[sampler_name](
            **sampler_params)

    # build the optuna pruner from config, when one is specified
    pruner_params = config.pop("optuna_pruner_params", {})
    pruner_name = pruner_params.pop("pruner", None)
    optuna_pruner = None
    if pruner_name is not None:
        optuna_pruner = optuna.pruners.__dict__[pruner_name](**pruner_params)

    study = optuna.create_study(
        direction=direction,
        storage=args.storage,
        study_name=args.study_name,
        sampler=optuna_sampler,
        pruner=optuna_pruner,
    )
    study.optimize(
        objective,
        n_trials=args.n_trials,
        timeout=args.timeout,
        n_jobs=args.n_jobs or 1,
        gc_after_trial=args.gc_after_trial,
        show_progress_bar=args.show_progress_bar,
    )
Beispiel #2
0
def main_worker(args, unknown_args):
    """Build the experiment and runner from the config, then run training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    experiment_cls, runner_cls = utils.import_experiment_and_runner(
        Path(args.expdir))

    experiment = experiment_cls(config)
    runner = runner_cls(**config.get("runner_params", {}))

    # only the master process (rank <= 0) dumps environment/code snapshots
    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
Beispiel #3
0
def main(args, unknown_args):
    """Run the ``catalyst-dl run`` script"""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    experiment_cls, runner_cls = utils.import_experiment_and_runner(
        Path(args.expdir))

    # "runner_params" may be present but explicitly null in the config
    runner_kwargs = config.pop("runner_params", {}) or {}
    experiment = experiment_cls(config)
    runner = runner_cls(**runner_kwargs)

    if experiment.logdir is not None:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    # honor the optional "check" flag from the config's args section
    runner.run_experiment(
        experiment,
        check=safitty.get(config, "args", "check", default=False),
    )
Beispiel #4
0
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    distributed_params = config.setdefault("distributed_params", {})
    distributed_params["apex"] = args.apex
    distributed_params["amp"] = args.amp

    experiment, runner, config = utils.prepare_config_api_components(
        expdir=Path(args.expdir), config=config)

    # only the master process (rank <= 0) dumps environment/code snapshots
    if experiment.logdir is not None and utils.get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
Beispiel #5
0
def main(args, _=None):
    """Run the ``catalyst-data image2embeddings`` script."""
    global IMG_SIZE

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    # load either a pre-traced (jit) model or a fresh resnet encoder
    if args.traced_model is not None:
        device = utils.get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling).eval()
        model, _, _, _, device = utils.process_components(model=model)

    # read the csv and turn it into a list of per-row dicts
    dataframe = pd.read_csv(args.in_csv)
    dataframe = dataframe.reset_index().drop("index", axis=1)
    samples = list(dataframe.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col,
        output_key="image",
        rootpath=args.rootpath,
    )

    loader = utils.get_loader(
        samples,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )
    if args.verbose:
        loader = tqdm(loader)

    # run inference batch by batch and collect numpy embeddings
    embeddings = []
    with torch.no_grad():
        for batch in loader:
            batch_output = model(batch["image"].to(device))
            embeddings.append(batch_output.cpu().detach().numpy())

    np.save(args.out_npy, np.concatenate(embeddings, axis=0))
def main_worker(args, unknown_args):
    """@TODO: Docs. Contribution is welcome."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    experiment_fn, runner_fn = utils.import_experiment_and_runner(
        Path(args.expdir))
    # fall back to the registry when the expdir defines no experiment class
    if experiment_fn is None:
        experiment_name = (config.get("experiment_params", {})
                           .get("experiment", "Experiment"))
        experiment_fn = EXPERIMENTS.get(experiment_name)

    experiment = experiment_fn(config)
    runner = runner_fn(**config.get("runner_params", {}))

    # only the master process dumps environment/code snapshots
    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
def post_transforms():
    """ImageNet image normalization followed by conversion to ``torch.Tensor``."""
    return [
        A.Normalize(p=1.0),
        ToTensorV2(p=1.0),
    ]


if __name__ == "__main__":
    warnings.simplefilter("ignore", UserWarning)
    warnings.simplefilter("ignore", DeprecationWarning)
    warnings.filterwarnings('ignore')
    os.environ["PYTHONWARNINGS"] = "ignore"
    config = ConfigExperiment()
    config.size = EfficientNet.get_image_size(config.model_name)
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    utils.set_global_seed(config.seed)
    utils.prepare_cudnn(deterministic=True)


    train_transforms = plant.compose([
        pre_transforms(config.size),
        hard_transforms(),
        post_transforms()
    ])
    valid_transforms = plant.compose([
        pre_transforms(config.size),
        post_transforms()
    ])

    show_transforms = plant.compose([
        pre_transforms(config.size),
        hard_transforms()
Beispiel #8
0
def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    # pooling spec, e.g. "avg,max" -> ["avg", "max"]
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    # Build model + tokenizer either from a HuggingFace hub name
    # or from a local config/vocab pair.
    if hasattr(args, "in_huggingface"):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface,
                                          config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    # Optionally overlay fine-tuned weights from a checkpoint file.
    if hasattr(args, "in_model"):
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    # Drop rows with an empty text column and persist the filtered
    # dataframe so its row order matches the stored embeddings.
    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    # maps feature name -> np.memmap of shape (num_samples, embedding_size)
    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            # attention mask lets pooling ignore padding positions
            mask = (batch["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            if utils.check_ddp_wrapped(model):
                # using several gpu
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states

            else:
                # using cpu or one gpu
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            features_ = process_bert_output(
                bert_output=bert_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for key, value in features_.items():
                    # integer keys become zero-padded names, e.g. "03"
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    # disk-backed array: embeddings may not fit in RAM
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            # row range of this batch; last batch may be shorter
            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
Beispiel #9
0
def main(args, _=None):
    """Embed every text row of a csv with BERT and store pooled features.

    Writes the filtered dataframe to ``{out_prefix}.df.csv`` and one
    disk-backed ``np.memmap`` per feature (``class``, ``last`` or
    ``embeddings_XX``) to ``{out_prefix}.{name}.npy``.
    """
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    # pooling spec, e.g. "avg,max" -> ["avg", "max"]
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    # model from a local config, weights from a checkpoint file
    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    # drop rows with an empty text column; persist the filtered dataframe
    # so its row order matches the stored embeddings
    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    # feature name -> memmap; feature name -> pooling module
    features = {}
    poolings = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)

            # create storage based on network output
            if idx == 0:
                # pooled [CLS] output: features_[1] is (batch, hidden)
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )
                if args.output_hidden_states:
                    # one pooling + memmap per hidden layer
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # only the last hidden layer
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            # row range of this batch; last batch may be shorter
            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            features["class"][indices] = _detach(features_[1])
            if args.output_hidden_states:
                # all embeddings
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                # BUGFIX: was ``poolings[name_]`` -- ``name_`` is never
                # bound in this branch, so the first batch raised NameError.
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)
Beispiel #10
0
def main():
    """Train a DenseNet121 chest x-ray classifier with catalyst."""
    args = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    seed = 42
    utils.set_global_seed(seed)
    utils.prepare_cudnn(deterministic=True)
    num_classes = 14

    # datasets
    train_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.train_list,
        transform=transforms_train,
    )
    val_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.val_list,
        transform=transforms_val,
    )

    # data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
    )
    valid_loader = DataLoader(
        val_dataset,
        batch_size=2,
        shuffle=False,
        num_workers=args.num_workers,
    )
    loaders = {'train': train_loader, 'valid': valid_loader}

    # where model weights and logs are stored
    logdir = args.log_dir

    # model; wrap in DataParallel when more than one gpu id is given
    model = DenseNet121(num_classes)
    if len(args.gpus) > 1:
        model = nn.DataParallel(model)
    device = utils.get_device()
    runner = SupervisedRunner(device=device)

    optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0003)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.25, patience=2)

    # per-class positive weights to counter label imbalance
    pos_weights = [
        10, 100, 30, 8, 40, 40, 330, 140, 35, 155, 110, 250, 155, 200,
    ]
    criterion = BCEWithLogitsLoss(
        pos_weight=torch.Tensor(pos_weights).to(device))

    class_names = [
        'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
        'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
        'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia',
    ]

    runner.train(
        model=model,
        logdir=logdir,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        num_epochs=args.epochs,

        # We can specify the callbacks list for the experiment;
        # For this task, we will check AUC and accuracy
        callbacks=[
            AUCCallback(
                input_key="targets",
                output_key='logits',
                prefix='auc',
                class_names=class_names,
                num_classes=num_classes,
                activation='Sigmoid',
            ),
            AccuracyCallback(
                input_key="targets",
                output_key="logits",
                prefix="accuracy",
                accuracy_args=[1],
                num_classes=14,
                threshold=0.5,
                activation='Sigmoid',
            ),
        ],
        main_metric='auc/_mean',
        minimize_metric=False,
        verbose=True,
    )
Beispiel #11
0
    params = parser.parse_args()

    import torch
    from torch.utils.data import DataLoader
    from torchvision import transforms
    from catalyst.dl import SupervisedRunner
    from catalyst.dl.utils import set_global_seed, prepare_cudnn
    from catalyst.dl.callbacks import AccuracyCallback, AUCCallback, PrecisionRecallF1ScoreCallback

    from .dataset import BIOMETRY
    from .model import *
    from .transform import Normalize, ToTensor

    # Seed & CUDA deterministic
    set_global_seed(params.seed)
    prepare_cudnn(deterministic=params.deterministic)

    # Init custom transforms
    transform = transforms.Compose([
        Normalize(params.sample == 0),
        ToTensor(),
    ])

    # Init custom dataset
    data_dir = DIR_DATA_PROCESSED.joinpath('BIOMETRY')
    traindir = data_dir.joinpath('train').as_posix()
    validdir = data_dir.joinpath('valid').as_posix()
    train_dataset = BIOMETRY(traindir, transform=transform)
    valid_dataset = BIOMETRY(traindir, transform=transform)

    # Init data loaders