Example #1
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp
    expdir = Path(args.expdir)

    # optuna objective
    def objective(trial: optuna.Trial):
        trial, trial_config = _process_trial_config(trial, config.copy())
        experiment, runner, trial_config = prepare_config_api_components(
            expdir=expdir, config=trial_config)
        # @TODO: here we need a better solution.
        experiment._trial = trial  # noqa: WPS437

        if experiment.logdir is not None and get_rank() <= 0:
            dump_environment(trial_config, experiment.logdir, args.configs)
            dump_code(args.expdir, experiment.logdir)

        runner.run_experiment(experiment)

        return runner.best_valid_metrics[runner.main_metric]

    # optuna direction
    direction = ("minimize" if config.get("stages", {}).get(
        "stage_params", {}).get("minimize_metric", True) else "maximize")

    # optuna study
    study_params = config.pop("study_params", {})

    # optuna sampler
    sampler_params = study_params.pop("sampler_params", {})
    optuna_sampler_type = sampler_params.pop("sampler", None)
    optuna_sampler = (
        optuna.samplers.__dict__[optuna_sampler_type](**sampler_params)
        if optuna_sampler_type is not None
        else None
    )

    # optuna pruner
    pruner_params = study_params.pop("pruner_params", {})
    optuna_pruner_type = pruner_params.pop("pruner", None)
    optuna_pruner = (
        optuna.pruners.__dict__[optuna_pruner_type](**pruner_params)
        if optuna_pruner_type is not None
        else None
    )

    study = optuna.create_study(
        direction=direction,
        storage=args.storage or study_params.pop("storage", None),
        study_name=args.study_name or study_params.pop("study_name", None),
        sampler=optuna_sampler,
        pruner=optuna_pruner,
    )
    study.optimize(
        objective,
        n_trials=args.n_trials,
        timeout=args.timeout,
        n_jobs=args.n_jobs or 1,
        gc_after_trial=args.gc_after_trial,
        show_progress_bar=args.show_progress_bar,
    )
Example #2
def main(args, unknown_args):
    """Runs the ``catalyst-dl tune`` script."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    # optuna objective
    def objective(trial: optuna.Trial):
        trial, trial_config = _process_trial_config(trial, config.copy())
        runner: ConfigRunner = get_config_runner(expdir=Path(args.expdir), config=trial_config)
        # @TODO: here we need a better solution.
        runner._trial = trial  # noqa: WPS437

        if get_rank() <= 0:
            dump_environment(logdir=runner.logdir, config=config, configs_path=args.configs)
            dump_code(expdir=args.expdir, logdir=runner.logdir)

        runner.run()

        return trial.best_score

    # optuna study
    study_params = config.pop("study", {})

    # optuna sampler
    sampler_params = study_params.pop("sampler", {})
    optuna_sampler_type = sampler_params.pop("_target_", None)
    optuna_sampler = (
        optuna.samplers.__dict__[optuna_sampler_type](**sampler_params)
        if optuna_sampler_type is not None
        else None
    )

    # optuna pruner
    pruner_params = study_params.pop("pruner", {})
    optuna_pruner_type = pruner_params.pop("_target_", None)
    optuna_pruner = (
        optuna.pruners.__dict__[optuna_pruner_type](**pruner_params)
        if optuna_pruner_type is not None
        else None
    )

    study = optuna.create_study(
        direction=args.direction or study_params.pop("direction", "minimize"),
        storage=args.storage or study_params.pop("storage", None),
        study_name=args.study_name or study_params.pop("study_name", None),
        sampler=optuna_sampler,
        pruner=optuna_pruner,
        **study_params,
    )
    study.optimize(
        objective,
        n_trials=args.n_trials,
        timeout=args.timeout,
        n_jobs=args.n_jobs or 1,
        gc_after_trial=args.gc_after_trial,
        show_progress_bar=args.show_progress_bar,
    )
Example #3
def config_main(args, unknown_args):
    """Yaml config catalyst-dl run entry point."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    runner: ConfigRunner = get_config_runner(expdir=args.expdir, config=config)

    if get_rank() <= 0:
        dump_environment(logdir=runner.logdir, config=config, configs_path=args.configs)
        dump_code(expdir=args.expdir, logdir=runner.logdir)

    runner.run()
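The `get_rank() <= 0` guard makes the environment and code dumps happen once, not once per worker process. Catalyst's helper behaves roughly like the sketch below; this is an assumption based on standard `torch.distributed` usage, where -1 signals a non-distributed run so single-process runs also pass the guard.

import torch.distributed as dist

def get_rank() -> int:
    # rank of the current process, or -1 outside a distributed run
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return -1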
Example #4
def main_worker(cfg: DictConfig):
    """Runs the main worker for Hydra-config model training."""
    set_global_seed(cfg.args.seed)
    prepare_cudnn(cfg.args.deterministic, cfg.args.benchmark)

    import_module(hydra.utils.to_absolute_path(cfg.args.expdir))

    experiment = hydra.utils.instantiate(cfg.experiment, cfg=cfg)
    runner = hydra.utils.instantiate(cfg.runner)

    if experiment.logdir is not None and get_rank() <= 0:
        dump_environment(cfg, experiment.logdir)
        dump_code(
            hydra.utils.to_absolute_path(cfg.args.expdir), experiment.logdir
        )

    runner.run_experiment(experiment)
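`hydra.utils.instantiate` builds an object from a config node whose `_target_` names an import path, which is how `cfg.experiment` and `cfg.runner` become live objects here. A minimal standalone sketch; the class and config keys are illustrative.

import hydra
from omegaconf import OmegaConf

class Runner:
    def __init__(self, logdir: str):
        self.logdir = logdir

cfg = OmegaConf.create({"runner": {"_target_": "__main__.Runner", "logdir": "./logs"}})
runner = hydra.utils.instantiate(cfg.runner)  # equivalent to Runner(logdir="./logs")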
Example #5
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp

    experiment, runner, config = prepare_config_api_components(
        expdir=Path(args.expdir), config=config
    )

    if experiment.logdir is not None and get_rank() <= 0:
        dump_environment(config, experiment.logdir, args.configs)
        dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
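The two `setdefault` lines deserve a note: they create the `distributed_params` section only if the config lacks it, then overwrite just the `apex` and `amp` keys, leaving any other keys in that section intact. A quick demonstration:

config = {"distributed_params": {"rank": 0}}
config.setdefault("distributed_params", {})["apex"] = False
config.setdefault("distributed_params", {})["amp"] = True
assert config == {"distributed_params": {"rank": 0, "apex": False, "amp": True}}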
Example #6
def main(args, _=None):
    """Run the ``catalyst-contrib image2embeddings`` script."""
    global IMG_SIZE

    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    if args.traced_model is not None:
        device = get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
        model = model.eval()
        model, _, _, _, device = process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", rootpath=args.rootpath
    )

    dataloader = get_loader(
        df,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            batch_features = model(batch["image"].to(device))
            batch_features = batch_features.cpu().detach().numpy()
            features.append(batch_features)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
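The `--traced-model` branch consumes a TorchScript file. One way such a file could be produced beforehand is sketched below; the tiny encoder and the input shape are stand-ins, not the script's `ResnetEncoder`.

import torch

encoder = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3),
    torch.nn.AdaptiveAvgPool2d(1),
    torch.nn.Flatten(),
).eval()
traced = torch.jit.trace(encoder, torch.randn(1, 3, 224, 224))
traced.save("traced_model.pth")  # later: torch.jit.load("traced_model.pth")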
Example #7
def main(cfg: DictConfig):
    """
    Hydra config catalyst-dl run entry point

    Args:
        cfg: (DictConfig) configuration

    """
    cfg = prepare_hydra_config(cfg)
    set_global_seed(cfg.args.seed)
    prepare_cudnn(cfg.args.deterministic, cfg.args.benchmark)

    import_module(hydra.utils.to_absolute_path(cfg.args.expdir))
    runner = hydra.utils.instantiate(cfg.runner, cfg=cfg)

    if get_rank() <= 0:
        dump_environment(logdir=runner.logdir, config=cfg)
        dump_code(expdir=hydra.utils.to_absolute_path(cfg.args.expdir),
                  logdir=runner.logdir)

    runner.run()
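Entry points like this `main` are typically wired up with the `hydra.main` decorator, which composes the config before calling the function. A sketch of the usual registration; the config path and name are assumptions.

import hydra
from omegaconf import DictConfig

@hydra.main(config_path="conf", config_name="config")
def main(cfg: DictConfig) -> None:
    ...

if __name__ == "__main__":
    main()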
Example #8
import argparse
import pandas as pd
from catalyst.dl.runner import SupervisedRunner
from catalyst.dl import utils
from catalyst.dl.callbacks import (EarlyStoppingCallback, CriterionCallback,
                                   OptimizerCallback, DiceCallback,
                                   CheckpointCallback)
# import torch
from torch import optim
from torch.utils.data import DataLoader
from dataloader import CloudDataset
import segmentation_models_pytorch as smp
from catalyst.utils.seed import set_global_seed
from catalyst.utils.torch import prepare_cudnn
set_global_seed(2019)
prepare_cudnn(deterministic=True)

parser = argparse.ArgumentParser("PyTorch Segmentation Pipeline")
args = parser.add_argument('-E', '--epochs', default=1, type=int)
args = parser.add_argument('-F', '--fold', default=1, type=int)
args = parser.add_argument('-C', '--checkpoint', default=False, type=bool)
args = parser.add_argument('-M', '--model', default='AlbuNet', type=str)
args = parser.add_argument('-A', '--encoder', default='resnet18', type=str)
args = parser.add_argument('-P', '--pretrained', default=True, type=bool)
args = parser.add_argument('--lr', default=1e-4, type=float)
args = parser.add_argument('--lr_e', default=1e-4, type=float)
args = parser.add_argument('--lr_d', default=1e-4, type=float)
args = parser.add_argument('--bs', default=4, type=int)
args = parser.add_argument('--size', default=320, type=int)
args = parser.add_argument('--dice-weight', default=0.5, type=float)
args = parser.parse_args()
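The original script passed `type=bool` for `--checkpoint` and `--pretrained`, a classic argparse pitfall: the type callable receives the raw string, and `bool("False")` is `True`. The `str2bool` helper above avoids this; a standalone check:

import argparse

assert bool("False") is True  # why type=bool misparses flags

def str2bool(value):
    return str(value).lower() in ("true", "1", "yes")

demo = argparse.ArgumentParser()
demo.add_argument("--pretrained", default=True, type=str2bool)
assert demo.parse_args(["--pretrained", "False"]).pretrained is False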
Example #9
def main(args, _=None):
    """Run the ``catalyst-contrib text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")
    bert_level = args.bert_level

    if bert_level is not None:
        assert args.output_hidden_states, (
            "You need hidden states output for level specification"
        )

    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    if getattr(args, "in_huggingface", False):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface,
                                          config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if getattr(args, "in_model", None) is not None:
        checkpoint = load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch_input in enumerate(dataloader):
            batch_input = any2device(batch_input, device)
            batch_output = model(**batch_input)
            mask = (batch_input["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            if check_ddp_wrapped(model):
                # several GPUs: the config lives on the wrapped module
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states
            else:
                # CPU or a single GPU
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            batch_features = process_bert_output(
                bert_output=batch_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for layer_name, layer_value in batch_features.items():
                    if bert_level is not None and bert_level != layer_name:
                        continue
                    layer_name = (
                        layer_name if isinstance(layer_name, str) else f"{layer_name:02d}"
                    )
                    _, embedding_size = layer_value.shape
                    features[layer_name] = np.memmap(
                        f"{args.out_prefix}.{layer_name}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(
                idx * batch_size, min((idx + 1) * batch_size, num_samples)
            )
            for layer_name2, layer_value2 in batch_features.items():
                if bert_level is not None and bert_level != layer_name2:
                    continue
                layer_name2 = (
                    layer_name2 if isinstance(layer_name2, str) else f"{layer_name2:02d}"
                )
                features[layer_name2][indices] = _detach(layer_value2)

    if args.force_save:
        for key, mmap in features.items():
            mmap.flush()
            np.save(f"{args.out_prefix}.{key}.force.npy",
                    mmap,
                    allow_pickle=False)