Example #1
def run(
    seed=543,
    data_path="/tmp/cifar10",
    output_path="/tmp/output-cifar10/",
    model="resnet18_QAT_8b",
    batch_size=512,
    momentum=0.9,
    weight_decay=1e-4,
    num_workers=12,
    num_epochs=24,
    learning_rate=0.4,
    num_warmup_epochs=4,
    validate_every=3,
    checkpoint_every=1000,
    backend=None,
    resume_from=None,
    log_every_iters=15,
    nproc_per_node=None,
    with_clearml=False,
    with_amp=False,
    **spawn_kwargs,
):
    """Main entry to train an model on CIFAR10 dataset.

    Args:
        seed (int): random state seed to set. Default, 543.
        data_path (str): input dataset path. Default, "/tmp/cifar10".
        output_path (str): output path. Default, "/tmp/output-cifar10".
        model (str): model name (from torchvision) to setup model to train. Default, "resnet18_QAT_8b".
        batch_size (int): total batch size. Default, 512.
        momentum (float): optimizer's momentum. Default, 0.9.
        weight_decay (float): weight decay. Default, 1e-4.
        num_workers (int): number of workers in the data loader. Default, 12.
        num_epochs (int): number of epochs to train the model. Default, 24.
        learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 0.4.
        num_warmup_epochs (int): number of warm-up epochs before learning rate decay. Default, 4.
        validate_every (int): run model's validation every ``validate_every`` epochs. Default, 3.
        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 1000.
        backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl", "xla-tpu",
            "gloo" etc. Default, None.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful
            when the main Python process is spawning training as child processes.
        resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None.
        log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations.
            It can be 0 to disable it. Default, 15.
        with_clearml (bool): if True, experiment ClearML logger is setup. Default, False.
        with_amp (bool): if True, enables native automatic mixed precision. Default, False.
        **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes

    """
    # catch all local parameters
    config = locals()
    config.update(config["spawn_kwargs"])
    del config["spawn_kwargs"]

    spawn_kwargs["nproc_per_node"] = nproc_per_node

    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:

        parallel.run(training, config)
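A minimal invocation sketch (not part of the original example) for the entry point above; `training` is assumed to be defined elsewhere in the script, and the NCCL variant assumes at least 2 visible CUDA devices.
if __name__ == "__main__":
    # Single-process, non-distributed run with the default arguments:
    run(backend=None)

    # Alternative (hypothetical): spawn 2 child processes on one node with the
    # NCCL backend; idist.Parallel handles the process group setup.
    # run(backend="nccl", nproc_per_node=2)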
Example #2
def run_evaluation(config_filepath, backend="nccl", with_clearml=True):
    """Main entry to run model's evaluation:
        - compute validation metrics

    Args:
        config_filepath (str): evaluation configuration .py file
        backend (str): distributed backend: nccl, gloo, horovod or None to run without distributed config
        with_clearml (bool): if True, uses ClearML as experiment tracking system
    """
    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled
    torch.backends.cudnn.benchmark = True

    config_filepath = Path(config_filepath)
    assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found"

    with idist.Parallel(backend=backend) as parallel:
        logger = setup_logger(name="Pascal-VOC12 Evaluation", distributed_rank=idist.get_rank())

        config = ConfigObject(config_filepath)
        InferenceConfigSchema.validate(config)
        config.script_filepath = Path(__file__)

        output_path = setup_experiment_tracking(config, with_clearml=with_clearml, task_type="testing")
        config.output_path = output_path

        utils.log_basic_info(logger, get_params(config, InferenceConfigSchema))

        try:
            parallel.run(evaluation, config, logger=logger, with_clearml=with_clearml)
        except KeyboardInterrupt:
            logger.info("Catched KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
Example #3
def test_idist_parallel_no_dist():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with idist.Parallel(backend=None) as parallel:
        parallel.run(_test_func,
                     ws=1,
                     device=device,
                     backend=None,
                     true_init_method=None)
Example #4
def test_idist_parallel_n_procs_native(init_method, backend, get_fixed_dirname, local_rank, world_size):
    if init_method == "FILE":
        init_method = f"file://{get_fixed_dirname('idist_parallel_n_procs_native')}/shared"

    os.environ["RANK"] = str(local_rank)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with idist.Parallel(backend=backend, init_method=init_method) as parallel:
        parallel.run(_test_func, ws=world_size, device=device, backend=backend, true_init_method=init_method)
Example #5
def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname):
    if init_method == "FILE":
        init_method = f"file://{dirname}/shared"

    nproc_per_node = torch.cuda.device_count() if torch.cuda.is_available() else 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node, init_method=init_method) as parallel:
        parallel.run(_test_func, ws=nproc_per_node, device=device, backend=backend, true_init_method=init_method)
Example #6
def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname):
    if init_method == "FILE":
        init_method = f"file://{dirname}/shared"

    nproc_per_node = 4 if "gloo" == backend else torch.cuda.device_count()
    device = "cpu" if "gloo" == backend else "cuda"
    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node, init_method=init_method) as parallel:
        parallel.run(_test_func, ws=nproc_per_node, device=device, backend=backend, true_init_method=init_method)
Example #7
def test_parallel_wrong_inputs():
    with pytest.raises(ValueError,
                       match=r"Unknown backend 'abc'. Available backends:"):
        idist.Parallel(backend="abc")

    with pytest.raises(
            ValueError,
            match=r"If backend is None, argument 'nnodes' should be also None"
    ):
        idist.Parallel(nnodes=2)

    with pytest.raises(ValueError,
                       match=r"Argument nproc_per_node should positive"):
        idist.Parallel(backend="gloo", nproc_per_node=-1)

    with pytest.raises(ValueError, match=r"Argument nnodes should positive"):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=-1)

    with pytest.raises(ValueError,
                       match=r"If number of nodes larger than one"):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=2)

    with pytest.raises(ValueError,
                       match=r"Argument node_rank should be between 0 and"):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=2, node_rank=2)

    with pytest.raises(
            ValueError,
            match=
            r"If number of nodes larger than one, arguments master_addr and master_port"
    ):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=2, node_rank=1)
Example #8
def test_idist_parallel_spawn_params_xla():

    res = idist.Parallel._setup_spawn_params(
        nproc_per_node=8, nnodes=None, node_rank=None, master_addr=None, master_port=None, start_method="fork"
    )
    assert "nproc_per_node" in res and res["nproc_per_node"] == 8
    assert "start_method" in res and res["start_method"] == "fork"

    with idist.Parallel(backend="xla-tpu", nproc_per_node=8, start_method="fork") as parallel:
        assert parallel.backend == "xla-tpu"
        res = parallel._spawn_params
        assert "nproc_per_node" in res and res["nproc_per_node"] == 8
        assert "start_method" in res and res["start_method"] == "fork"
Example #9
def run(config, **kwargs):
    """This is the main method to run the training. As this training script is launched with `py_config_runner`
    it should obligatory contain `run(config, **kwargs)` method.

    """

    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Pascal-VOC12 Training",
                              distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(
            config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(
            config.script_filepath, Path)

        if idist.get_rank() == 0 and exp_tracking.has_clearml:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("Pascal-VOC12 Training",
                             config.config_filepath.stem)
            task.connect_configuration(config.config_filepath.as_posix())

        log_basic_info(logger, config)

        config.output_path = Path(exp_tracking.get_output_path())
        # dump python files to reproduce the run
        exp_tracking.log_artifact(config.config_filepath.as_posix())
        exp_tracking.log_artifact(config.script_filepath.as_posix())
        exp_tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Catched KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
Example #10
def main():
    parser = ArgumentParser(parents=[get_default_parser()])
    config = parser.parse_args()

    with idist.Parallel(
        backend=config.backend,
{% if use_distributed_training and not use_distributed_launcher %}
        nproc_per_node=config.nproc_per_node,
{% if nnodes > 1 and not use_distributed_launcher%}
        node_rank=config.node_rank,
        nnodes=config.nnodes,
        master_addr=config.master_addr,
        master_port=config.master_port,
{% endif %}
{% endif %}
    ) as parallel:
        parallel.run(run, config=config)
Example #11
def run(config, **kwargs):
    """This is the main method to run the training. As this training script is launched with `py_config_runner`
    it should obligatory contain `run(config, **kwargs)` method.

    """

    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Satellite segmentation Training",
                              distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(
            config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(
            config.script_filepath, Path)

        log_basic_info(logger, config)

        config.output_path = Path(tracking.get_output_path())
        # dump python files to reproduce the run
        tracking.log_artifact(config.config_filepath.as_posix())
        tracking.log_artifact(config.script_filepath.as_posix())
        tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Catched KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
Example #12
def main(cfg: DictConfig) -> None:

    with idist.Parallel(
        backend=cfg.distributed.backend, nproc_per_node=cfg.distributed.nproc_per_node
    ) as parallel:
        parallel.run(training, cfg)
Example #13
def main(hyperparams):
    with idist.Parallel(**hyperparams.dist_params) as parallel:
        parallel.run(run, hyperparams)
Example #14
def main(
    experiment_name: str,
    gpus: Union[int, List[int], str] = "auto",
    nproc_per_node: Union[int, str] = "auto",
    dataset_root: str = "./dataset",
    log_dir: str = "./log",
    model: str = "fasterrcnn_resnet50_fpn",
    epochs: int = 13,
    batch_size: int = 4,
    lr: float = 0.01,
    download: bool = False,
    image_size: int = 256,
    resume_from: str = None,
) -> None:
    """
    Args:
        experiment_name: the name of each run
        dataset_root: dataset root directory for VOC2012 Dataset
        gpus: can be "auto", "none" or number of gpu device ids like "0,1"
        log_dir: where to put all the logs
        epochs: number of epochs to train
        model: model to use, possible options are
            "fasterrcnn_resnet50_fpn",
            "fasterrcnn_mobilenet_v3_large_fpn",
            "fasterrcnn_mobilenet_v3_large_320_fpn"
        batch_size: batch size
        lr: initial learning rate
        download: whether to automatically download dataset
        nproc_per_node: number of processes per node; "auto" uses the number of GPUs, or half the CPU cores when no GPU is available
        image_size: image size for training and validation
        resume_from: path of checkpoint to resume from
    """
    if model not in AVAILABLE_MODELS:
        raise RuntimeError(f"Invalid model name: {model}")

    if isinstance(gpus, int):
        gpus = (gpus, )
    if isinstance(gpus, tuple):
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            [str(gpu) for gpu in gpus])
    elif gpus == "auto":
        gpus = tuple(range(torch.cuda.device_count()))
    elif gpus == "none":
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        gpus = tuple()

    ngpu = len(gpus)

    backend = "nccl" if ngpu > 0 else "gloo"
    if nproc_per_node == "auto":
        nproc_per_node = ngpu if ngpu > 0 else max(
            multiprocessing.cpu_count() // 2, 1)

    # to prevent downloading the pretrained checkpoint multiple times, create the model in the main process
    model = getattr(detection, model)(pretrained=True)

    if model.__class__.__name__ == "FasterRCNN":
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 21)
    elif model.__class__.__name__ == "RetinaNet":
        head = RetinaNetClassificationHead(
            model.backbone.out_channels,
            model.anchor_generator.num_anchors_per_location()[0],
            num_classes=21)
        model.head.classification_head = head

    with idist.Parallel(backend=backend,
                        nproc_per_node=nproc_per_node) as parallel:
        parallel.run(
            run,
            "cuda" if ngpu > 0 else "cpu",
            experiment_name,
            gpus,
            dataset_root,
            log_dir,
            model,
            epochs,
            batch_size,
            lr,
            download,
            image_size,
            resume_from,
        )
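A hedged invocation sketch (the experiment name and argument values are placeholders) for the entry point above:
if __name__ == "__main__":
    # Hypothetical call: train Faster R-CNN on VOC2012 using all visible GPUs.
    main("voc-fasterrcnn-baseline", gpus="auto", download=True)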
Example #15
def run(
    seed=543,
    data_dir="/tmp/data",
    output_dir="/tmp/output-imdb/",
    model="bert-base-uncased",
    model_dir="/tmp/model",
    tokenizer_dir="/tmp/tokenizer",
    num_classes=1,
    dropout=0.3,
    n_fc=768,
    max_length=256,
    batch_size=128,
    weight_decay=0.01,
    num_workers=4,
    num_epochs=3,
    learning_rate=5e-5,
    num_warmup_epochs=0,
    validate_every=1,
    checkpoint_every=1000,
    backend=None,
    resume_from=None,
    log_every_iters=15,
    nproc_per_node=None,
    with_clearml=False,
    with_amp=False,
    **spawn_kwargs,
):
    """Main entry to fintune a transformer model on the IMDB dataset for sentiment classification.
    Args:
        seed (int): random state seed to set. Default, 543.
        data_dir (str): dataset cache directory. Default, "/tmp/data".
        output_path (str): output path. Default, "/tmp/output-IMDB".
        model (str): model name (from transformers) to setup model,tokenize and config to train. Default,
        "bert-base-uncased".
        model_dir (str): cache directory to download the pretrained model. Default, "/tmp/model".
        tokenizer_dir (str) : tokenizer cache directory. Default, "/tmp/tokenizer".
        num_classes (int) : number of target classes. Default, 1 (binary classification).
        dropout (float) : dropout probability. Default, 0.3.
        n_fc (int) : number of neurons in the last fully connected layer. Default, 768.
        max_length (int): maximum number of tokens for the inputs to the transformer model. Default, 256.
        batch_size (int): total batch size. Default, 128.
        weight_decay (float): weight decay. Default, 0.01.
        num_workers (int): number of workers in the data loader. Default, 4.
        num_epochs (int): number of epochs to train the model. Default, 3.
        learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 5e-5.
        num_warmup_epochs (int): number of warm-up epochs before learning rate decay. Default, 0.
        validate_every (int): run model's validation every ``validate_every`` epochs. Default, 1.
        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 1000.
        backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl", "xla-tpu",
            "gloo" etc. Default, None.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful
            when the main Python process is spawning training as child processes.
        resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None.
        log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations.
            It can be 0 to disable it. Default, 15.
        with_clearml (bool): if True, experiment ClearML logger is setup. Default, False.
        with_amp (bool): if True, enables native automatic mixed precision. Default, False.
        **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes
    """
    # catch all local parameters
    config = locals()
    config.update(config["spawn_kwargs"])
    del config["spawn_kwargs"]

    spawn_kwargs["nproc_per_node"] = nproc_per_node

    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:

        parallel.run(training, config)
Example #16
def run(
    seed=42,
    data_path="./data",
    subset_train="train",
    subset_val="val",
    output_path="./output",
    architecture="FPN",
    encoder="resnet50",
    encoder_weights="imagenet",
    encoder_freeze_at=None,
    batch_size=6,
    optimizer="Adam",
    weight_decay=1e-4,
    num_workers=12,
    num_iterations=10000,
    learning_rate=0.0001,
    learning_rate_milestone_iterations=(2000, 8000),
    gamma=0.1,
    num_warmup_iterations=1000,
    warmup_factor=0.001,
    validate_every=10,
    checkpoint_every=200,
    backend=None,
    resume_from=None,
    log_every_iters=0,
    nproc_per_node=None,
    stop_iteration=None,
    with_trains=False,
    active_gpu_ids=(0,),
    **spawn_kwargs,
):
    """Main entry to train a model on the semantic segmentation of carbon black agglomerate TEM images.

    Args:
        seed (int): random state seed to set. Default, 42.
        data_path (str): input dataset path. Default, "./data".
        subset_train (str): name of training subset. Default, "train".
        subset_val (str): name of validation subset. Default, "val".
        architecture (str): architecture (see https://github.com/qubvel/segmentation_models.pytorch#architectures-).
            Default, "FPN".
        encoder (str): encoder architecture (see https://github.com/qubvel/segmentation_models.pytorch#encoders-).
            Default, "resnet50".
        encoder_weights (str): pretrained weights (see https://github.com/qubvel/segmentation_models.pytorch#encoders-).
            Default, "imagenet".
        encoder_freeze_at (int or None): defines stages of the encoder which are frozen before the training (e.g. 2
            means all stages including stage 2 and beyond). Default, None.
        output_path (str): output path. Default, "./output".
        batch_size (int): total batch size. Default, 6.
        optimizer (str): optimizer. Default, "Adam".
        weight_decay (float): weight decay. Default, 1e-4.
        num_workers (int): number of workers in the data loader. Default, 12.
        num_iterations (int): number of iterations to train the model. Default, 10000.
        learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 0.0001.
        learning_rate_milestone_iterations (iterable of int): numbers of iterations where learning rate is each time
            decreased by a factor gamma. Default, (2000, 8000).
        gamma (float): factor to multiply learning rate with at each milestone. Default, 0.1.
        num_warmup_iterations (int): number of warm-up iterations before learning rate decay. Default, 1000.
        warmup_factor (float): learning rate starts at warmup_factor * learning_rate. Default, 0.001.
        validate_every (int): run model's validation every ``validate_every`` epochs. Default, 10.
        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 200.
        backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl", "xla-tpu",
            "gloo" etc. Default, None.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful
            when the main Python process is spawning training as child processes. Default, None.
        resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None.
        log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations.
            It can be 0 to disable it. Default, 0.
        stop_iteration (int, optional): iteration to stop the training. Can be used to check resume from checkpoint.
            Default, None.
        with_trains (bool): if True, experiment Trains logger is setup. Default, False.
        active_gpu_ids (tuple of int): ids of GPUs to use. Default, (0,).
        **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes

    """
    # catch all local parameters
    config = locals()
    config.update(config["spawn_kwargs"])
    del config["spawn_kwargs"]

    utils.select_active_gpus(config["active_gpu_ids"])

    spawn_kwargs["nproc_per_node"] = nproc_per_node

    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
        parallel.run(training, config)
Example #17
def run(config: ConfigSchema) -> None:
    spawn_kwargs = config.spawn_kwargs
    spawn_kwargs["nproc_per_node"] = config.nproc_per_node

    with idist.Parallel(backend=config.backend, **spawn_kwargs) as parallel:
        parallel.run(run_training, config)
Example #18
            trainer.state.output,
        ))

    trainer.run(train_loader, max_epochs=1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Pytorch Ignite - idist")
    parser.add_argument("--backend", type=str, default="nccl")
    parser.add_argument("--nproc_per_node", type=int)
    parser.add_argument("--log_interval", type=int, default=4)
    parser.add_argument("--nb_samples", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=16)
    args_parsed = parser.parse_args()

    # idist from ignite handles multiple backend (gloo, nccl, horovod, xla)
    # and launcher (torch.distributed.launch, horovodrun, slurm)
    config = {
        "log_interval": args_parsed.log_interval,
        "batch_size": args_parsed.batch_size,
        "nb_samples": args_parsed.nb_samples,
    }

    spawn_kwargs = dict()
    spawn_kwargs["nproc_per_node"] = args_parsed.nproc_per_node

    # Specific ignite.distributed
    with idist.Parallel(backend=args_parsed.backend,
                        **spawn_kwargs) as parallel:
        parallel.run(training, config)
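A hedged sketch of the `training` callable that these examples pass to `parallel.run`: Parallel.run invokes it with the local process index as the first positional argument, followed by the arguments given to `parallel.run`. The body below is illustrative only.
import ignite.distributed as idist


def training(local_rank, config):
    # Illustrative body: inspect the distributed context set up by idist.Parallel.
    rank = idist.get_rank()              # global rank of this process
    world_size = idist.get_world_size()  # total number of processes
    device = idist.device()              # device assigned to this process
    print(f"rank {rank}/{world_size} (local {local_rank}) on {device}: {config}")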
Example #19
def run(
    seed=543,
    data_path="/tmp/cifar10",
    output_path="/tmp/output-cifar10/",
    model="vit_tiny_patch4_32x32",
    rescale_size=None,
    rand_aug=None,
    rand_erasing=None,
    optimizer="adam",
    batch_size=128,
    weight_decay=1e-4,
    num_workers=4,
    num_epochs=200,
    learning_rate=0.001,
    num_warmup_epochs=0,
    validate_every=3,
    checkpoint_every=1000,
    backend=None,
    resume_from=None,
    nproc_per_node=None,
    with_pbar=False,
    with_amp=False,
    cutmix_beta=0.0,
    cutmix_prob=0.5,
    rescaled_size=None,
    with_clearml=False,
    smoke_test=False,
    **spawn_kwargs,
):
    """Main entry to train an model on CIFAR10 dataset.

    Args:
        seed (int): random state seed to set.
        data_path (str): input dataset path. Default, "/tmp/cifar10".
        output_path (str): output path. Default, "/tmp/output-cifar10".
        model (str): model name (from torchvision) to setup model to train.
        batch_size (int): total batch size.
        optimizer (str): optimizer name. Possible values: "sgd", "adam", "adamw". Default, "adam".
        weight_decay (float): weight decay.
        num_workers (int): number of workers in the data loader.
        num_epochs (int): number of epochs to train the model.
        learning_rate (float): peak of piecewise linear learning rate scheduler.
        num_warmup_epochs (int): number of warm-up epochs before learning rate decay.
        validate_every (int): run model's validation every ``validate_every`` epochs.
        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations.
        backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl",
            "gloo" etc.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful
            when the main Python process is spawning training as child processes.
        resume_from (str, optional): path to checkpoint to use to resume the training from.
        with_pbar (bool): if True, adds a progress bar on training iterations.
        with_amp (bool): if True, uses torch native AMP.
        rescale_size (int, optional): if provided, the input image will be rescaled to that value.
        cutmix_beta (float): beta value for the cutmix distribution. Default, 0.0.
        cutmix_prob (float): cutmix probability. Default, 0.5.
        with_clearml (bool): if True, experiment ClearML logger is setup.
        smoke_test (bool): run 5 iters and quit
        **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes

    """
    # catch all local parameters
    config = locals()
    config.update(config["spawn_kwargs"])
    del config["spawn_kwargs"]

    spawn_kwargs["nproc_per_node"] = nproc_per_node
    if backend == "xla-tpu" and with_amp:
        raise RuntimeError(
            "The value of with_amp should be False if backend is xla")

    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:

        parallel.run(training, config)
Example #20
def test_idist_parallel_gloo():
    with idist.Parallel(backend="gloo", nproc_per_node=4) as parallel:
        parallel.run(_test_func, ws=4, device="cpu")
Example #21
def test_idist_parallel_nccl():
    with idist.Parallel(backend="nccl",
                        nproc_per_node=torch.cuda.device_count()) as parallel:
        parallel.run(_test_func, ws=torch.cuda.device_count(), device="cuda")
Example #22
def test_idist_parallel_nccl_nprocs(local_rank, world_size):
    os.environ["RANK"] = str(local_rank)
    with idist.Parallel(backend="nccl") as parallel:
        parallel.run(_test_func, ws=world_size, device="cuda")
Example #23
    parser.add_argument("--backend", type=str, default=None)
    parser.add_argument("--nproc_per_node", type=int, default=None)
    parser.add_argument("--nnodes", type=int, default=None)
    parser.add_argument("--node_rank", type=int, default=None)
    parser.add_argument("--master_addr", type=str, default=None)
    parser.add_argument("--master_port", type=str, default=None)
    parser.add_argument("--init_method", type=str, default=None)

    args = parser.parse_args()

    config = {
        "model": "resnet18",
        "lr": 0.01,
    }
    if args.backend in ["gloo", "nccl"]:
        config["true_init_method"] = args.init_method if args.init_method is not None else "env://"

    dist_config = dict(
        nproc_per_node=args.nproc_per_node,
        nnodes=args.nnodes,
        node_rank=args.node_rank,
        master_addr=args.master_addr,
        master_port=args.master_port,
    )
    if args.init_method is not None:
        dist_config["init_method"] = args.init_method

    with idist.Parallel(backend=args.backend, **dist_config) as parallel:
        parallel.run(training, config, a=1, b=2)
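Hedged usage sketch for the snippet above; the script name and values are placeholders, mirroring the launch commands shown elsewhere on this page.
# Single process, no distributed backend:
# $ python main.py
#
# Spawn 2 child processes from the parent process with the gloo backend:
# $ python main.py --backend=gloo --nproc_per_node=2
#
# Or let torch.distributed.launch create the processes (see Example #25):
# $ python -m torch.distributed.launch --nproc_per_node=2 --use_env main.py --backend=nccl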
Example #24
def test_idist_parallel_no_dist():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    with idist.Parallel(backend=None) as parallel:
        parallel.run(_test_func, ws=1, device=device)
Example #25
# Simply run everything on your infrastructure


# --- Single computation device ---
# $ python main.py
#
if __name__ == "__main__" and not (in_colab or with_torch_launch):

    backend = None  # or "nccl", "gloo", "xla-tpu" ...
    nproc_per_node = None  # or N to spawn N processes
    config = {
        "model": "resnet18",
        "dataset": "cifar10",
    }

    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node) as parallel:
        parallel.run(training, config)


# --- Multiple GPUs ---
# $ python -m torch.distributed.launch --nproc_per_node=2 --use_env main.py
#
if __name__ == "__main__" and with_torch_launch:

    backend = "nccl"  # or "nccl", "gloo", "xla-tpu" ...
    nproc_per_node = None  # or N to spawn N processes
    config = {
        "model": "resnet18",
        "dataset": "cifar10",
    }
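    # (Hedged continuation: the branch above is truncated; presumably it ends
    # with the same Parallel block as the single-device branch. When the script
    # is started via torch.distributed.launch, the launcher has already created
    # one process per GPU, so idist.Parallel picks up the environment variables
    # instead of spawning new processes.)
    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node) as parallel:
        parallel.run(training, config)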
Example #26
                        help="Override train batch size")
    parser.add_argument("--lr",
                        type=float,
                        default=None,
                        help="Override train learning rate")
    parser.add_argument("--ep",
                        type=int,
                        default=None,
                        help="Override number of epochs")
    args = parser.parse_args()

    assert args.config is not None
    assert args.config.exists()

    # Define configuration mutations if certain cmd args are defined
    mutations = {}
    if args.bs is not None:
        mutations["train_batch_size"] = args.bs
    if args.lr is not None:
        mutations["learning_rate"] = args.lr
    if args.ep is not None:
        mutations["num_epochs"] = args.ep

    # Pass configuration file into py_config_runner.ConfigObject
    # and fetch configuration parameters as attributes
    config = ConfigObject(args.config, mutations=mutations)

    with idist.Parallel(backend=args.backend,
                        nproc_per_node=args.nproc_per_node) as parallel:
        parallel.run(training, config)