Exemple #1
0
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(
        config["data_path"],
        **{k: config[k]
           for k in ["rescale_size", "rand_aug", "rand_erasing"]},
    )

    if idist.get_rank() == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader
    def __init__(self,
                 logger: TrainsLogger = None,
                 output_uri: str = None,
                 dirname: str = None,
                 *args,
                 **kwargs):

        self._setup_check_trains(logger, output_uri)

        if not dirname:
            dirname = ""
            if idist.get_rank() == 0:
                dirname = tempfile.mkdtemp(
                    prefix="ignite_checkpoints_{}".format(
                        datetime.now().strftime("%Y_%m_%d_%H_%M_%S_")))
            if idist.get_world_size() > 1:
                dirname = idist.all_gather(dirname)[0]

            warnings.warn(
                "TrainsSaver created a temporary checkpoints directory: {}".
                format(dirname))
            idist.barrier()

        # Let's set non-atomic tmp dir saving behaviour
        if "atomic" not in kwargs:
            kwargs["atomic"] = False

        super(TrainsSaver, self).__init__(dirname=dirname, *args, **kwargs)
Exemple #3
0
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only local rank 0 download the dataset
        # Thus each node will download a copy of the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(
        config["data_path"])

    if idist.get_local_rank() == 0:
        # Ensure that only local rank 0 download the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader
Exemple #4
0
def get_dataflow(config):

    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    train_dataset, test_dataset = get_train_test_datasets(config.get("data_path", "."))

    if idist.get_rank() == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.get("batch_size", 512),
        num_workers=config.get("num_workers", 8),
        shuffle=True,
        drop_last=True,
    )
    config["num_iters_per_epoch"] = len(train_loader)

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config.get("batch_size", 512),
        num_workers=config.get("num_workers", 8),
        shuffle=False,
    )
    return train_loader, test_loader
def get_dataflow(config: ConfigSchema, wlm: WeakLabelManager) -> Dict[str, DataLoader]:
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    dataset = get_dataset(config.dataset, config.data_path)
    train_split = wlm.convert_targets(dataset["train"])

    if idist.get_rank() == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_split,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        drop_last=True,
    )
    val_loader = idist.auto_dataloader(
        dataset["val"],
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
    )
    test_loader = idist.auto_dataloader(
        dataset["test"],
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
    )
    return {"train": train_loader, "val": val_loader, "test": test_loader}
Exemple #6
0
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(
        config["data_path"])

    if idist.get_rank() == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        pin_memory="cuda" in idist.device().type,
        drop_last=True,
    )

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
        pin_memory="cuda" in idist.device().type,
    )
    return train_loader, test_loader
    def __init__(
        self,
        logger: Optional[ClearMLLogger] = None,
        output_uri: Optional[str] = None,
        dirname: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:

        self._setup_check_clearml(logger, output_uri)

        if not dirname:
            dirname = ""
            if idist.get_rank() == 0:
                dirname = tempfile.mkdtemp(prefix=f"ignite_checkpoints_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')}")
            if idist.get_world_size() > 1:
                dirname = idist.all_gather(dirname)[0]  # type: ignore[index, assignment]

            warnings.warn(f"ClearMLSaver created a temporary checkpoints directory: {dirname}")
            idist.barrier()

        # Let's set non-atomic tmp dir saving behaviour
        if "atomic" not in kwargs:
            kwargs["atomic"] = False

        self._checkpoint_slots = defaultdict(list)  # type: DefaultDict[Union[str, Tuple[str, str]], List[Any]]

        super(ClearMLSaver, self).__init__(dirname=dirname, *args, **kwargs)  # type: ignore[misc]
Exemple #8
0
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_dataset(config["data_dir"],
                                                    config["model"],
                                                    config["tokenizer_dir"],
                                                    config["max_length"])

    if idist.get_rank() == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader
Exemple #9
0
def _test_distrib_barrier(device):

    t = torch.tensor([idist.get_rank()], device=device, dtype=torch.float)
    true_res = sum([i for i in range(idist.get_world_size())])

    if idist.get_rank() == 0:
        t += 10.0
    idist.barrier()

    tt = idist.all_reduce(t)
    assert tt.item() == true_res + 10.0
Exemple #10
0
def get_datasets(path):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    train_ds = datasets.CIFAR10(root=path, train=True, download=True, transform=train_transform)
    eval_ds = datasets.CIFAR10(root=path, train=False, download=True, transform=eval_transform)

    if local_rank == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    return train_ds, eval_ds
def get_datasets(*args, **kwargs):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    # CUSTOM DATASETS GO HERE
    train_dataset = ...
    eval_dataset = ...

    if local_rank == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    return train_dataset, eval_dataset
Exemple #12
0
def _test_tpu_saves_to_cpu(device, dirname):
    torch.manual_seed(0)

    h = ModelCheckpoint(dirname, _PREFIX)
    engine = Engine(lambda e, b: None)
    engine.state = State(epoch=0, iteration=1)

    model = DummyModel().to(device)
    to_save = {"model": model}

    h(engine, to_save)

    idist.barrier()

    fname = h.last_checkpoint
    assert isinstance(fname, str)
    assert os.path.join(dirname, _PREFIX) in fname
    assert os.path.exists(fname)
    loaded_objects = torch.load(fname)
    assert loaded_objects == model.cpu().state_dict()
Exemple #13
0
def get_model_weights(config, logger, with_clearml):

    path = ""
    if with_clearml:
        from clearml import Model

        if idist.get_rank() > 0:
            idist.barrier()
        else:
            model_id = config.weights_path

            logger.info(f"Loading trained model: {model_id}")
            model = Model(model_id)
            assert model is not None, f"{model_id}"
            path = model.get_local_copy()
            idist.barrier()
        path = idist.broadcast(path, src=0)
    else:
        path = config.weights_path
        logger.info(f"Loading {path}")

    assert Path(path).exists(), f"{path} is not found"
    return torch.load(path)
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    train_dataset, test_dataset = get_dataset(
        config.data_dir, config.model, config.tokenizer_dir, config.max_length
    )

    if idist.get_local_rank() == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        drop_last=True,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers = True,
        {% endif %}
    )

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
        {% if use_distributed_training and not use_distributed_launcher %}
    persistent_workers = True,
        {% endif %}
    )
    return train_loader, test_loader
Exemple #15
0
def _test_save_model_optimizer_lr_scheduler_with_state_dict(
        device, on_zero_rank=False):

    if idist.get_rank() == 0:
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock(
        )

    torch.manual_seed(23)

    model = DummyModel().to(device)

    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)

    def update_fn(engine, batch):
        x = torch.rand((4, 2)).to(device)
        optim.zero_grad()
        y = model(x)
        # Below code raises: RuntimeError: torch_xla/csrc/tensor_impl.cpp:144 : XLA tensors do not have storage
        # Probably related to https://github.com/pytorch/xla/issues/2576
        # loss = y.pow(2.0).sum()
        loss = y.sum()
        loss.backward()
        if idist.has_xla_support:
            import torch_xla.core.xla_model as xm

            xm.optimizer_step(optim, barrier=True)
        else:
            optim.step()
        lr_scheduler.step()

    engine = Engine(update_fn)

    to_save = {
        "model": model,
        "optimizer": optim,
        "lr_scheduler": lr_scheduler
    }

    with pytest.warns(
            UserWarning,
            match=r"ClearMLSaver created a temporary checkpoints directory"):
        clearml_saver = ClearMLSaver()

    if (not on_zero_rank) or (on_zero_rank and idist.get_rank() == 0):
        checkpoint = Checkpoint(to_save=to_save,
                                save_handler=clearml_saver,
                                n_saved=1)
        engine.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)

    engine.run([0], max_epochs=4)

    idist.barrier()

    saved_objects = sorted(os.listdir(clearml_saver.dirname))
    # saved object is ['PREFIX_checkpoint_3.pt', ]
    saved_checkpoint = os.path.join(clearml_saver.dirname, saved_objects[0])

    if idist.has_xla_support:
        device = "cpu"

    loaded_obj = torch.load(saved_checkpoint, map_location=device)
    for f in ["model", "optimizer", "lr_scheduler"]:
        assert f in loaded_obj
    loaded_model_state_dict = loaded_obj["model"]
    loaded_optimizer_state_dict = loaded_obj["optimizer"]
    loaded_lr_scheduler_state_dict = loaded_obj["lr_scheduler"]

    assert isinstance(loaded_model_state_dict, dict)
    assert isinstance(loaded_optimizer_state_dict, dict)
    assert isinstance(loaded_lr_scheduler_state_dict, dict)

    # Specifically move device to CPU first
    model_state_dict = model.cpu().state_dict()
    for key in model_state_dict.keys():
        assert key in loaded_model_state_dict
        model_value = model_state_dict[key]
        loaded_model_value = loaded_model_state_dict[key]
        assert (model_value.cpu().numpy() == loaded_model_value.cpu().numpy()
                ).all()

    optim_state_dict = optim.state_dict()
    for key in optim_state_dict.keys():
        assert key in loaded_optimizer_state_dict
        optim_value = optim_state_dict[key]
        loaded_optim_value = loaded_optimizer_state_dict[key]
        if idist.get_rank() == 0:
            assert optim_value == loaded_optim_value

    lr_scheduler_state_dict = lr_scheduler.state_dict()
    for key in lr_scheduler_state_dict.keys():
        assert key in loaded_lr_scheduler_state_dict
        lr_scheduler_value = lr_scheduler_state_dict[key]
        loaded_lr_scheduler_value = loaded_lr_scheduler_state_dict[key]
        assert lr_scheduler_value == loaded_lr_scheduler_value
def get_datasets(dataset, dataroot):
    """

    Args:
        dataset (str): Name of the dataset to use. See CLI help for details
        dataroot (str): root directory where the dataset will be stored.

    Returns:
        dataset, num_channels
    """
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    resize = T.Resize(64)
    crop = T.CenterCrop(64)
    to_tensor = T.ToTensor()
    normalize = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if dataset in {"imagenet", "folder", "lfw"}:
        dataset = dset.ImageFolder(root=dataroot,
                                   transform=T.Compose(
                                       [resize, crop, to_tensor, normalize]))
        nc = 3

    elif dataset == "lsun":
        dataset = dset.LSUN(root=dataroot,
                            classes=["bedroom_train"],
                            transform=T.Compose(
                                [resize, crop, to_tensor, normalize]))
        nc = 3

    elif dataset == "cifar10":
        dataset = dset.CIFAR10(root=dataroot,
                               download=True,
                               transform=T.Compose(
                                   [resize, to_tensor, normalize]))
        nc = 3

    elif dataset == "mnist":
        dataset = dset.MNIST(root=dataroot,
                             download=True,
                             transform=T.Compose(
                                 [resize, to_tensor, normalize]))
        nc = 1

    elif dataset == "fake":
        dataset = dset.FakeData(size=256,
                                image_size=(3, 64, 64),
                                transform=to_tensor)
        nc = 3

    else:
        raise RuntimeError(f"Invalid dataset name: {dataset}")

    if local_rank == 0:
        # Ensure that only rank 0 download the dataset
        idist.barrier()

    return dataset, nc
Exemple #17
0
def _test_save_model_optimizer_lr_scheduler_with_state_dict(
        device, dirname, on_zero_rank=False):

    torch.manual_seed(23)

    model = DummyModel().to(device)

    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)

    def update_fn(engine, batch):
        x = torch.rand((4, 1)).to(device)
        optim.zero_grad()
        y = model(x)
        loss = y.pow(2.0).sum()
        loss.backward()
        if idist.has_xla_support:
            import torch_xla.core.xla_model as xm

            xm.optimizer_step(optim, barrier=True)
        else:
            optim.step()
        lr_scheduler.step()

    engine = Engine(update_fn)

    if (not on_zero_rank) or (on_zero_rank and idist.get_rank() == 0):
        handler = ModelCheckpoint(dirname, _PREFIX, create_dir=True, n_saved=1)

        engine.add_event_handler(Events.EPOCH_COMPLETED, handler, {
            "model": model,
            "optimizer": optim,
            "lr_scheduler": lr_scheduler
        })

    engine.run([0], max_epochs=4)

    idist.barrier()

    saved_objects = sorted(os.listdir(dirname))
    # saved object is ['PREFIX_checkpoint_3.pt', ]
    saved_checkpoint = os.path.join(dirname, saved_objects[0])

    if idist.has_xla_support:
        device = "cpu"

    loaded_obj = torch.load(saved_checkpoint, map_location=device)
    for f in ["model", "optimizer", "lr_scheduler"]:
        assert f in loaded_obj
    loaded_model_state_dict = loaded_obj["model"]
    loaded_optimizer_state_dict = loaded_obj["optimizer"]
    loaded_lr_scheduler_state_dict = loaded_obj["lr_scheduler"]

    assert isinstance(loaded_model_state_dict, dict)
    assert isinstance(loaded_optimizer_state_dict, dict)
    assert isinstance(loaded_lr_scheduler_state_dict, dict)

    # Specifically move device to CPU first
    model_state_dict = model.cpu().state_dict()
    for key in model_state_dict.keys():
        assert key in loaded_model_state_dict
        model_value = model_state_dict[key]
        loaded_model_value = loaded_model_state_dict[key]
        assert model_value.cpu().numpy() == loaded_model_value.cpu().numpy()

    optim_state_dict = optim.state_dict()
    for key in optim_state_dict.keys():
        assert key in loaded_optimizer_state_dict
        optim_value = optim_state_dict[key]
        loaded_optim_value = loaded_optimizer_state_dict[key]
        if idist.get_rank() == 0:
            assert optim_value == loaded_optim_value

    lr_scheduler_state_dict = lr_scheduler.state_dict()
    for key in lr_scheduler_state_dict.keys():
        assert key in loaded_lr_scheduler_state_dict
        lr_scheduler_value = lr_scheduler_state_dict[key]
        loaded_lr_scheduler_value = loaded_lr_scheduler_state_dict[key]
        assert lr_scheduler_value == loaded_lr_scheduler_value
def train(args):
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    # initialize the distributed training process, every GPU runs in a process
    dist.init_process_group(backend="nccl", init_method="env://")

    if idist.get_local_rank() == 0 and not os.path.exists(args.dir):
        # create 40 random image, mask paris for training
        print(f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(40):
            im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))
    idist.barrier()

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    train_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    train_transforms = Compose(
        [
            LoadImaged(keys=["image", "label"]),
            AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
            ScaleIntensityd(keys="image"),
            RandCropByPosNegLabeld(
                keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
            ),
            RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
            EnsureTyped(keys=["image", "label"]),
        ]
    )

    # create a training data loader
    train_ds = Dataset(data=train_files, transform=train_transforms)
    # create a training data sampler
    train_sampler = DistributedSampler(train_ds)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=train_sampler,
    )

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{idist.get_local_rank()}")
    torch.cuda.set_device(device)
    net = monai.networks.nets.UNet(
        spatial_dims=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss = monai.losses.DiceLoss(sigmoid=True)
    opt = torch.optim.Adam(net.parameters(), 1e-3)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)
    # wrap the model with DistributedDataParallel module
    net = DistributedDataParallel(net, device_ids=[device])

    train_post_transforms = Compose(
        [
            EnsureTyped(keys="pred"),
            Activationsd(keys="pred", sigmoid=True),
            AsDiscreted(keys="pred", threshold=0.5),
            KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
        ]
    )
    train_handlers = [
        LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
    ]
    if idist.get_rank() == 0:
        train_handlers.extend(
            [
                StatsHandler(tag_name="train_loss", output_transform=from_engine(["loss"], first=True)),
                CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2),
            ]
        )

    trainer = SupervisedTrainer(
        device=device,
        max_epochs=5,
        train_data_loader=train_loader,
        network=net,
        optimizer=opt,
        loss_function=loss,
        inferer=SimpleInferer(),
        # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation
        amp=True if monai.utils.get_torch_version_tuple() >= (1, 6) else False,
        postprocessing=train_post_transforms,
        key_train_metric={"train_acc": Accuracy(output_transform=from_engine(["pred", "label"]), device=device)},
        train_handlers=train_handlers,
    )
    trainer.run()
    dist.destroy_process_group()