Example #1
    def initialize(self, amp_id: int, num_losses: int, use_amp: bool,
                   amp_opt_level: str, device: torch.device):
        self._amp_id = amp_id
        self._use_amp = use_amp

        # With Apex available and mixed precision requested, wrap the model and
        # optimizer with AMP, then with Apex's DistributedDataParallel on multiple GPUs.
        if APEX_AVAILABLE and self._use_amp:
            self._model, self._optimizer = amp.initialize(
                self._model,
                self._optimizer,
                opt_level=amp_opt_level,
                num_losses=num_losses)
            if on_multiple_gpus(get_devices()):
                self._model = ApexDDP(self._model, delay_allreduce=True)
        # Without Apex, fall back to PyTorch's native DistributedDataParallel on multiple GPUs.
        if not APEX_AVAILABLE and on_multiple_gpus(get_devices()):
            self._model = DDP(self._model, device_ids=[device])
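Both branches above hinge on a small device helper that the listing does not show. A minimal sketch of what get_devices and on_multiple_gpus are assumed to do here (the library's actual implementation may differ):

from typing import List

import torch


def get_devices() -> List[torch.device]:
    # Assumed behaviour: one torch.device per visible CUDA GPU, otherwise CPU only.
    if torch.cuda.is_available():
        return [torch.device(f"cuda:{i}") for i in range(torch.cuda.device_count())]
    return [torch.device("cpu")]


def on_multiple_gpus(devices: List[torch.device]) -> bool:
    # Assumed behaviour: True only when more than one GPU device is present.
    return len([device for device in devices if device.type == "cuda"]) > 1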
Example #2
def _initialize_ddp_process_group(self):
    if on_multiple_gpus(self._devices):
        if NCCL_AVAILABLE:
            # Supply defaults for the env:// rendezvous if the launcher did not set them.
            if os.environ.get("MASTER_ADDR") is None:
                os.environ["MASTER_ADDR"] = "127.0.0.1"
            if os.environ.get("MASTER_PORT") is None:
                os.environ["MASTER_PORT"] = str(
                    self._get_random_free_port())
            if os.environ.get("WORLD_SIZE") is None:
                os.environ["WORLD_SIZE"] = str(self._world_size)
            # Each rank joins the NCCL process group with its own rank id.
            torch.distributed.init_process_group(
                backend='nccl',
                init_method='env://',
                world_size=int(os.environ["WORLD_SIZE"]),
                rank=self._local_rank)
        else:
            raise Exception(
                "NCCL is not available but is required for multi-GPU training.")
Example #3
                                                    dataset_configs['ABIDE'].test_patch_size,
                                                    dataset_configs["ABIDE"].test_step,
                                                    test_image=ABIDE_reconstruction._target_images[0]))

    # Concat datasets.
    if len(dataset_configs) > 1:
        train_dataset = torch.utils.data.ConcatDataset(train_datasets)
        valid_dataset = torch.utils.data.ConcatDataset(valid_datasets)
        test_dataset = torch.utils.data.ConcatDataset(test_datasets)
    else:
        train_dataset = train_datasets[0]
        valid_dataset = valid_datasets[0]
        test_dataset = test_datasets[0]

    # Create samplers.
    if on_multiple_gpus(run_config.devices):
        train_sampler = torch.utils.data.DistributedSampler(train_dataset,
                                                            num_replicas=run_config.world_size,
                                                            rank=run_config.local_rank)
        valid_sampler = torch.utils.data.DistributedSampler(valid_dataset,
                                                            num_replicas=run_config.world_size,
                                                            rank=run_config.local_rank)
        test_sampler = torch.utils.data.DistributedSampler(test_dataset,
                                                           num_replicas=run_config.world_size,
                                                           rank=run_config.local_rank)
    else:
        train_sampler, valid_sampler, test_sampler = None, None, None

    # Create loaders.
    dataloaders = list(map(lambda dataset, sampler: DataLoader(dataset,
                                                               training_config.batch_size,
                                                               sampler=sampler,
                                                               shuffle=False if sampler is not None else True,
                                                               num_workers=args.num_workers,
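The loader construction above is cut off by the listing, but the pattern it illustrates is self-contained: one DistributedSampler per split when running on multiple GPUs, passed to a DataLoader with shuffling delegated to the sampler. A minimal, runnable sketch of that pattern with a placeholder dataset (none of these names come from the source):

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset

# Placeholder dataset standing in for the concatenated splits above.
train_dataset = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))

# Only shard the data when a distributed process group is actually running.
train_sampler = (DistributedSampler(train_dataset)
                 if dist.is_available() and dist.is_initialized()
                 else None)

train_loader = DataLoader(train_dataset,
                          batch_size=8,
                          sampler=train_sampler,
                          # A custom sampler and shuffle=True are mutually exclusive.
                          shuffle=train_sampler is None,
                          num_workers=2)

# With a DistributedSampler, call set_epoch each epoch so shuffling differs between epochs:
# for epoch in range(num_epochs):
#     if train_sampler is not None:
#         train_sampler.set_epoch(epoch)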
Example #4
def test_on_multiple_gpu_should_return_true_with_multiple_GPU_devices(self):
    assert_that(on_multiple_gpus(self._multiple_gpus_devices), is_(True))
Example #5
def test_on_multiple_gpu_should_return_false_with_single_GPU_device(self):
    assert_that(on_multiple_gpus(self._single_gpu_device), is_(False))
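The fixtures referenced by these two tests (self._multiple_gpus_devices and self._single_gpu_device) are not shown in the listing; presumably they are just lists of torch.device objects built in setUp, along the lines of this sketch (the attribute names are taken from the tests, everything else is an assumption):

import unittest

import torch
from hamcrest import assert_that, is_


class OnMultipleGpusTest(unittest.TestCase):

    def setUp(self):
        # Hypothetical fixtures; the source's actual setUp is not shown.
        # torch.device objects are plain descriptors, so no GPU is needed to build them.
        self._single_gpu_device = [torch.device("cuda:0")]
        self._multiple_gpus_devices = [torch.device("cuda:0"), torch.device("cuda:1")]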