def _get_num_workers(self):
    """Check horovod, smdataparallel, and torch.distributed."""
    # Try torch.distributed
    # torch.distributed is empty on Mac on Torch <= 1.2
    if hasattr(dist, "is_initialized") and dist.is_initialized():
        return torch.distributed.get_world_size()
    else:
        # Try horovod
        try:
            import horovod.torch as hvd

            if hvd.size():
                return hvd.size()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        # Try smdataparallel
        # smdistributed.dataparallel should be invoked via `mpirun`.
        # It supports EC2 machines with 8 GPUs per machine.
        if check_smdataparallel_env():
            try:
                import smdistributed.dataparallel.torch.distributed as smdataparallel

                if smdataparallel.get_world_size():
                    return smdataparallel.get_world_size()
            except (ModuleNotFoundError, ValueError, ImportError):
                pass

    # Return default
    return 1
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
    warnings.warn(
        "SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
        FutureWarning,
    )
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    num_samples = len(self.dataset)
    # Add extra samples to make num_samples a multiple of batch_size if passed
    if batch_size is not None:
        self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
    else:
        self.num_samples = int(math.ceil(num_samples / num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.batch_size = batch_size
def get_world_size():
    """
    Gets total number of distributed workers or returns one if distributed is not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
    else:
        world_size = 1
    return world_size
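# Hedged usage sketch (not from the original source): a helper like get_world_size()
# above is typically used to turn a per-process value into a global one, e.g. the
# effective batch size. The variable names below are illustrative assumptions.
per_device_batch_size = 32
global_batch_size = per_device_batch_size * get_world_size()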
def world_size(self):
    """
    The number of processes used in parallel.
    """
    if is_torch_tpu_available():
        return xm.xrt_world_size()
    elif is_sagemaker_distributed_available():
        return sm_dist.get_world_size()
    elif self.local_rank != -1:
        return torch.distributed.get_world_size()
    return 1
def __init__(
    self,
    batch_size: int,
    dataset: Optional[Dataset] = None,
    num_replicas: Optional[int] = None,
    rank: Optional[int] = None,
    seed: int = 0,
    drop_last: bool = False,
    lengths: Optional[List[int]] = None,
    model_input_name: Optional[str] = None,
):
    if dataset is None and lengths is None:
        raise ValueError("One of dataset and lengths must be provided.")
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.batch_size = batch_size
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.drop_last = drop_last

    if lengths is None:
        model_input_name = model_input_name if model_input_name is not None else "input_ids"
        if (
            not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
            or model_input_name not in dataset[0]
        ):
            raise ValueError(
                "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                f"'{model_input_name}' key."
            )
        lengths = [len(feature[model_input_name]) for feature in dataset]
    self.lengths = lengths

    # If the dataset length is evenly divisible by # of replicas, then there
    # is no need to drop any data, since the dataset will be split equally.
    if self.drop_last and len(self.lengths) % self.num_replicas != 0:
        # Split to nearest available length that is evenly divisible.
        # This is to ensure each rank receives the same amount of data when
        # using this Sampler.
        self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
    else:
        self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
    self.total_size = self.num_samples * self.num_replicas
    self.seed = seed
def distributed_concat(tensor: "torch.Tensor", num_total_examples: Optional[int] = None) -> torch.Tensor:
    try:
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor)
        output_tensors = [tensor.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensor)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")
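# Hedged usage sketch (an assumption, not from the original source): gather the
# per-rank prediction shards produced under a SequentialDistributedSampler and
# drop the padding samples so the result matches the real dataset length.
# `preds_on_this_rank` and `eval_dataset` are illustrative names.
all_preds = distributed_concat(preds_on_this_rank, num_total_examples=len(eval_dataset))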
def __init__(self, dataset, num_replicas=None, rank=None):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
def dist_setting(args):
    # args.data_parallel = False
    print("args.data_parallel : {}".format(args.data_parallel))
    print("args.model_parallel : {}".format(args.model_parallel))
    print("args.apex : {}".format(args.apex))

    args.world_size = 1
    args.host_num = args.hosts.index(args.current_host)

    if args.data_parallel:
        args.world_size = sdp.get_world_size()
        args.rank = sdp.get_rank()  # total rank in all hosts
        args.local_rank = sdp.get_local_rank()  # rank per host
    elif args.model_parallel:
        args.world_size = smp.size()
        args.local_rank = smp.local_rank()  # rank per host
        args.rank = smp.rank()
        args.dp_size = smp.dp_size()
        args.dp_rank = smp.dp_rank()
        print(
            "smp.rank() : {}, smp.size() : {}, smp.mp_rank() : {}, smp.local_size() : {}, "
            "smp.get_mp_group() : {}, smp.get_dp_group() : {}, smp.local_rank() : {}, "
            "smp.dp_size() : {}, smp.dp_rank() : {}".format(
                smp.rank(), smp.size(), smp.mp_rank(), smp.local_size(),
                smp.get_mp_group(), smp.get_dp_group(), smp.local_rank(),
                smp.dp_size(), smp.dp_rank()))
    else:
        args.world_size = len(args.hosts) * args.num_gpus
        if args.local_rank is not None:
            args.rank = args.num_gpus * args.host_num + \
                args.local_rank  # total rank in all hosts
        dist.init_process_group(backend=args.backend,
                                rank=args.rank,
                                world_size=args.world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
                dist.get_rank(), args.num_gpus))

    print("**** [dist_setting] args.rank : {}".format(args.rank))
    print("args.world_size : {}".format(args.world_size))
    print("Use GPU: {} for training".format(args.local_rank))

    args.lr = args.lr * float(args.world_size)
    args.batch_size //= args.world_size // args.num_gpus
    args.batch_size = max(args.batch_size, 1)

    return args
def distributed_broadcast_scalars(
    scalars: List[Union[int, float]], num_total_examples: Optional[int] = None
) -> torch.Tensor:
    try:
        tensorized_scalar = torch.tensor(scalars).cuda()
        output_tensors = [tensorized_scalar.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensorized_scalar)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")
def get_distributed_worker():
    """Get the rank for torch.distributed, horovod, or smdistributed.dataparallel.
    If none of them are being used, return None."""
    rank = None
    try:
        import torch.distributed as dist
    except (ImportError, ModuleNotFoundError):
        dist = None
        rank = None

    if dist and hasattr(dist, "is_initialized") and dist.is_initialized():
        rank = dist.get_rank()
    else:
        try:
            import horovod.torch as hvd

            if hvd.size():
                rank = hvd.rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        try:
            import horovod.tensorflow as hvd

            if hvd.size():
                rank = hvd.rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        # smdistributed.dataparallel should be invoked via `mpirun`.
        # It supports EC2 machines with 8 GPUs per machine.
        if check_smdataparallel_env():
            try:
                import smdistributed.dataparallel.torch.distributed as smdataparallel

                if smdataparallel.get_world_size():
                    return smdataparallel.get_rank()
            except (ModuleNotFoundError, ValueError, ImportError):
                pass

            try:
                import smdistributed.dataparallel.tensorflow as smdataparallel

                if smdataparallel.size():
                    return smdataparallel.rank()
            except (ModuleNotFoundError, ValueError, ImportError):
                pass
    return rank
def get_val_dataloader(dataset, args):
    if args.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset,
            num_replicas=herring.get_world_size(),
            rank=herring.get_rank())
    else:
        val_sampler = None

    val_dataloader = DataLoader(
        dataset,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: distributed sampler is shuffled :(
        sampler=val_sampler,
        num_workers=args.num_workers)

    return val_dataloader
def total_processes_number(local_rank):
    """
    Return the number of processes launched in parallel. Works with `torch.distributed` and TPUs.
    """
    if is_torch_tpu_available():
        import torch_xla.core.xla_model as xm

        return xm.xrt_world_size()
    elif is_sagemaker_dp_enabled():
        import smdistributed.dataparallel.torch.distributed as dist

        return dist.get_world_size()
    elif local_rank != -1 and is_torch_available():
        import torch

        return torch.distributed.get_world_size()
    return 1
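# Hedged usage sketch (an assumption): scale the learning rate linearly with the
# number of parallel processes, mirroring the `args.lr * world_size` pattern used
# in the training scripts above. `args` with `lr` and `local_rank` fields is an
# illustrative assumption.
args.lr = args.lr * total_processes_number(args.local_rank)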
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    num_samples = len(self.dataset)
    # Add extra samples to make num_samples a multiple of batch_size if passed
    if batch_size is not None:
        self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
    else:
        self.num_samples = int(math.ceil(num_samples / num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.batch_size = batch_size
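# Hedged sketch (an assumption, not necessarily the original implementation): a
# constructor like the one above is usually paired with an __iter__ that pads the
# index list up to total_size and then hands each rank a contiguous, ordered chunk,
# which is what makes the truncation in distributed_concat() safe.
def __iter__(self):
    indices = list(range(len(self.dataset)))
    # pad with wrapped-around indices so every rank gets exactly num_samples
    indices += indices[: (self.total_size - len(indices))]
    assert len(indices) == self.total_size
    # each rank takes its contiguous slice, preserving dataset order
    indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
    assert len(indices) == self.num_samples
    return iter(indices)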
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary across all processes so that every process has the
    averaged results (the values are all-reduced, then divided by the world size).
    Returns a dict with the same fields as loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        # dist.reduce(all_losses, dst=0)
        herring.all_reduce(all_losses)
        # if herring.get_rank() == 0:
        #     only the main process gets the accumulated sum, so only divide by
        #     world_size in that case (not needed with all_reduce above)
        all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
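# Hedged usage sketch (an assumption): log the averaged losses once per step from
# the main process only. `loss_dict` holds per-loss tensors; the rank guard mirrors
# the herring pattern used elsewhere in this file, but the exact names are illustrative.
loss_dict_reduced = reduce_loss_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
if herring.get_rank() == 0:
    print("total loss: {:.4f}".format(losses_reduced.item()))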
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=herring.get_local_rank())
    parser.add_argument("--seed",
                        help="manually set random seed for torch",
                        type=int,
                        default=99)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument(
        "--bucket-cap-mb",
        dest="bucket_cap_mb",
        help="specify bucket size for herring",
        default=25,
        type=int,
    )
    parser.add_argument("--data-dir",
                        dest="data_dir",
                        help="Absolute path of dataset ",
                        type=str,
                        default=None)

    args = parser.parse_args()

    # Set seed to reduce randomness
    random.seed(args.seed + herring.get_local_rank())
    np.random.seed(args.seed + herring.get_local_rank())
    torch.manual_seed(args.seed + herring.get_local_rank())
    torch.cuda.manual_seed(args.seed + herring.get_local_rank())

    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(
        #     backend="nccl", init_method="env://"
        # )
        # synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, herring.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args)
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=dist.get_local_rank())
    parser.add_argument(
        "--seed", help="manually set random seed for torch", type=int, default=99
    )
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument(
        "--bucket-cap-mb",
        dest="bucket_cap_mb",
        help="specify bucket size for SMDataParallel",
        default=25,
        type=int,
    )
    parser.add_argument(
        "--data-dir", dest="data_dir", help="Absolute path of dataset ", type=str, default=None
    )
    parser.add_argument("--dtype", dest="dtype")
    parser.add_argument("--spot_ckpt", default=None)

    args = parser.parse_args()

    keys = list(os.environ.keys())
    args.data_dir = os.environ['SM_CHANNEL_TRAIN'] if 'SM_CHANNEL_TRAIN' in keys else args.data_dir
    print("dataset dir: ", args.data_dir)

    # Set seed to reduce randomness
    random.seed(args.seed + dist.get_local_rank())
    np.random.seed(args.seed + dist.get_local_rank())
    torch.manual_seed(args.seed + dist.get_local_rank())
    torch.cuda.manual_seed(args.seed + dist.get_local_rank())

    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    num_gpus = dist.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        # SMDataParallel: Pin each GPU to a single SMDataParallel process.
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(
        #     backend="nccl", init_method="env://"
        # )
        # synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.DTYPE = args.dtype

    # grab checkpoint file to start from
    os.system(f"aws s3 cp {args.spot_ckpt} /opt/ml/checkpoints/{args.spot_ckpt.split('/')[-1]}")
    cfg.MODEL.WEIGHT = f"/opt/ml/checkpoints/{args.spot_ckpt.split('/')[-1]}"

    cfg.freeze()

    print("CONFIG")
    print(cfg)

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, dist.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args)
def get_world_size():
    # if not dist.is_available():
    #     return 1
    # if not dist.is_initialized():
    #     return 1
    return herring.get_world_size()
def _get_data_loader(imgs, trn_df, vld_df):
    import albumentations as A
    from albumentations import (
        Rotate, HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE,
        RandomRotate90, Transpose, ShiftScaleRotate, Blur, OpticalDistortion,
        GridDistortion, HueSaturationValue, IAAAdditiveGaussianNoise, GaussNoise,
        MotionBlur, MedianBlur, RandomBrightnessContrast, IAAPiecewiseAffine,
        IAASharpen, IAAEmboss, Flip, OneOf, Compose)
    from albumentations.pytorch import ToTensor, ToTensorV2

    train_transforms = A.Compose([
        Rotate(20),
        OneOf([
            IAAAdditiveGaussianNoise(),
            GaussNoise(),
        ], p=0.2),
        OneOf([
            MotionBlur(p=.2),
            MedianBlur(blur_limit=3, p=0.1),
            Blur(blur_limit=3, p=0.1),
        ], p=0.2),
        ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.2),
        OneOf([
            OpticalDistortion(p=0.3),
            GridDistortion(p=.1),
            IAAPiecewiseAffine(p=0.3),
        ], p=0.2),
        OneOf([
            CLAHE(clip_limit=2),
            IAASharpen(),
            IAAEmboss(),
            RandomBrightnessContrast(),
        ], p=0.3),
        HueSaturationValue(p=0.3),
        ToTensor()
    ], p=1.0)

    valid_transforms = A.Compose([ToTensor()])

    from torch.utils.data import Dataset, DataLoader
    trn_dataset = BangaliDataset(imgs=imgs, label_df=trn_df, transform=train_transforms)
    vld_dataset = BangaliDataset(imgs=imgs, label_df=vld_df, transform=valid_transforms)

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    trn_sampler = torch.utils.data.distributed.DistributedSampler(
        trn_dataset,
        num_replicas=world_size,  # split the dataset across world_size replicas
        rank=rank)
    trn_loader = DataLoader(trn_dataset,
                            shuffle=False,
                            num_workers=8,
                            pin_memory=True,
                            batch_size=BATCH_SIZE,
                            sampler=trn_sampler)
    vld_loader = DataLoader(vld_dataset,
                            shuffle=False,
                            num_workers=NUM_WORKERS,
                            batch_size=BATCH_SIZE)

    return trn_loader, vld_loader
                        default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--model_dir', type=str,
                        default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--num_gpus', type=int,
                        default=os.environ['SM_NUM_GPUS'])

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    # parse arguments
    args = parser_args()

    args.world_size = dist.get_world_size()
    args.rank = dist.get_rank()
    args.local_rank = dist.get_local_rank()
    # print(f"rank={args.rank}, local_rank={args.local_rank}")

    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)

    args.use_cuda = args.num_gpus > 0
    print("args.use_cuda : {} , args.num_gpus : {}".format(args.use_cuda, args.num_gpus))
    args.device = torch.device("cuda" if args.use_cuda else "cpu")

    train_model(args)
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287

    # Setup multi-GPU if necessary
    args.distributed = False
    # if 'WORLD_SIZE' in os.environ:
    #     args.distributed = int(os.environ['WORLD_SIZE']) > 1
    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(backend='nccl', init_method='env://')
        # args.N_gpu = torch.distributed.get_world_size()
        args.N_gpu = herring.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        # args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
        args.seed = (args.seed + herring.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300, broadcast_buffers=False)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
            checkpoint = torch.load(
                args.checkpoint,
                map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader,
                                    val_dataloader, encoder, iteration, logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if herring.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()

    if herring.get_rank() == 0:
        DLLogger.log((), {'Total training time': '%.2f' % total_time + ' secs'})
        logger.log_summary()
def main(): # Training settings parser = argparse.ArgumentParser(description="PyTorch MNIST Example") parser.add_argument( "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)", ) parser.add_argument( "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)", ) parser.add_argument( "--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)", ) parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)") parser.add_argument( "--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)", ) parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") parser.add_argument( "--log-interval", type=int, default=10, metavar="N", help="how many batches to wait before logging training status", ) parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model") parser.add_argument( "--verbose", action="store_true", default=False, help="For displaying smdistributed.dataparallel-specific logs", ) parser.add_argument( "--data-path", type=str, default="/tmp/data", help="Path for downloading " "the MNIST dataset", ) args = parser.parse_args() args.world_size = dist.get_world_size() args.rank = rank = dist.get_rank() args.local_rank = local_rank = dist.get_local_rank() args.lr = 1.0 args.batch_size //= args.world_size // 8 args.batch_size = max(args.batch_size, 1) data_path = args.data_path if args.verbose: print( "Hello from rank", rank, "of local_rank", local_rank, "in world size of", args.world_size, ) if not torch.cuda.is_available(): raise CUDANotFoundException( "Must run smdistributed.dataparallel MNIST example on CUDA-capable devices." ) torch.manual_seed(args.seed) device = torch.device("cuda") # select a single rank per node to download data is_first_local_rank = local_rank == 0 if is_first_local_rank: train_dataset = datasets.MNIST( data_path, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ]), ) dist.barrier() # prevent other ranks from accessing the data early if not is_first_local_rank: train_dataset = datasets.MNIST( data_path, train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ]), ) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=args.world_size, rank=rank) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True, sampler=train_sampler, ) if rank == 0: test_loader = torch.utils.data.DataLoader( datasets.MNIST( data_path, train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ]), ), batch_size=args.test_batch_size, shuffle=True, ) model = DDP(Net().to(device)) torch.cuda.set_device(local_rank) model.cuda(local_rank) optimizer = optim.Adadelta(model.parameters(), lr=args.lr) scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) for epoch in range(1, args.epochs + 1): train(args, model, device, train_loader, optimizer, epoch) if rank == 0: test(model, device, test_loader) scheduler.step() if args.save_model: torch.save(model.state_dict(), "mnist_cnn.pt")
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=14,
                        metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr',
                        type=float,
                        default=1.0,
                        metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.7,
                        metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    parser.add_argument('--verbose',
                        action='store_true',
                        default=False,
                        help='For displaying SM Data Parallel-specific logs')
    parser.add_argument('--data-path',
                        type=str,
                        default='/tmp/data',
                        help='Path for downloading '
                        'the MNIST dataset')

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print('Hello from rank', rank, 'of local_rank', local_rank,
              'in world size of', args.world_size)

    if not torch.cuda.is_available():
        raise Exception(
            "Must run SM Distributed DataParallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)

    device = torch.device("cuda")

    if local_rank == 0:
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ), (0.3081, ))
                                       ]))
    else:
        time.sleep(8)
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ), (0.3081, ))
                                       ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(data_path,
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.test_batch_size,
            shuffle=True)

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, data_dir=None):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        print("batch size ", cfg.SOLVER.IMS_PER_BATCH, "num_gpus, ", num_gpus,
              images_per_batch % num_gpus)
        assert (
            images_per_batch % num_gpus == 0
        ), ("SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus))
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert (
            images_per_batch % num_gpus == 0
        ), ("TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus))
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file(
        "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True
    )
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    transforms = build_transforms(cfg, is_train)
    datasets, epoch_size = build_dataset(dataset_list, transforms, DatasetCatalog, data_dir, is_train)
    print("total_dataset_size: ", epoch_size)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(
            dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter
        )
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        iterations_per_epoch = epoch_size // images_per_batch
        return data_loaders[0], iterations_per_epoch
    return data_loaders
    # SageMaker Container environment
    parser.add_argument('--model-dir', type=str, default='../model')
    parser.add_argument('--data-dir', type=str, default='../data')

    args = parser.parse_args()

    try:
        args.model_dir = os.environ['SM_MODEL_DIR']
        args.data_dir = os.environ['SM_CHANNEL_TRAINING']
    except KeyError as e:
        print("The model starts training on the local host without a SageMaker TrainingJob.")
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)
        pass

    ############################################################
    ####### 2. SageMaker Distributed Data Parallel        ######
    #######    - Get the total number of GPUs and ranks   ######
    ############################################################
    args.world_size = smdp.get_world_size()    # total number of GPUs across all hosts
    args.rank = smdp.get_rank()                # global rank across all hosts
    args.local_rank = smdp.get_local_rank()    # rank within the host
    ############################################################

    train(args)