def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
    warnings.warn(
        "SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
        FutureWarning,
    )
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    num_samples = len(self.dataset)
    # Add extra samples to make num_samples a multiple of batch_size if passed
    if batch_size is not None:
        self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
    else:
        self.num_samples = int(math.ceil(num_samples / num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.batch_size = batch_size
def train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader,
               encoder, iteration, logger, args, mean, std):
    # for nbatch, (img, _, img_size, bbox, label) in enumerate(train_dataloader):
    for nbatch, data in enumerate(train_dataloader):
        img = data[0][0][0]
        bbox = data[0][1][0]
        label = data[0][2][0]
        label = label.type(torch.cuda.LongTensor)
        bbox_offsets = data[0][3][0]

        # handle random flipping outside of DALI for now
        bbox_offsets = bbox_offsets.cuda()
        img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
        img.sub_(mean).div_(std)

        if not args.no_cuda:
            img = img.cuda()
            bbox = bbox.cuda()
            label = label.cuda()
            bbox_offsets = bbox_offsets.cuda()

        N = img.shape[0]
        if bbox_offsets[-1].item() == 0:
            print("No labels in batch")
            continue

        bbox, label = C.box_encoder(N, bbox, bbox_offsets, label,
                                    encoder.dboxes.cuda(), 0.5)
        # output is ([N*8732, 4], [N*8732]); need [N, 8732, 4], [N, 8732] respectively
        M = bbox.shape[0] // N
        bbox = bbox.view(N, M, 4)
        label = label.view(N, M)

        ploc, plabel = model(img)
        ploc, plabel = ploc.float(), plabel.float()

        trans_bbox = bbox.transpose(1, 2).contiguous().cuda()

        if not args.no_cuda:
            label = label.cuda()
        gloc = Variable(trans_bbox, requires_grad=False)
        glabel = Variable(label, requires_grad=False)

        loss = loss_func(ploc, plabel, gloc, glabel)

        if herring.get_rank() == 0:
            logger.update_iter(epoch, iteration, loss.item())

        if args.amp:
            with amp.scale_loss(loss, optim) as scale_loss:
                scale_loss.backward()
        else:
            loss.backward()

        if args.warmup is not None:
            warmup(optim, args.warmup, iteration, args.learning_rate)

        optim.step()
        optim.zero_grad()
        iteration += 1

    return iteration
def get_rank():
    """
    Gets distributed rank or returns zero if distributed is not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
    else:
        rank = 0
    return rank
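# Usage sketch (hypothetical, not part of the source): the helper above is the
# usual guard for rank-0-only side effects such as logging or checkpointing.
# `save_checkpoint_if_main` and its default path are illustrative names.
import torch

def save_checkpoint_if_main(model, path="checkpoint.pt"):
    # Only the main process writes to disk; all other ranks skip the save.
    if get_rank() == 0:
        torch.save(model.state_dict(), path)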
def process_index(self):
    """
    The index of the current process used.
    """
    if is_torch_tpu_available():
        return xm.get_ordinal()
    elif is_sagemaker_distributed_available():
        return sm_dist.get_rank()
    elif self.local_rank != -1:
        return torch.distributed.get_rank()
    return 0
def __init__(
    self,
    batch_size: int,
    dataset: Optional[Dataset] = None,
    num_replicas: Optional[int] = None,
    rank: Optional[int] = None,
    seed: int = 0,
    drop_last: bool = False,
    lengths: Optional[List[int]] = None,
    model_input_name: Optional[str] = None,
):
    if dataset is None and lengths is None:
        raise ValueError("One of dataset and lengths must be provided.")
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()

    self.batch_size = batch_size
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.drop_last = drop_last

    if lengths is None:
        model_input_name = model_input_name if model_input_name is not None else "input_ids"
        if (
            not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
            or model_input_name not in dataset[0]
        ):
            raise ValueError(
                "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                f"'{model_input_name}' key."
            )
        lengths = [len(feature[model_input_name]) for feature in dataset]
    self.lengths = lengths

    # If the dataset length is evenly divisible by the number of replicas, then there
    # is no need to drop any data, since the dataset will be split equally.
    if self.drop_last and len(self.lengths) % self.num_replicas != 0:
        # Split to the nearest available length that is evenly divisible.
        # This is to ensure each rank receives the same amount of data when
        # using this sampler.
        self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
    else:
        self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
    self.total_size = self.num_samples * self.num_replicas
    self.seed = seed
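# Usage sketch, assuming the __init__ above belongs to transformers'
# DistributedLengthGroupedSampler (only the constructor is shown here);
# train_dataset, world_size, rank, and num_epochs are placeholder names.
from torch.utils.data import DataLoader

sampler = DistributedLengthGroupedSampler(
    batch_size=32, dataset=train_dataset, num_replicas=world_size, rank=rank)
loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)
for epoch in range(num_epochs):
    sampler.epoch = epoch  # combined with self.seed so every replica reshuffles consistently
    for batch in loader:
        ...  # training step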
def local_process_index(self):
    """
    The index of the local process used.
    """
    if is_torch_tpu_available():
        return xm.get_local_ordinal()
    elif is_sagemaker_mp_enabled():
        return smp.local_rank()
    elif is_sagemaker_dp_enabled():
        return sm_dist.get_rank()
    elif self.local_rank != -1:
        return self.local_rank
    return 0
def process_index(self):
    """
    The index of the current process used.
    """
    if is_torch_tpu_available():
        return xm.get_ordinal()
    elif is_sagemaker_mp_enabled():
        return smp.dp_rank()
    elif is_sagemaker_dp_enabled():
        return sm_dist.get_rank()
    elif self.local_rank != -1:
        return torch.distributed.get_rank()
    return 0
def __init__(self, dataset, num_replicas=None, rank=None):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
def dist_setting(args):
    # args.data_parallel = False
    print("args.data_parallel : {}".format(args.data_parallel))
    print("args.model_parallel : {}".format(args.model_parallel))
    print("args.apex : {}".format(args.apex))

    args.world_size = 1
    args.host_num = args.hosts.index(args.current_host)

    if args.data_parallel:
        args.world_size = sdp.get_world_size()
        args.rank = sdp.get_rank()              # global rank across all hosts
        args.local_rank = sdp.get_local_rank()  # rank within the host
    elif args.model_parallel:
        args.world_size = smp.size()
        args.local_rank = smp.local_rank()      # rank within the host
        args.rank = smp.rank()
        args.dp_size = smp.dp_size()
        args.dp_rank = smp.dp_rank()
        print(
            "smp.rank() : {}, smp.size() : {}, smp.mp_rank() : {}, smp.local_size() : {}, "
            "smp.get_mp_group() : {}, smp.get_dp_group() : {}, smp.local_rank() : {}, "
            "smp.dp_size() : {}, smp.dp_rank() : {}".format(
                smp.rank(), smp.size(), smp.mp_rank(), smp.local_size(),
                smp.get_mp_group(), smp.get_dp_group(), smp.local_rank(),
                smp.dp_size(), smp.dp_rank()))
    else:
        args.world_size = len(args.hosts) * args.num_gpus
        if args.local_rank is not None:
            # global rank across all hosts
            args.rank = args.num_gpus * args.host_num + args.local_rank
        dist.init_process_group(backend=args.backend,
                                rank=args.rank,
                                world_size=args.world_size)
        logger.info(
            "Initialized the distributed environment: '{}' backend on {} nodes. ".format(
                args.backend, dist.get_world_size())
            + "Current host rank is {}. Number of gpus: {}".format(
                dist.get_rank(), args.num_gpus))

    print("**** [dist_setting] args.rank : {}".format(args.rank))
    print("args.world_size : {}".format(args.world_size))
    print("Use GPU: {} for training".format(args.local_rank))

    # Scale the learning rate linearly with the world size, and divide the
    # configured batch size by the number of hosts (world_size // num_gpus).
    args.lr = args.lr * float(args.world_size)
    args.batch_size //= args.world_size // args.num_gpus
    args.batch_size = max(args.batch_size, 1)

    return args
def get_distributed_worker():
    """Get the rank for horovod or torch distributed. If none of them are being used, return None."""
    rank = None
    try:
        import torch.distributed as dist
    except (ImportError, ModuleNotFoundError):
        dist = None

    if dist and hasattr(dist, "is_initialized") and dist.is_initialized():
        rank = dist.get_rank()
    else:
        try:
            import horovod.torch as hvd

            if hvd.size():
                rank = hvd.rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        try:
            import horovod.tensorflow as hvd

            if hvd.size():
                rank = hvd.rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

    # smdistributed.dataparallel should be invoked via `mpirun`.
    # It supports EC2 machines with 8 GPUs per machine.
    if check_smdataparallel_env():
        try:
            import smdistributed.dataparallel.torch.distributed as smdataparallel

            if smdataparallel.get_world_size():
                return smdataparallel.get_rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        try:
            import smdistributed.dataparallel.tensorflow as smdataparallel

            if smdataparallel.size():
                return smdataparallel.rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

    return rank
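# Usage sketch (hypothetical caller, not part of the source): the probe above
# returns None when no distributed framework is active, so callers commonly
# treat None the same as rank 0.
worker_rank = get_distributed_worker()
if worker_rank is None or worker_rank == 0:
    print("main process (or single-process run): safe to write logs or checkpoints")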
def get_val_dataloader(dataset, args):
    if args.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset,
            num_replicas=herring.get_world_size(),
            rank=herring.get_rank())
    else:
        val_sampler = None

    val_dataloader = DataLoader(
        dataset,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: DistributedSampler shuffles by default :(
        sampler=val_sampler,
        num_workers=args.num_workers)

    return val_dataloader
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    num_samples = len(self.dataset)
    # Add extra samples to make num_samples a multiple of batch_size if passed
    if batch_size is not None:
        self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
    else:
        self.num_samples = int(math.ceil(num_samples / num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.batch_size = batch_size
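# Illustrative sketch of why total_size is padded above, assuming the __init__
# belongs to a sequential distributed eval sampler (as in transformers): each rank
# yields exactly num_samples contiguous indices, so gathering per-rank predictions
# in rank order reconstructs dataset order, and the padded tail is sliced off.
# `gather_eval_predictions` is a hypothetical helper, not part of the source.
import torch
import torch.distributed as dist

def gather_eval_predictions(sampler, local_preds):
    # local_preds: a [sampler.num_samples, ...] tensor produced by this rank
    buffers = [torch.zeros_like(local_preds) for _ in range(sampler.num_replicas)]
    dist.all_gather(buffers, local_preds)
    # Concatenated length equals total_size; drop the padding past the true dataset length
    return torch.cat(buffers, dim=0)[: len(sampler.dataset)]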
def test_and_exchange_map(tester, model, distributed):
    results = tester(model=model, distributed=distributed)

    # main process only
    # if is_main_process():
    if dist.get_rank() == 0:
        # Note: one indirection due to possibility of multiple test datasets, we only care about the first
        # tester returns (parsed results, raw results). In our case, don't care about the latter
        map_results, raw_results = results[0]
        bbox_map = map_results.results["bbox"]['AP']
        segm_map = map_results.results["segm"]['AP']
    else:
        bbox_map = 0.
        segm_map = 0.

    if distributed:
        map_tensor = torch.tensor([bbox_map, segm_map],
                                  dtype=torch.float32,
                                  device=torch.device("cuda"))
        torch.distributed.broadcast(map_tensor, 0)
        bbox_map = map_tensor[0].item()
        segm_map = map_tensor[1].item()

    return bbox_map, segm_map
def __init__(self, batch_size, device_id, file_root, annotations_file, num_gpus,
             output_fp16=False, output_nhwc=False, pad_output=False,
             num_threads=1, seed=15):
    super(COCOPipeline, self).__init__(batch_size=batch_size,
                                       device_id=device_id,
                                       num_threads=num_threads,
                                       seed=seed)

    # if torch.distributed.is_initialized():
    #     shard_id = torch.distributed.get_rank()
    # else:
    #     shard_id = 0
    shard_id = herring.get_rank()

    self.input = ops.COCOReader(file_root=file_root,
                                annotations_file=annotations_file,
                                shard_id=shard_id,
                                num_shards=num_gpus,
                                ratio=True,
                                ltrb=True,
                                random_shuffle=True,
                                skip_empty=True)
    self.decode = ops.ImageDecoder(device="cpu", output_type=types.RGB)

    # Augmentation techniques
    self.crop = ops.SSDRandomCrop(device="cpu", num_attempts=1)
    self.twist = ops.ColorTwist(device="gpu")
    self.resize = ops.Resize(device="gpu", resize_x=300, resize_y=300)

    output_dtype = types.FLOAT16 if output_fp16 else types.FLOAT
    output_layout = types.NHWC if output_nhwc else types.NCHW

    self.normalize = ops.CropMirrorNormalize(device="gpu",
                                             crop=(300, 300),
                                             mean=[0.0, 0.0, 0.0],
                                             std=[255.0, 255.0, 255.0],
                                             mirror=0,
                                             output_dtype=output_dtype,
                                             output_layout=output_layout,
                                             pad_output=pad_output)

    # Random variables
    self.rng1 = ops.Uniform(range=[0.5, 1.5])
    self.rng2 = ops.Uniform(range=[0.875, 1.125])
    self.rng3 = ops.Uniform(range=[-0.5, 0.5])
# SageMaker Container environment
parser.add_argument('--model-dir', type=str, default='../model')
parser.add_argument('--data-dir', type=str, default='../data')

args = parser.parse_args()

try:
    args.model_dir = os.environ['SM_MODEL_DIR']
    args.data_dir = os.environ['SM_CHANNEL_TRAINING']
except KeyError as e:
    print("The model starts training on the local host without a SageMaker TrainingJob.")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

########################################################
#######  2. SageMaker Distributed Data Parallel  #######
#######  - Get the number of GPUs and rank info  #######
########################################################
args.world_size = smdp.get_world_size()  # total number of GPUs
args.rank = smdp.get_rank()              # global rank across all hosts
args.local_rank = smdp.get_local_rank()  # rank within the host
########################################################

train(args)
def get_rank():
    # if not dist.is_available():
    #     return 0
    # if not dist.is_initialized():
    #     return 0
    return herring.get_rank()
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--verbose', action='store_true', default=False,
                        help='For displaying SM Data Parallel-specific logs')
    parser.add_argument('--data-path', type=str, default='/tmp/data',
                        help='Path for downloading the MNIST dataset')

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    # Divide the global batch size by the number of hosts (assumes 8 GPUs per host)
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print('Hello from rank', rank, 'of local_rank', local_rank,
              'in world size of', args.world_size)

    if not torch.cuda.is_available():
        raise Exception(
            "Must run SM Distributed DataParallel MNIST example on CUDA-capable devices.")

    torch.manual_seed(args.seed)
    device = torch.device("cuda")

    # Download the dataset on local rank 0 only; other ranks wait, then load from disk
    if local_rank == 0:
        train_dataset = datasets.MNIST(data_path, train=True, download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))
    else:
        time.sleep(8)
        train_dataset = datasets.MNIST(data_path, train=True, download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(data_path, train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.test_batch_size,
            shuffle=True)

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
                        default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--model_dir', type=str,
                        default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--num_gpus', type=int,
                        default=os.environ['SM_NUM_GPUS'])

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    # parse arguments
    args = parser_args()
    args.world_size = dist.get_world_size()
    args.rank = dist.get_rank()
    args.local_rank = dist.get_local_rank()
    # print(f"rank={args.rank}, local_rank={args.local_rank}")

    # Divide the global batch size by the number of hosts (assumes 8 GPUs per host)
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)

    args.use_cuda = args.num_gpus > 0
    print("args.use_cuda : {} , args.num_gpus : {}".format(args.use_cuda, args.num_gpus))
    args.device = torch.device("cuda" if args.use_cuda else "cpu")

    train_model(args)
def _get_data_loader(imgs, trn_df, vld_df):
    import albumentations as A
    from albumentations import (
        Rotate, HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE,
        RandomRotate90, Transpose, Blur, OpticalDistortion, GridDistortion,
        HueSaturationValue, IAAAdditiveGaussianNoise, GaussNoise, MotionBlur,
        MedianBlur, RandomBrightnessContrast, IAAPiecewiseAffine, IAASharpen,
        IAAEmboss, Flip, OneOf, Compose)
    from albumentations.pytorch import ToTensor, ToTensorV2

    train_transforms = A.Compose([
        Rotate(20),
        OneOf([
            IAAAdditiveGaussianNoise(),
            GaussNoise(),
        ], p=0.2),
        OneOf([
            MotionBlur(p=.2),
            MedianBlur(blur_limit=3, p=0.1),
            Blur(blur_limit=3, p=0.1),
        ], p=0.2),
        ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.2),
        OneOf([
            OpticalDistortion(p=0.3),
            GridDistortion(p=.1),
            IAAPiecewiseAffine(p=0.3),
        ], p=0.2),
        OneOf([
            CLAHE(clip_limit=2),
            IAASharpen(),
            IAAEmboss(),
            RandomBrightnessContrast(),
        ], p=0.3),
        HueSaturationValue(p=0.3),
        ToTensor()
    ], p=1.0)

    valid_transforms = A.Compose([ToTensor()])

    from torch.utils.data import Dataset, DataLoader
    trn_dataset = BangaliDataset(imgs=imgs, label_df=trn_df, transform=train_transforms)
    vld_dataset = BangaliDataset(imgs=imgs, label_df=vld_df, transform=valid_transforms)

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    trn_sampler = torch.utils.data.distributed.DistributedSampler(
        trn_dataset,
        num_replicas=world_size,  # shard the dataset across world_size replicas
        rank=rank)
    trn_loader = DataLoader(trn_dataset,
                            shuffle=False,
                            num_workers=8,
                            pin_memory=True,
                            batch_size=BATCH_SIZE,
                            sampler=trn_sampler)
    vld_loader = DataLoader(vld_dataset,
                            shuffle=False,
                            num_workers=NUM_WORKERS,
                            batch_size=BATCH_SIZE)
    return trn_loader, vld_loader
def train(cfg, args):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if use_amp:
        # Initialize mixed-precision training
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if args.distributed:
        # if use_apex_ddp:
        #     model = DDP(model, delay_allreduce=True)
        # else:
        # SMDataParallel: Wrap the PyTorch model with SMDataParallel's DDP
        model = DDP(model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)
        # model = DDP(model)

    print("model parameter size: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    # SMDataParallel: Save model on master node.
    save_to_disk = dist.get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
        data_dir=args.data_dir)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=args.distributed,
            min_bbox_map=cfg.MIN_BBOX_MAP,
            min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        use_amp,
        cfg,
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    return model
def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=dist.get_local_rank()) parser.add_argument( "--seed", help="manually set random seed for torch", type=int, default=99 ) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) parser.add_argument( "--bucket-cap-mb", dest="bucket_cap_mb", help="specify bucket size for SMDataParallel", default=25, type=int, ) parser.add_argument( "--data-dir", dest="data_dir", help="Absolute path of dataset ", type=str, default=None ) parser.add_argument( "--dtype", dest="dtype" ) parser.add_argument( "--spot_ckpt", default=None ) args = parser.parse_args() keys = list(os.environ.keys()) args.data_dir = os.environ['SM_CHANNEL_TRAIN'] if 'SM_CHANNEL_TRAIN' in keys else args.data_dir print("dataset dir: ", args.data_dir) # Set seed to reduce randomness random.seed(args.seed + dist.get_local_rank()) np.random.seed(args.seed + dist.get_local_rank()) torch.manual_seed(args.seed + dist.get_local_rank()) torch.cuda.manual_seed(args.seed + dist.get_local_rank()) # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 num_gpus = dist.get_world_size() args.distributed = num_gpus > 1 if args.distributed: # SMDataParallel: Pin each GPU to a single SMDataParallel process. torch.cuda.set_device(args.local_rank) # torch.distributed.init_process_group( # backend="nccl", init_method="env://" # ) #synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.DTYPE=args.dtype # grab checkpoint file to start from os.system(f"aws s3 cp {args.spot_ckpt} /opt/ml/checkpoints/{args.spot_ckpt.split('/')[-1]}") cfg.MODEL.WEIGHT = f"/opt/ml/checkpoints/{args.spot_ckpt.split('/')[-1]}" cfg.freeze() print ("CONFIG") print (cfg) output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) logger = setup_logger("maskrcnn_benchmark", output_dir, dist.get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args) if not args.skip_test: if not cfg.PER_EPOCH_EVAL: test_model(cfg, model, args)
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=herring.get_local_rank()) parser.add_argument("--seed", help="manually set random seed for torch", type=int, default=99) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) parser.add_argument( "--bucket-cap-mb", dest="bucket_cap_mb", help="specify bucket size for herring", default=25, type=int, ) parser.add_argument("--data-dir", dest="data_dir", help="Absolute path of dataset ", type=str, default=None) args = parser.parse_args() # Set seed to reduce randomness random.seed(args.seed + herring.get_local_rank()) np.random.seed(args.seed + herring.get_local_rank()) torch.manual_seed(args.seed + herring.get_local_rank()) torch.cuda.manual_seed(args.seed + herring.get_local_rank()) # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 num_gpus = herring.get_world_size() args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) # torch.distributed.init_process_group( # backend="nccl", init_method="env://" # ) #synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) logger = setup_logger("maskrcnn_benchmark", output_dir, herring.get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args) if not args.skip_test: if not cfg.PER_EPOCH_EVAL: test_model(cfg, model, args)
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287

    # Setup multi-GPU if necessary
    args.distributed = False
    # if 'WORLD_SIZE' in os.environ:
    #     args.distributed = int(os.environ['WORLD_SIZE']) > 1
    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(backend='nccl', init_method='env://')
        # args.N_gpu = torch.distributed.get_world_size()
        args.N_gpu = herring.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        # args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
        args.seed = (args.seed + herring.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    # Scale the learning rate linearly with the number of GPUs and the batch size
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300, broadcast_buffers=False)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300,
                            args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.cuda(
                                        torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    train_loader, val_dataloader, encoder,
                                    iteration, logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if herring.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()

    if herring.get_rank() == 0:
        DLLogger.log((), {'Total training time': '%.2f' % total_time + ' secs'})
        logger.log_summary()
def main(): # Training settings parser = argparse.ArgumentParser(description="PyTorch MNIST Example") parser.add_argument( "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)", ) parser.add_argument( "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)", ) parser.add_argument( "--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)", ) parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)") parser.add_argument( "--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)", ) parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") parser.add_argument( "--log-interval", type=int, default=10, metavar="N", help="how many batches to wait before logging training status", ) parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model") parser.add_argument( "--verbose", action="store_true", default=False, help="For displaying smdistributed.dataparallel-specific logs", ) parser.add_argument( "--data-path", type=str, default="/tmp/data", help="Path for downloading " "the MNIST dataset", ) args = parser.parse_args() args.world_size = dist.get_world_size() args.rank = rank = dist.get_rank() args.local_rank = local_rank = dist.get_local_rank() args.lr = 1.0 args.batch_size //= args.world_size // 8 args.batch_size = max(args.batch_size, 1) data_path = args.data_path if args.verbose: print( "Hello from rank", rank, "of local_rank", local_rank, "in world size of", args.world_size, ) if not torch.cuda.is_available(): raise CUDANotFoundException( "Must run smdistributed.dataparallel MNIST example on CUDA-capable devices." ) torch.manual_seed(args.seed) device = torch.device("cuda") # select a single rank per node to download data is_first_local_rank = local_rank == 0 if is_first_local_rank: train_dataset = datasets.MNIST( data_path, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ]), ) dist.barrier() # prevent other ranks from accessing the data early if not is_first_local_rank: train_dataset = datasets.MNIST( data_path, train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ]), ) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=args.world_size, rank=rank) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True, sampler=train_sampler, ) if rank == 0: test_loader = torch.utils.data.DataLoader( datasets.MNIST( data_path, train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ]), ), batch_size=args.test_batch_size, shuffle=True, ) model = DDP(Net().to(device)) torch.cuda.set_device(local_rank) model.cuda(local_rank) optimizer = optim.Adadelta(model.parameters(), lr=args.lr) scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) for epoch in range(1, args.epochs + 1): train(args, model, device, train_loader, optimizer, epoch) if rank == 0: test(model, device, test_loader) scheduler.step() if args.save_model: torch.save(model.state_dict(), "mnist_cnn.pt")