def make_parser():
    parser = ArgumentParser(description="Train Single Shot MultiBox Detector on COCO")
    parser.add_argument('--data', '-d', type=str, default='/coco', required=True,
                        help='path to test and training data files')
    parser.add_argument('--epochs', '-e', type=int, default=65,
                        help='number of epochs for training')
    parser.add_argument('--batch-size', '--bs', type=int, default=32,
                        help='number of examples for each iteration')
    parser.add_argument('--eval-batch-size', '--ebs', type=int, default=32,
                        help='number of examples for each evaluation iteration')
    parser.add_argument('--no-cuda', action='store_true',
                        help='do not use CUDA even when GPUs are available')
    parser.add_argument('--seed', '-s', type=int,
                        help='manually set random seed for torch')
    parser.add_argument('--checkpoint', type=str, default=None,
                        help='path to model checkpoint file')
    parser.add_argument('--save', type=str, default=None,
                        help='save model checkpoints in the specified directory')
    parser.add_argument('--mode', type=str, default='training',
                        choices=['training', 'evaluation', 'benchmark-training', 'benchmark-inference'])
    parser.add_argument('--evaluation', nargs='*', type=int,
                        default=[21, 31, 37, 42, 48, 53, 59, 64],
                        help='epochs at which to evaluate')
    parser.add_argument('--multistep', nargs='*', type=int, default=[43, 54],
                        help='epochs at which to decay learning rate')

    # Hyperparameters
    parser.add_argument('--learning-rate', '--lr', type=float, default=2.6e-3,
                        help='learning rate')
    parser.add_argument('--momentum', '-m', type=float, default=0.9,
                        help='momentum argument for SGD optimizer')
    parser.add_argument('--weight-decay', '--wd', type=float, default=0.0005,
                        help='weight decay argument for SGD optimizer')
    parser.add_argument('--warmup', type=int, default=None)
    parser.add_argument('--benchmark-iterations', type=int, default=20, metavar='N',
                        help='Run N iterations while benchmarking (ignored when training or evaluating)')
    parser.add_argument('--benchmark-warmup', type=int, default=20, metavar='N',
                        help='Number of warmup iterations for benchmarking')
    parser.add_argument('--backbone', type=str, default='resnet50',
                        choices=['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'])
    parser.add_argument('--backbone-path', type=str, default=None,
                        help='Path to checkpointed backbone. It should match the'
                             ' backbone model declared with the --backbone argument.'
                             ' When it is not provided, a pretrained model from torchvision'
                             ' will be downloaded.')
    parser.add_argument('--num-workers', type=int, default=4)
    parser.add_argument('--amp', action='store_true',
                        help='Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUs.')
    parser.add_argument('--json-summary', type=str, default=None,
                        help='If provided, the json summary will be written to'
                             ' the specified file.')

    # Distributed
    parser.add_argument('--local_rank', default=herring.get_local_rank(), type=int,
                        help='Used for multi-process training. Can either be manually set '
                             'or automatically set by using \'python -m multiproc\'.')

    return parser
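
# A minimal sketch of how the parser above might be consumed, assuming a hypothetical
# entry point (`run`/`train_loop` are illustrative names, not part of the original script).
import torch


def run():
    parser = make_parser()
    args = parser.parse_args()

    if args.seed is not None:
        # seed torch on every process so weight initialization is reproducible
        torch.manual_seed(args.seed)

    if not args.no_cuda and torch.cuda.is_available():
        # pin this process to the GPU matching its local rank
        torch.cuda.set_device(args.local_rank)

    # train_loop(args)  # hypothetical: dispatch on args.mode here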
def _setup_devices(self) -> "torch.device":
    logger.info("PyTorch: setting up devices")
    if self.no_cuda:
        device = torch.device("cpu")
        self._n_gpu = 0
    elif is_torch_tpu_available():
        device = xm.xla_device()
        self._n_gpu = 0
    elif is_sagemaker_mp_enabled():
        local_rank = smp.local_rank()
        device = torch.device("cuda", local_rank)
        self._n_gpu = 1
    elif is_sagemaker_dp_enabled():
        sm_dist.init_process_group()
        self.local_rank = sm_dist.get_local_rank()
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1
    elif self.deepspeed:
        # deepspeed performs its own DDP internally, and requires the program to be started with:
        # deepspeed ./program.py
        # rather than:
        # python -m torch.distributed.launch --nproc_per_node=2 ./program.py
        from .integrations import is_deepspeed_available

        if not is_deepspeed_available():
            raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
        import deepspeed

        deepspeed.init_distributed()

        # workaround for setups like notebooks where the launcher can't be used,
        # but deepspeed requires a dist env.
        # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
        self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))

        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1
    elif self.local_rank == -1:
        # if n_gpu is > 1 we'll use nn.DataParallel.
        # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
        # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
        # trigger an error that a device index is missing. Index 0 takes into account the
        # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
        # will use the first GPU in that env, i.e. GPU#1
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
        # the default value.
        self._n_gpu = torch.cuda.device_count()
    else:
        # Here, we'll use torch.distributed.
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1

    if device.type == "cuda":
        torch.cuda.set_device(device)

    return device
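
# A minimal usage sketch, assuming `_setup_devices` backs a `device` property on the
# training-arguments object (as in Hugging Face TrainingArguments); `MyModel` and
# `place_model` are hypothetical names used only for illustration.
import torch.nn as nn


class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(16, 2)

    def forward(self, x):
        return self.linear(x)


def place_model(training_args):
    # `training_args.device` is assumed to resolve to the torch.device chosen above
    device = training_args.device
    model = MyModel().to(device)
    return model, device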
def dist_setting(args):
    # args.data_parallel = False
    print("args.data_parallel : {}".format(args.data_parallel))
    print("args.model_parallel : {}".format(args.model_parallel))
    print("args.apex : {}".format(args.apex))

    args.world_size = 1
    args.host_num = args.hosts.index(args.current_host)

    if args.data_parallel:
        args.world_size = sdp.get_world_size()
        args.rank = sdp.get_rank()              # total rank across all hosts
        args.local_rank = sdp.get_local_rank()  # rank within this host
    elif args.model_parallel:
        args.world_size = smp.size()
        args.local_rank = smp.local_rank()      # rank within this host
        args.rank = smp.rank()
        args.dp_size = smp.dp_size()
        args.dp_rank = smp.dp_rank()
        print(
            "smp.rank() : {}, smp.size() : {}, smp.mp_rank() : {}, "
            "smp.local_size() : {}, smp.get_mp_group() : {}, smp.get_dp_group() : {}, "
            "smp.local_rank() : {}, smp.dp_size() : {}, smp.dp_rank() : {}".format(
                smp.rank(), smp.size(), smp.mp_rank(), smp.local_size(),
                smp.get_mp_group(), smp.get_dp_group(), smp.local_rank(),
                smp.dp_size(), smp.dp_rank()))
    else:
        args.world_size = len(args.hosts) * args.num_gpus
        if args.local_rank is not None:
            args.rank = args.num_gpus * args.host_num + \
                args.local_rank  # total rank across all hosts
        dist.init_process_group(backend=args.backend,
                                rank=args.rank,
                                world_size=args.world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
                dist.get_rank(), args.num_gpus))

    print("**** [dist_setting] args.rank : {}".format(args.rank))
    print("args.world_size : {}".format(args.world_size))
    print("Use GPU: {} for training".format(args.local_rank))

    args.lr = args.lr * float(args.world_size)
    args.batch_size //= args.world_size // args.num_gpus
    args.batch_size = max(args.batch_size, 1)

    return args
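
# A standalone illustration of the rank arithmetic used in the plain torch.distributed
# branch above: the global rank is derived from the host index and the per-host local
# rank. The host names and GPU counts below are made-up values, not from a real job.
hosts = ["algo-1", "algo-2"]
current_host = "algo-2"
num_gpus = 8       # GPUs per host
local_rank = 3     # rank of this process within its host

host_num = hosts.index(current_host)        # 1
world_size = len(hosts) * num_gpus          # 16
rank = num_gpus * host_num + local_rank     # 11

print(f"world_size={world_size}, global rank={rank}")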
def _setup_devices(self) -> "torch.device":
    logger.info("PyTorch: setting up devices")
    if self.no_cuda:
        device = torch.device("cpu")
        self._n_gpu = 0
    elif is_smdistributed_available() and self.mp_parameters != "":
        # smp.init()
        local_rank = smp.local_rank()
        device = torch.device("cuda", local_rank)
        self._n_gpu = 1
    elif is_sagemaker_distributed_available():
        import smdistributed.dataparallel.torch.distributed as dist

        dist.init_process_group()
        self.local_rank = dist.get_local_rank()
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1
    elif self.local_rank == -1:
        # if n_gpu is > 1 we'll use nn.DataParallel.
        # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
        # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
        # trigger an error that a device index is missing. Index 0 takes into account the
        # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
        # will use the first GPU in that env, i.e. GPU#1
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
        # the default value.
        self._n_gpu = torch.cuda.device_count()
    else:
        # Here, we'll use torch.distributed.
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1

    if device.type == "cuda":
        torch.cuda.set_device(device)

    return device
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--verbose', action='store_true', default=False,
                        help='For displaying SM Data Parallel-specific logs')
    parser.add_argument('--data-path', type=str, default='/tmp/data',
                        help='Path for downloading the MNIST dataset')

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print('Hello from rank', rank, 'of local_rank', local_rank,
              'in world size of', args.world_size)

    if not torch.cuda.is_available():
        raise Exception(
            "Must run SM Distributed DataParallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)
    device = torch.device("cuda")

    if local_rank == 0:
        train_dataset = datasets.MNIST(data_path, train=True, download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))
    else:
        time.sleep(8)
        train_dataset = datasets.MNIST(data_path, train=True, download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(data_path, train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.test_batch_size, shuffle=True)

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
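
# main() above delegates to a train() helper; this is a minimal sketch of what such a
# helper commonly looks like for the MNIST example, not the original implementation.
import torch.nn.functional as F


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)   # assumes Net() ends in log_softmax
        loss.backward()                     # gradients are all-reduced by the DDP wrapper
        optimizer.step()
        if batch_idx % args.log_interval == 0 and args.rank == 0:
            print(f"epoch {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)}] "
                  f"loss {loss.item():.4f}")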
                        default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--model_dir', type=str,
                        default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--num_gpus', type=int,
                        default=os.environ['SM_NUM_GPUS'])

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    # parse arguments
    args = parser_args()

    args.world_size = dist.get_world_size()
    args.rank = dist.get_rank()
    args.local_rank = dist.get_local_rank()
    # print(f"rank={args.rank}, local_rank={args.local_rank}")

    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)

    args.use_cuda = args.num_gpus > 0
    print("args.use_cuda : {} , args.num_gpus : {}".format(
        args.use_cuda, args.num_gpus))
    args.device = torch.device("cuda" if args.use_cuda else "cpu")

    train_model(args)
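
# The batch-size line above divides the global batch size by `world_size // 8`, i.e. by
# the number of nodes under the assumption of 8 GPUs per node. A small illustration with
# made-up numbers:
global_batch_size = 256
world_size = 16                                            # e.g. 2 nodes x 8 GPUs
nodes = world_size // 8                                    # 2
per_node_batch_size = max(global_batch_size // nodes, 1)   # 128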
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=14, metavar="N",
                        help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M",
                        help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=10, metavar="N",
                        help="how many batches to wait before logging training status")
    parser.add_argument("--save-model", action="store_true", default=False,
                        help="For Saving the current Model")
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="For displaying smdistributed.dataparallel-specific logs")
    parser.add_argument("--data-path", type=str, default="/tmp/data",
                        help="Path for downloading the MNIST dataset")

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print("Hello from rank", rank, "of local_rank", local_rank,
              "in world size of", args.world_size)

    if not torch.cuda.is_available():
        raise CUDANotFoundException(
            "Must run smdistributed.dataparallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)
    device = torch.device("cuda")

    # select a single rank per node to download data
    is_first_local_rank = local_rank == 0
    if is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ]),
        )
    dist.barrier()  # prevent other ranks from accessing the data early
    if not is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ]),
        )

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        sampler=train_sampler,
    )
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                data_path,
                train=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))
                ]),
            ),
            batch_size=args.test_batch_size,
            shuffle=True,
        )

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
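
# A minimal sketch of the rank-0 test() helper that main() above calls; the original
# implementation is not shown in this section, so treat this as an approximation.
import torch
import torch.nn.functional as F


def test(model, device, test_loader):
    model.eval()
    test_loss, correct = 0.0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction="sum").item()
            correct += output.argmax(dim=1).eq(target).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100.0 * correct / len(test_loader.dataset)
    print(f"Test loss {test_loss:.4f}, accuracy {accuracy:.2f}%")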
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=dist.get_local_rank())
    parser.add_argument(
        "--seed", help="manually set random seed for torch", type=int, default=99
    )
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument(
        "--bucket-cap-mb",
        dest="bucket_cap_mb",
        help="specify bucket size for SMDataParallel",
        default=25,
        type=int,
    )
    parser.add_argument(
        "--data-dir",
        dest="data_dir",
        help="Absolute path of dataset",
        type=str,
        default=None,
    )
    parser.add_argument("--dtype", dest="dtype")
    parser.add_argument("--spot_ckpt", default=None)
    args = parser.parse_args()

    keys = list(os.environ.keys())
    args.data_dir = os.environ['SM_CHANNEL_TRAIN'] if 'SM_CHANNEL_TRAIN' in keys else args.data_dir
    print("dataset dir: ", args.data_dir)

    # Set seed to reduce randomness
    random.seed(args.seed + dist.get_local_rank())
    np.random.seed(args.seed + dist.get_local_rank())
    torch.manual_seed(args.seed + dist.get_local_rank())
    torch.cuda.manual_seed(args.seed + dist.get_local_rank())

    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    num_gpus = dist.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        # SMDataParallel: Pin each GPU to a single SMDataParallel process.
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(
        #     backend="nccl", init_method="env://"
        # )
        # synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.DTYPE = args.dtype

    # grab the checkpoint file to start from (only when one was passed, since
    # --spot_ckpt defaults to None)
    if args.spot_ckpt:
        ckpt_name = args.spot_ckpt.split('/')[-1]
        os.system(f"aws s3 cp {args.spot_ckpt} /opt/ml/checkpoints/{ckpt_name}")
        cfg.MODEL.WEIGHT = f"/opt/ml/checkpoints/{ckpt_name}"

    cfg.freeze()
    print("CONFIG")
    print(cfg)

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, dist.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args)
def train(cfg, args):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if use_amp:
        # Initialize mixed-precision training
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if args.distributed:
        # if use_apex_ddp:
        #     model = DDP(model, delay_allreduce=True)
        # else:
        # SMDataParallel: Wrap the PyTorch model with SMDataParallel's DDP
        model = DDP(model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)
        # model = DDP(model)

    print("model parameter size: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    # SMDataParallel: Save model on master node.
    save_to_disk = dist.get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
        data_dir=args.data_dir,
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=args.distributed,
            min_bbox_map=cfg.MIN_BBOX_MAP,
            min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        use_amp,
        cfg,
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    return model
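
# make_optimizer/make_lr_scheduler above come from the maskrcnn_benchmark solver; this is
# only a rough sketch of the behaviour they are typically configured for (plain SGD plus
# multi-step LR decay). The hyperparameter values and function name are placeholders, not
# the library's actual config keys.
import torch


def simple_optimizer_and_scheduler(model, base_lr=0.02, momentum=0.9,
                                   weight_decay=1e-4, milestones=(60000, 80000)):
    optimizer = torch.optim.SGD(model.parameters(), lr=base_lr,
                                momentum=momentum, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=list(milestones),
                                                     gamma=0.1)
    return optimizer, scheduler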
def parse_arguments():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain .hdf5 files for the task.")
    parser.add_argument("--config_file", default=None, type=str, required=True,
                        help="The BERT model config")
    parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="The initial checkpoint to start training from.")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--max_predictions_per_seq", default=80, type=int,
                        help="The maximum total number of masked tokens in the input sequence")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=1000, type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion", default=0.01, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--local_rank", type=int, default=herring.get_local_rank(),
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Mixed precision training")
    parser.add_argument('--amp', default=False, action='store_true',
                        help="Mixed precision training")
    parser.add_argument('--loss_scale', type=float, default=0.0,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument('--log_freq', type=float, default=1.0,
                        help='frequency of logging loss.')
    parser.add_argument('--checkpoint_activations', default=False, action='store_true',
                        help="Whether to use gradient checkpointing")
    parser.add_argument("--resume_from_checkpoint", default=False, action='store_true',
                        help="Whether to resume training from checkpoint.")
    parser.add_argument('--resume_step', type=int, default=-1,
                        help="Step to resume training from.")
    parser.add_argument('--num_steps_per_checkpoint', type=int, default=100,
                        help="Number of update steps until a model checkpoint is saved to disk.")
    parser.add_argument('--skip_checkpoint', default=False, action='store_true',
                        help="Whether to skip saving checkpoints")
    parser.add_argument('--phase2', default=False, action='store_true',
                        help="Whether to train with seq len 512")
    parser.add_argument('--allreduce_post_accumulation', default=False, action='store_true',
                        help="Whether to do allreduces during gradient accumulation steps.")
    parser.add_argument('--allreduce_post_accumulation_fp16', default=False, action='store_true',
                        help="Whether to do fp16 allreduce post accumulation.")
    parser.add_argument('--phase1_end_step', type=int, default=7038,
                        help="Number of training steps in Phase1 - seq len 128")
    parser.add_argument('--init_loss_scale', type=int, default=2**20,
                        help="Initial loss scaler value")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
                        help='If provided, the json summary will be written to'
                             ' the specified file.')
    parser.add_argument("--use_env", action='store_true',
                        help="Whether to read local rank from ENVVAR")
    parser.add_argument('--disable_progress_bar', default=False, action='store_true',
                        help='Disable tqdm progress bar')
    parser.add_argument('--steps_this_run', type=int, default=-1,
                        help='If provided, only run this many steps before exiting')
    parser.add_argument('--bucket_cap_mb', type=int, default=25,
                        help='Gradient AllReduce bucket size in MB for the distributed data parallel backend')

    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp

    if args.steps_this_run < 0:
        args.steps_this_run = args.max_steps

    return args
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=herring.get_local_rank())
    parser.add_argument("--seed", help="manually set random seed for torch",
                        type=int, default=99)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument(
        "--bucket-cap-mb",
        dest="bucket_cap_mb",
        help="specify bucket size for herring",
        default=25,
        type=int,
    )
    parser.add_argument("--data-dir", dest="data_dir",
                        help="Absolute path of dataset", type=str, default=None)
    args = parser.parse_args()

    # Set seed to reduce randomness
    random.seed(args.seed + herring.get_local_rank())
    np.random.seed(args.seed + herring.get_local_rank())
    torch.manual_seed(args.seed + herring.get_local_rank())
    torch.cuda.manual_seed(args.seed + herring.get_local_rank())

    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(
        #     backend="nccl", init_method="env://"
        # )
        # synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, herring.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args)
# SageMaker Container environment
parser.add_argument('--model-dir', type=str, default='../model')
parser.add_argument('--data-dir', type=str, default='../data')
args = parser.parse_args()

try:
    args.model_dir = os.environ['SM_MODEL_DIR']
    args.data_dir = os.environ['SM_CHANNEL_TRAINING']
except KeyError as e:
    print("The model starts training on the local host without a SageMaker TrainingJob.")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)
    pass

########################################################
#######  2. SageMaker Distributed Data Parallel  #######
#######  - Get the world size and rank numbers   #######
########################################################
args.world_size = smdp.get_world_size()    # total number of GPUs across all hosts
args.rank = smdp.get_rank()                # global rank across all hosts
args.local_rank = smdp.get_local_rank()    # rank within this host
########################################################

train(args)
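
# After the rank bookkeeping above, each process is normally pinned to one GPU and the
# model is wrapped for data parallelism. This is a minimal sketch assuming the
# smdistributed.dataparallel PyTorch DDP wrapper; Net() and wrap_model_for_smdp are
# placeholder names used only for illustration.
import torch
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP


def wrap_model_for_smdp(args, net):
    torch.cuda.set_device(args.local_rank)          # one GPU per process
    device = torch.device("cuda", args.local_rank)
    model = DDP(net.to(device))                     # gradients all-reduced across args.world_size ranks
    return model, device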