# the JSON file to store the S3 test stats TEST_TIMES_FILE = ".pytorch-test-times.json" # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST SLOW_TEST_THRESHOLD = 300 DISTRIBUTED_TESTS_CONFIG = {} if dist.is_available(): DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"} if not TEST_WITH_ROCM and dist.is_mpi_available(): DISTRIBUTED_TESTS_CONFIG["mpi"] = { "WORLD_SIZE": "3", "TEST_REPORT_SOURCE_OVERRIDE": "dist-mpi", } if dist.is_nccl_available(): DISTRIBUTED_TESTS_CONFIG["nccl"] = { "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3", "TEST_REPORT_SOURCE_OVERRIDE": "dist-nccl", } if dist.is_gloo_available(): DISTRIBUTED_TESTS_CONFIG["gloo"] = { "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3", "TEST_REPORT_SOURCE_OVERRIDE": "dist-gloo", } # https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python SIGNALS_TO_NAMES_DICT = { getattr(signal, n): n for n in dir(signal) if n.startswith("SIG") and "_" not in n }
def main(opt): set_logging(RANK) if RANK in [-1, 0]: print(colorstr('train: ') + ', '.join(f'{k}={v}' for k, v in vars(opt).items())) check_git_status() check_requirements(exclude=['thop']) # Resume wandb_run = check_wandb_resume(opt) if opt.resume and not wandb_run: # resume an interrupted run ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' with open(Path(ckpt).parent.parent / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.safe_load(f)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True # reinstate logger.info('Resuming training from %s' % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) opt.name = 'evolve' if opt.evolve else opt.name opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve)) # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if LOCAL_RANK != -1: from datetime import timedelta assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60)) assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count' assert not opt.image_weights, '--image-weights argument is not compatible with DDP training' # Train if not opt.evolve: train(opt.hyp, opt, device) if WORLD_SIZE > 1 and RANK == 0: _ = [print('Destroying process group... ', end=''), dist.destroy_process_group(), print('Done.')] # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0)} # image mixup (probability) with open(opt.hyp) as f: hyp = yaml.safe_load(f) # load hyps dict assert LOCAL_RANK == -1, 'DDP mode not implemented for --evolve' opt.notest, opt.nosave = True, True # only test/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here if opt.bucket: os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists for _ in range(300): # generations to evolve if Path('evolve.txt').exists(): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt('evolve.txt', ndmin=2) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() # weights if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([x[0] for x in meta.values()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all(v == 1): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device) # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) # Plot results plot_evolution(yaml_file) print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}')
def main(opt, callbacks=Callbacks()): # Checks if RANK in [-1, 0]: print_args(FILE.stem, opt) check_git_status() check_requirements(exclude=['thop']) # Resume if opt.resume and not check_wandb_resume(opt) and not opt.evolve: # resume an interrupted run ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f: opt = argparse.Namespace(**yaml.safe_load(f)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True # reinstate LOGGER.info(f'Resuming training from {ckpt}') else: opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \ check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project) # checks assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' if opt.evolve: opt.project = str(ROOT / 'runs/evolve') opt.exist_ok, opt.resume = opt.resume, False # pass resume to exist_ok and disable resume opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if LOCAL_RANK != -1: assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command' assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count' assert not opt.image_weights, '--image-weights argument is not compatible with DDP training' assert not opt.evolve, '--evolve argument is not compatible with DDP training' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo") # Train if not opt.evolve: train(opt.hyp, opt, device, callbacks) if WORLD_SIZE > 1 and RANK == 0: LOGGER.info('Destroying process group... ') dist.destroy_process_group() # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0), # image mixup (probability) 'copy_paste': (1, 0.0, 1.0)} # segment copy-paste (probability) with open(opt.hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict if 'anchors' not in hyp: # anchors commented in hyp.yaml hyp['anchors'] = 3 opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir) # only val/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv' if opt.bucket: os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {save_dir}') # download evolve.csv if exists for _ in range(opt.evolve): # generations to evolve if evolve_csv.exists(): # if evolve.csv exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() + 1E-6 # weights (sum > 0) if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([meta[k][0] for k in hyp.keys()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all(v == 1): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device, callbacks) # Write mutation results print_mutation(results, hyp.copy(), save_dir, opt.bucket) # Plot results plot_evolve(evolve_csv) LOGGER.info(f'Hyperparameter evolution finished\n' f"Results saved to {colorstr('bold', save_dir)}\n" f'Use best hyperparameters example: $ python train.py --hyp {evolve_yaml}')
from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes import re HIP_VERSION = 0.0 if torch.version.hip is None else float( re.search(r"^\d+\.\d+", torch.version.hip)[0]) # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests nGPUs = torch.cuda.device_count() if not TEST_CUDA: print('CUDA not available, skipping tests', file=sys.stderr) TestCase = object # noqa: F811 datatypes = [torch.float] if (TEST_CUDA and CUDA11OrLater and c10d.is_nccl_available() and nccl.version() >= (2, 10)) or TEST_WITH_ROCM: datatypes.append(torch.bfloat16) class TestNCCL(TestCase): @sandcastle_skip_if(IS_WINDOWS, "NCCL doesn't support Windows") def test_unique_id(self, device): uid = nccl.unique_id() self.assertIsInstance(uid, bytes) self.assertGreater(len(uid), 1) @sandcastle_skip_if(TEST_WITH_ROCM and HIP_VERSION < 3.5, 'Skip NCCL tests for ROCm') @sandcastle_skip_if(IS_WINDOWS, "NCCL doesn't support Windows") @sandcastle_skip_if(not TEST_MULTIGPU, "only one GPU detected")
def requires_nccl(): return unittest.skipUnless( c10d.is_nccl_available(), "c10d was not compiled with the NCCL backend", )
def requires_nccl(): return sandcastle_skip_if( not c10d.is_nccl_available(), "c10d was not compiled with the NCCL backend", )
if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) if TEST_WITH_DEV_DBG_ASAN: print( "Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr, ) sys.exit(0) # Various mixed precision configs to test under. default_mp = MixedPrecision() nccl_supports_bf16 = (CUDA11OrLater and dist.is_nccl_available() and nccl.version() >= (2, 10)) mp_configs = [default_mp] if nccl_supports_bf16: mp_diff_reduce = MixedPrecision(reduce_dtype=torch.bfloat16) mp_diff_buffer = MixedPrecision(buffer_dtype=torch.bfloat16) mp_diff_buffer_and_reduce = MixedPrecision(buffer_dtype=torch.bfloat16, reduce_dtype=torch.float32) mp_configs.extend([ mp_diff_reduce, mp_diff_buffer, mp_diff_buffer_and_reduce, ])
# print("\tIn Model: input size", input.size(), # "output size", output.size()) return output if __name__ == "__main__": # Parameters input_size = 5 output_size = 2 batch_size = 30 data_size = 100 # check the nccl backend if not dist.is_nccl_available(): print("Error: nccl backend not available.") sys.exit(1) # init group dist.init_process_group(backend="nccl", init_method="env://") # get the process rank and the world size rank = dist.get_rank() world_size = dist.get_world_size() # prepare the dataset dataset = RandomDataset(input_size, data_size) train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) rand_loader = DataLoader(dataset,
else: assert "nccl" not in available_backends if dist.is_gloo_available(): assert "gloo" in available_backends else: assert "gloo" not in available_backends if dist.is_mpi_available(): assert "mpi" in available_backends else: assert "mpi" not in available_backends @pytest.mark.distributed @pytest.mark.skipif(not dist.is_nccl_available(), reason="Skip if nccl not available") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test__native_nccl_but_no_gpu(mock_gpu_is_not_available): with pytest.raises( RuntimeError, match=r"Nccl backend is required but no cuda capable devices"): _NativeDistModel(backend="nccl") @pytest.mark.distributed @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test__native_dist_model_create_from_backend_bad_config():
def get_world_size(): if not dist.is_nccl_available(): return 1 if not dist.is_initialized(): return 1 return dist.get_world_size()
def get_rank(): if not dist.is_nccl_available(): return 0 if not dist.is_initialized(): return 0 return dist.get_rank()
def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""): global myreq global my_rank global my_size global my_local_rank global my_local_size global a2a_impl global alltoall_supported # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2) num_mpi_ranks = env2int([ "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE" ]) if backend == "" and num_mpi_ranks > 1: if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0: backend = "ccl" elif use_gpu and dist.is_nccl_available(): backend = "nccl" elif dist.is_mpi_available(): backend = "mpi" else: print( "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available." ) backend = "gloo" if backend != "": # guess Rank and size if rank == -1: rank = env2int([ "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK" ], 0) if size == -1: size = env2int( [ "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE", ], 1, ) if not os.environ.get("RANK", None) and rank != -1: os.environ["RANK"] = str(rank) if not os.environ.get("WORLD_SIZE", None) and size != -1: os.environ["WORLD_SIZE"] = str(size) if not os.environ.get("MASTER_PORT", None): os.environ["MASTER_PORT"] = "29500" if not os.environ.get("MASTER_ADDR", None): local_size = env2int( [ "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE", ], 1, ) if local_size != size and backend != "mpi": print( "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default" ) print( "If this run hangs, try exporting rank 0's hostname as MASTER_ADDR" ) os.environ["MASTER_ADDR"] = "127.0.0.1" if size > 1: if local_rank == -1: my_local_rank = env2int( [ "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK", "LOCAL_RANK", ], 0, ) else: my_local_rank = local_rank my_local_size = env2int( [ "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE", ], 1, ) if use_gpu: if my_local_size > torch.cuda.device_count(): print( "Not sufficient GPUs available... local_size = %d, ngpus = %d" % (my_local_size, torch.cuda.device_count())) sys.exit(1) torch.cuda.set_device(my_local_rank) dist.init_process_group(backend, rank=rank, world_size=size) my_rank = dist.get_rank() my_size = dist.get_world_size() if my_rank == 0: print("Running on %d ranks using %s backend" % (my_size, backend)) if hasattr(dist, "all_to_all_single"): try: t = torch.zeros([4]) if use_gpu: t = t.cuda() dist.all_to_all_single(t, t) alltoall_supported = True except RuntimeError as err: print("fail to enable all_to_all_single primitive: %s" % err) if a2a_impl == "alltoall" and alltoall_supported == False: print( "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall" % (a2a_impl, backend)) a2a_impl = "scatter" if a2a_impl != "": print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl) else: my_rank = 0 my_size = 1 my_local_rank = 0 my_local_size = 1 print_all("world size: %d, current rank: %d, local rank: %d" % (my_size, my_rank, my_local_rank)) myreq = Request()