def get_loaders_for_linear_eval(cfg):
    """Build train/eval dataloaders and num_classes for linear evaluation."""
    if cfg.debug:
        # keep only DEBUG_NUM_BATCH batches per rank for quick debug runs
        train_n_samples = (comm.get_world_size() * DEBUG_NUM_BATCH *
                           cfg.eval.batch_size_train)
        eval_n_samples = (comm.get_world_size() * DEBUG_NUM_BATCH *
                          cfg.eval.batch_size_eval)
    else:
        train_n_samples = eval_n_samples = -1  # no subsampling

    # augmentation
    train_transforms = get_simple_transforms(input_size=cfg.augment.input_size)
    eval_transforms = get_center_crop_transforms(
        input_size=cfg.augment.input_size)

    # dataset
    train_dataset, num_classes = get_dataset(
        data_name=cfg.dataset.name,
        data_root=cfg.dataset.root,
        train=True,
        transform=train_transforms,
        num_subsample=int(train_n_samples),
    )
    eval_dataset, _ = get_dataset(
        data_name=cfg.dataset.name,
        data_root=cfg.dataset.root,
        train=False,
        transform=eval_transforms,
        num_subsample=int(eval_n_samples),
    )

    # sampler
    train_sampler = DistributedSampler(dataset=train_dataset,
                                       rank=comm.get_rank(),
                                       num_replicas=comm.get_world_size(),
                                       shuffle=True)
    eval_sampler = DistributedSampler(dataset=eval_dataset,
                                      rank=comm.get_rank(),
                                      num_replicas=comm.get_world_size(),
                                      shuffle=True)

    # dataloader
    num_workers = cfg.eval.num_workers if not cfg.debug else 4
    train_loader = FastDataloader(dataset=train_dataset,
                                  batch_size=cfg.eval.batch_size_train,
                                  num_workers=num_workers,
                                  drop_last=False,
                                  sampler=train_sampler,
                                  collate_fn=_collate_fn)
    eval_loader = FastDataloader(dataset=eval_dataset,
                                 batch_size=cfg.eval.batch_size_eval,
                                 num_workers=num_workers,
                                 drop_last=False,
                                 sampler=eval_sampler,
                                 collate_fn=_collate_fn)
    return train_loader, eval_loader, num_classes
def get_loaders_for_trainer(cfg):
    """Build train/eval dataloaders and num_classes for the trainer."""
    train_loader, eval_loader, num_classes = (None,) * 3

    # train dataset
    if cfg.train.enabled:
        if cfg.debug:
            n_samples = (comm.get_world_size() * DEBUG_NUM_BATCH *
                         cfg.train.batch_size_train)
        else:
            n_samples = -1  # no subsampling
        train_dataset, num_classes = get_dataset(
            data_name=cfg.dataset.name,
            data_root=cfg.dataset.root,
            train=True,
            transform=get_transforms(cfg, train=True),
            num_subsample=int(n_samples),
        )
        train_sampler = DistributedSampler(dataset=train_dataset,
                                           rank=comm.get_rank(),
                                           num_replicas=comm.get_world_size(),
                                           shuffle=True)
        train_loader = FastDataloader(dataset=train_dataset,
                                      batch_size=cfg.train.batch_size_train,
                                      num_workers=cfg.train.num_workers,
                                      sampler=train_sampler,
                                      drop_last=False,
                                      collate_fn=_collate_fn)

    # test dataset (for online evaluation)
    if cfg.train.enabled and cfg.train.online_eval:
        if cfg.debug:
            n_samples = (comm.get_world_size() * DEBUG_NUM_BATCH *
                         cfg.train.batch_size_eval)
        else:
            # made explicit; previously this branch relied on the -1 left
            # over from the train-dataset block above
            n_samples = -1
        eval_dataset, num_classes = get_dataset(
            data_name=cfg.dataset.name,
            data_root=cfg.dataset.root,
            train=False,
            transform=get_transforms(cfg, train=False),
            num_subsample=int(n_samples),
        )
        eval_sampler = DistributedSampler(dataset=eval_dataset,
                                          rank=comm.get_rank(),
                                          num_replicas=comm.get_world_size(),
                                          shuffle=True)
        eval_loader = FastDataloader(dataset=eval_dataset,
                                     batch_size=cfg.train.batch_size_eval,
                                     num_workers=cfg.train.num_workers,
                                     sampler=eval_sampler,
                                     drop_last=False,
                                     collate_fn=_collate_fn)

    return train_loader, eval_loader, num_classes
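# Usage sketch, not part of the original module: the loaders are meant to be
# consumed once per epoch. With a DistributedSampler, calling set_epoch() each
# epoch re-seeds the shuffle so every rank sees a new permutation. The names
# below (`num_epochs`, the loop body) are illustrative, and this assumes
# FastDataloader exposes its sampler like a standard PyTorch DataLoader.
#
#   train_loader, eval_loader, num_classes = get_loaders_for_trainer(cfg)
#   for epoch in range(num_epochs):
#       train_loader.sampler.set_epoch(epoch)
#       for batch in train_loader:
#           ...  # forward/backward on this rank's shard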
def scatter_values(dist_url, value):
    """Broadcast a string from the main process to all other ranks.

    A raw TCP socket on (dist_url's port + 1) is used as the rendezvous, so
    the value can be shared independently of the torch.distributed store.
    """
    netloc = urlsplit(dist_url).netloc
    addr = netloc[:netloc.find(":")]
    port = int(netloc[netloc.find(":") + 1:]) + 1
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

    def threaded(client_sock, i):
        client_sock.send(value.encode())
        client_sock.close()

    if comm.is_main_process():
        # server side: serve the value to each of the other ranks
        sock.bind((addr, port))
        sock.listen()
        comm.synchronize()
        for i in range(comm.get_world_size() - 1):
            client_sock, _ = sock.accept()
            thread = threading.Thread(target=threaded, args=(client_sock, i))
            thread.daemon = True
            thread.start()
    else:
        # client side: wait until the server is listening, then fetch the value
        comm.synchronize()
        time.sleep(3)
        sock.connect((addr, port))
        value = sock.recv(1024).decode()
        sock.close()
    return value
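# Usage sketch, not part of the original module: every rank calls the function
# with the same dist_url; the main rank passes the value to share, the others
# receive it as the return value. The URL and `run_name` are illustrative.
#
#   run_name = scatter_values("tcp://127.0.0.1:23456", run_name)
#   # main rank: serves its run_name on port 23457 (dist_url port + 1)
#   # other ranks: overwrite their run_name with the main rank's value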
def wrap_if_distributed(module, device):
    if comm.get_world_size() > 1 and len(list(module.parameters())) > 0:
        module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module)
        module = DistributedDataParallel(module=module,
                                         device_ids=[device],
                                         broadcast_buffers=False,
                                         find_unused_parameters=True)
    return module
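# Usage sketch, not part of the original module: wrap each parameterized
# sub-network after moving it to its device; in a single-process run the
# function returns the module unchanged. `backbone` is illustrative.
#
#   backbone = backbone.to(cfg.device)
#   backbone = wrap_if_distributed(backbone, cfg.device)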
def initialize(cfg=None):
    cfg = get_cfg(parse_args()) if cfg is None else cfg

    # launch multi-process for DDP
    # - processes will be branched off at this point
    # - subprocess ignores launching process and returns None
    if cfg.num_machines * cfg.num_gpus > 1:
        log.info(C.green("[!] Launching Multiprocessing.."))
        cfg.spawn_ctx = launch(main_func=initialize,
                               num_gpus_per_machine=cfg.num_gpus,
                               num_machines=cfg.num_machines,
                               machine_rank=cfg.machine_rank,
                               dist_url=cfg.dist_url,
                               args=(cfg,))
    else:
        cfg.spawn_ctx = None

    # scatter save_dir to all non-main ranks
    cfg.save_dir = comm.scatter(cfg.save_dir)

    # finalize config
    C.set_enabled(not cfg.no_color)  # for sub-processes
    cfg.device = comm.get_local_rank()
    cfg.freeze()

    # file logging on the local ranks
    set_stream_handler('comm', cfg.log_level)  # for sub-processes
    log_rank_file = f"log_rank_{comm.get_rank()}.txt"
    set_file_handler('main', cfg.log_level, cfg.save_dir, log_rank_file)
    set_stream_handler('error', cfg.log_level)
    set_file_handler('error', cfg.log_level, cfg.save_dir, "log_error.txt")
    if comm.is_main_process():
        set_file_handler('result', cfg.log_level, "./", "log_result.txt")

    # log distributed learning
    if comm.get_world_size() > 1:
        log.info(f"[DDP] dist_url: {cfg.dist_url}")
        log.info(f"[DDP] global_world_size = {comm.get_world_size()}")
        log.info(f"[DDP] num_gpus_per_machine = {torch.cuda.device_count()}")
        log.info(f"[DDP] machine_rank {cfg.machine_rank} / "
                 f"num_machines = {cfg.num_machines}")
        comm.synchronize()
        log_comm.info(f"[DDP] rank (local: {comm.get_local_rank()}, "
                      f"global: {comm.get_rank()}) has been spawned.")
        comm.synchronize()
        log.info("[DDP] Synchronized across all the ranks.")

    if not cfg.spawn_ctx:
        # This structure (including customized launch.py) is for compatibility
        # with our internal API. There is no functional difference from the
        # typical usage of the distributed package. Please don't mind this
        # peculiarity and focus on the main algorithm.
        for _ in train(cfg):
            pass

    return cfg
def scale_learning_rate(cfg, mode):
    assert mode in ['train', 'eval']
    world_size = comm.get_world_size()
    lr_origin = cfg[mode].optim.lr
    local_batch = cfg[mode].batch_size_train
    global_batch = int(local_batch * world_size)
    ratio = global_batch / 256.
    lr = lr_origin * ratio
    log.info(f'[LR({mode})] local_batch ({local_batch}) x '
             f'world_size ({world_size}) = global_batch ({global_batch})')
    log.info(f'[LR({mode})] scale LR from {lr_origin} '
             f'to {lr} (x{ratio:3.2f}) by linear scaling rule.')
    optim = deepcopy(cfg[mode].optim)
    optim.lr = lr
    return optim
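# Worked example, not part of the original module: with
# cfg.train.optim.lr = 0.05 and cfg.train.batch_size_train = 128 on 4 ranks,
# global_batch = 128 * 4 = 512, ratio = 512 / 256 = 2.0, so the returned
# optim config carries lr = 0.05 * 2.0 = 0.1 (values are illustrative).
#
#   optim_cfg = scale_learning_rate(cfg, mode='train')  # optim_cfg.lr == 0.1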