Example #1
def get_loaders_for_linear_eval(cfg):
    if cfg.debug:
        train_n_samples = comm.get_world_size() * DEBUG_NUM_BATCH
        train_n_samples = train_n_samples * cfg.eval.batch_size_train
        eval_n_samples = comm.get_world_size() * DEBUG_NUM_BATCH
        eval_n_samples = eval_n_samples * cfg.eval.batch_size_eval
    else:
        train_n_samples = eval_n_samples = -1

    # augmentation
    train_transforms = get_simple_transforms(input_size=cfg.augment.input_size)
    eval_transforms = get_center_crop_transforms(
        input_size=cfg.augment.input_size)

    # dataset
    train_dataset, num_classes = get_dataset(
        data_name=cfg.dataset.name,
        data_root=cfg.dataset.root,
        train=True,
        transform=train_transforms,
        num_subsample=int(train_n_samples),
    )
    eval_dataset, _ = get_dataset(
        data_name=cfg.dataset.name,
        data_root=cfg.dataset.root,
        train=False,
        transform=eval_transforms,
        num_subsample=int(eval_n_samples),
    )

    # sampler
    train_sampler = DistributedSampler(dataset=train_dataset,
                                       rank=comm.get_rank(),
                                       num_replicas=comm.get_world_size(),
                                       shuffle=True)
    eval_sampler = DistributedSampler(dataset=eval_dataset,
                                      rank=comm.get_rank(),
                                      num_replicas=comm.get_world_size(),
                                      shuffle=True)

    # dataloader
    num_workers = cfg.eval.num_workers if not cfg.debug else 4
    train_loader = FastDataloader(dataset=train_dataset,
                                  batch_size=cfg.eval.batch_size_train,
                                  num_workers=num_workers,
                                  drop_last=False,
                                  sampler=train_sampler,
                                  collate_fn=_collate_fn)
    eval_loader = FastDataloader(dataset=eval_dataset,
                                 batch_size=cfg.eval.batch_size_eval,
                                 num_workers=num_workers,
                                 drop_last=False,
                                 sampler=eval_sampler,
                                 collate_fn=_collate_fn)
    return train_loader, eval_loader, num_classes
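For orientation, here is a minimal usage sketch that trains a linear classifier on frozen features with the loaders returned above. backbone, feature_dim, and cfg.eval.epochs are hypothetical placeholders, and it is assumed that FastDataloader exposes a .sampler attribute and that _collate_fn yields (images, labels) batches; only get_loaders_for_linear_eval itself comes from the snippet.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical usage sketch; `backbone`, `feature_dim`, and `cfg.eval.epochs`
# are assumptions, and the loader is assumed to behave like torch's DataLoader.
train_loader, eval_loader, num_classes = get_loaders_for_linear_eval(cfg)
classifier = nn.Linear(feature_dim, num_classes).to(cfg.device)
optimizer = torch.optim.SGD(classifier.parameters(), lr=cfg.eval.optim.lr)

for epoch in range(cfg.eval.epochs):
    train_loader.sampler.set_epoch(epoch)  # reshuffle shards across ranks
    for images, labels in train_loader:
        images, labels = images.to(cfg.device), labels.to(cfg.device)
        with torch.no_grad():
            features = backbone(images)    # frozen, pre-trained encoder
        loss = F.cross_entropy(classifier(features), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()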
Example #2
def get_loaders_for_trainer(cfg):
    train_loader, eval_loader, num_classes = (None, ) * 3

    # train dataset
    if cfg.train.enabled:
        if cfg.debug:
            n_samples = comm.get_world_size() * DEBUG_NUM_BATCH
            n_samples = n_samples * cfg.train.batch_size_train
        else:
            n_samples = -1

        train_dataset, num_classes = get_dataset(
            data_name=cfg.dataset.name,
            data_root=cfg.dataset.root,
            train=True,
            transform=get_transforms(cfg, train=True),
            num_subsample=int(n_samples),
        )
        train_sampler = DistributedSampler(dataset=train_dataset,
                                           rank=comm.get_rank(),
                                           num_replicas=comm.get_world_size(),
                                           shuffle=True)
        train_loader = FastDataloader(dataset=train_dataset,
                                      batch_size=cfg.train.batch_size_train,
                                      num_workers=cfg.train.num_workers,
                                      sampler=train_sampler,
                                      drop_last=False,
                                      collate_fn=_collate_fn)

    # test dataset (for online evaluation)
    if cfg.train.enabled and cfg.train.online_eval:
        if cfg.debug:
            n_samples = comm.get_world_size() * DEBUG_NUM_BATCH
            n_samples = n_samples * cfg.train.batch_size_eval
        else:
            n_samples = -1  # do not rely on the value set in the train block above

        eval_dataset, num_classes = get_dataset(
            data_name=cfg.dataset.name,
            data_root=cfg.dataset.root,
            train=False,
            transform=get_transforms(cfg, train=False),
            num_subsample=int(n_samples),
        )
        eval_sampler = DistributedSampler(dataset=eval_dataset,
                                          rank=comm.get_rank(),
                                          num_replicas=comm.get_world_size(),
                                          shuffle=True)
        eval_loader = FastDataloader(dataset=eval_dataset,
                                     batch_size=cfg.train.batch_size_eval,
                                     num_workers=cfg.train.num_workers,
                                     sampler=eval_sampler,
                                     drop_last=False,
                                     collate_fn=_collate_fn)

    return train_loader, eval_loader, num_classes
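A hedged structural sketch of how a caller might handle the optional return values (either loader is None when the corresponding flag is off). run_epoch, run_online_eval, and cfg.train.epochs are placeholders introduced here for illustration only.

# Hypothetical structural sketch; run_epoch, run_online_eval, and
# cfg.train.epochs are placeholders, not part of the original code.
train_loader, eval_loader, num_classes = get_loaders_for_trainer(cfg)
if train_loader is not None:                   # cfg.train.enabled was True
    for epoch in range(cfg.train.epochs):
        train_loader.sampler.set_epoch(epoch)  # assumes a .sampler attribute
        run_epoch(train_loader)
        if eval_loader is not None:            # cfg.train.online_eval was True
            run_online_eval(eval_loader, num_classes)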
Example #3
def scatter_values(dist_url, value):
    # Broadcast a string value from the main process to all other ranks over a
    # side-channel TCP socket opened one port above the DDP rendezvous port.
    netloc = urlsplit(dist_url).netloc
    addr = netloc[:netloc.find(":")]
    port = int(netloc[netloc.find(":") + 1:]) + 1
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    
    def threaded(client_sock, i):
        client_sock.send(value.encode())
        client_sock.close()
    
    if comm.is_main_process():
        # server side
        sock.bind((addr, port))
        sock.listen()
        comm.synchronize()
        for i in range(comm.get_world_size() - 1):
            client_sock, _ = sock.accept()
            thread = threading.Thread(target=threaded, args=(client_sock, i))
            thread.daemon = True
            thread.start()
    else:
        # client side
        comm.synchronize()
        time.sleep(3)  # extra margin; the server has already called listen() before the barrier
        sock.connect((addr, port))
        value = sock.recv(1024).decode()
        
    sock.close()
    return value
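A rough usage sketch follows; the URL and make_save_dir are illustrative assumptions. Every rank calls the function with the same dist_url, the main rank serves its own value, and the remaining ranks return whatever they receive over the socket.

# Hypothetical usage sketch: push the main rank's run directory to every rank.
save_dir = make_save_dir() if comm.is_main_process() else ""  # placeholder helper
save_dir = scatter_values("tcp://127.0.0.1:29500", save_dir)
# All ranks now hold the main rank's save_dir string; the side-channel socket
# here uses port 29501 (rendezvous port + 1).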
Example #4
def wrap_if_distributed(module, device):
    # Convert BatchNorm to SyncBatchNorm and wrap in DDP only when running with
    # more than one process and the module has parameters to synchronize.
    if comm.get_world_size() > 1 and len(list(module.parameters())) > 0:
        module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module)
        module = DistributedDataParallel(module=module, 
                                         device_ids=[device], 
                                         broadcast_buffers=False, 
                                         find_unused_parameters=True)
    return module
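A hedged usage sketch; backbone and projector are placeholder modules, and wrapping them separately is only an illustration of the call. As for the arguments: broadcast_buffers=False skips re-broadcasting buffers (such as BatchNorm running stats) at every forward pass, and find_unused_parameters=True tolerates sub-modules whose parameters receive no gradient in a given iteration.

# Hypothetical usage sketch; `backbone` and `projector` are placeholders.
backbone = wrap_if_distributed(backbone.to(cfg.device), cfg.device)
projector = wrap_if_distributed(projector.to(cfg.device), cfg.device)
# With world_size == 1 (or a parameter-free module) the input is returned
# unchanged; otherwise it comes back as a DistributedDataParallel wrapper
# whose BatchNorm layers have been converted to SyncBatchNorm.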
Example #5
def initialize(cfg=None):
    cfg = get_cfg(parse_args()) if cfg is None else cfg
    # launch multi-process for DDP
    #   - processes will be branched off at this point
    #   - subprocess ignores launching process and returns None
    if cfg.num_machines * cfg.num_gpus > 1:
        log.info(C.green("[!] Launching multiprocessing.."))
        cfg.spawn_ctx = launch(main_func=initialize,
                               num_gpus_per_machine=cfg.num_gpus,
                               num_machines=cfg.num_machines,
                               machine_rank=cfg.machine_rank,
                               dist_url=cfg.dist_url,
                               args=(cfg, ))
    else:
        cfg.spawn_ctx = None

    # scatter save_dir to all non-main ranks
    cfg.save_dir = comm.scatter(cfg.save_dir)

    # finalize config
    C.set_enabled(not cfg.no_color)  # for sub-processes
    cfg.device = comm.get_local_rank()
    cfg.freeze()

    # file logging on the local ranks
    set_stream_handler('comm', cfg.log_level)  # for sub-processes
    log_rank_file = f"log_rank_{comm.get_rank()}.txt"
    set_file_handler('main', cfg.log_level, cfg.save_dir, log_rank_file)
    set_stream_handler('error', cfg.log_level)
    set_file_handler('error', cfg.log_level, cfg.save_dir, "log_error.txt")
    if comm.is_main_process():
        set_file_handler('result', cfg.log_level, "./", "log_result.txt")

    # log distributed learning
    if comm.get_world_size() > 1:
        log.info(f"[DDP] dist_url: {cfg.dist_url}")
        log.info(f"[DDP] global_world_size = {comm.get_world_size()}")
        log.info(f"[DDP] num_gpus_per_machine = {torch.cuda.device_count()}")
        log.info(f"[DDP] machine_rank {cfg.machine_rank} / "
                 f"num_machines = {cfg.num_machines}")
        comm.synchronize()
        log_comm.info(f"[DDP] rank (local: {comm.get_local_rank()}, "
                      f"global: {comm.get_rank()}) has been spawned.")
        comm.synchronize()
        log.info("[DDP] Synchronized across all ranks.")

    if not cfg.spawn_ctx:
        # This structure (including the customized launch.py) is kept for
        # compatibility with our internal API. There is no functional
        # difference from the typical usage of the distributed package, so
        # please don't mind this peculiarity and focus on the main algorithm.
        for _ in train(cfg):
            pass

    return cfg
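A minimal, hedged entry-point sketch; the __main__ guard is an assumption about how the script is run. When multiprocessing is launched, initialize is also the target re-entered by each spawned worker, and the single-process (or in-worker) path falls through to train(cfg).

# Hypothetical entry-point sketch.
if __name__ == "__main__":
    initialize()  # parse args, launch DDP workers if needed; workers fall through to train(cfg)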
Example #6
def scale_learning_rate(cfg, mode):
    assert mode in ['train', 'eval']
    world_size = comm.get_world_size()
    lr_origin = cfg[mode].optim.lr
    local_batch = cfg[mode].batch_size_train
    global_batch = int(local_batch * world_size)
    ratio = global_batch / 256.
    lr = lr_origin * ratio
    log.info(f'[LR({mode})] local_batch ({local_batch}) x '
             f'world_size ({world_size}) = global_batch ({global_batch})')
    log.info(f'[LR({mode})] scale LR from {lr_origin} '
             f'to {lr} (x{ratio:3.2f}) by linear scaling rule.')
    optim = deepcopy(cfg[mode].optim)
    optim.lr = lr
    return optim
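A worked example of the linear scaling rule as implemented above; the numbers are illustrative, not project defaults.

# Illustrative numbers only, not defaults from the config.
lr_origin, local_batch, world_size = 0.1, 128, 4
global_batch = local_batch * world_size  # 512
ratio = global_batch / 256.              # 2.0
lr = lr_origin * ratio                   # 0.1 * 2.0 = 0.2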