Example #1
def initialize(cfg=None):
    cfg = get_cfg(parse_args()) if cfg is None else cfg
    # launch multiple processes for DDP
    #   - processes will be branched off at this point
    #   - subprocess ignores launching process and returns None
    if cfg.num_machines * cfg.num_gpus > 1:
        log.info(C.green("[!] Launching Multiprocessing.."))
        cfg.spawn_ctx = launch(main_func=initialize,
                               num_gpus_per_machine=cfg.num_gpus,
                               num_machines=cfg.num_machines,
                               machine_rank=cfg.machine_rank,
                               dist_url=cfg.dist_url,
                               args=(cfg, ))
    else:
        cfg.spawn_ctx = None

    # scatter save_dir to all non-main ranks
    cfg.save_dir = comm.scatter(cfg.save_dir)

    # finalize config
    C.set_enabled(not cfg.no_color)  # for sub-processes
    cfg.device = comm.get_local_rank()
    cfg.freeze()

    # stream/file logging on the local ranks
    set_stream_handler('comm', cfg.log_level)  # for sub-processes
    log_rank_file = f"log_rank_{comm.get_rank()}.txt"
    set_file_handler('main', cfg.log_level, cfg.save_dir, log_rank_file)
    set_stream_handler('error', cfg.log_level)
    set_file_handler('error', cfg.log_level, cfg.save_dir, "log_error.txt")
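    # only the main process writes the aggregated result log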
    if comm.is_main_process():
        set_file_handler('result', cfg.log_level, "./", "log_result.txt")

    # log distributed learning setup
    if comm.get_world_size() > 1:
        log.info(f"[DDP] dist_url: {cfg.dist_url}")
        log.info(f"[DDP] global_world_size = {comm.get_world_size()}")
        log.info(f"[DDP] num_gpus_per_machine = {torch.cuda.device_count()}")
        log.info(f"[DDP] machine_rank {cfg.machine_rank} / "
                 f"num_machines = {cfg.num_machines}")
        comm.synchronize()
        log_comm.info(f"[DDP] rank (local: {comm.get_local_rank()}, "
                      f"global: {comm.get_rank()}) has been spawned.")
        comm.synchronize()
        log.info(f"[DDP] Synchronized across all the ranks.")

    if not cfg.spawn_ctx:
        # This structure (including the customized launch.py) is for
        # compatibility with our internal API. There is no functional
        # difference from typical usage of the distributed package. Please
        # don't mind this peculiarity and focus on the main algorithm.
        for _ in train(cfg):
            pass

    return cfg
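
# A minimal, hypothetical usage sketch for initialize() (not taken from the
# original project). It assumes get_cfg()/parse_args() build the config used
# above and that launch() returns a joinable spawn context (e.g. from
# torch.multiprocessing.spawn). Spawned workers re-enter initialize() through
# launch() and run train(); the launching process only waits for them.
if __name__ == "__main__":
    cfg = initialize()
    if cfg.spawn_ctx is not None:
        cfg.spawn_ctx.join()  # block until all spawned DDP workers exit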

Example #2

def main(args):
    # assumes model, loader, optimizer, scheduler, scaler, and device were
    # built above (that part of the script is omitted here)

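    # epoch loop: run one training epoch, then let only the primary rank
    # write a checkpoint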
    for i in range(args.epoch):
        train(i, loader, model, optimizer, scheduler, scaler, device)

        if dist.is_primary():
            torch.save(model.state_dict(),
                       f"checkpoint/vqvae_{str(i + 1).zfill(3)}.pt")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_gpu", type=int, default=1)

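    # derive a per-user rendezvous port in [49152, 65535]:
    # 2**15 + 2**14 = 49152, plus a uid-dependent offset below 2**14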
    port = (2**15 + 2**14 +
            hash(os.getuid() if sys.platform != "win32" else 1) % 2**14)
    parser.add_argument("--dist_url", default=f"tcp://127.0.0.1:{port}")

    parser.add_argument("--size", type=int, default=256)
    parser.add_argument("--epoch", type=int, default=560)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--disable-amp", action='store_true')
    parser.add_argument("--sched", type=str)
    parser.add_argument("path", type=str)

    args = parser.parse_args()

    print(args)

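    # spawn args.n_gpu worker processes on this single machine
    # (1 machine, machine rank 0) and run main(args) in each of them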
    dist.launch(main, args.n_gpu, 1, 0, args.dist_url, args=(args, ))
    parser.add_argument("--save_steps", type=int, default=1000)
    parser.add_argument("--eval_steps", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--eval_path", type=str, default=None)
    parser.add_argument("--config_path", type=str, default=None)
    parser.add_argument("--output_path", type=str, default=None)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--save_total_limit", type=int, default=5)
    parser.add_argument("--fp16", type=bool, default=False)
    parser.add_argument("--fp16_opt_level", type=str, default="01")
    parser.add_argument("--img_keys_path", type=str, default=None)
    parser.add_argument("--img_root_path", type=str, default=None)
    parser.add_argument("--min_lr", type=float, default=1e-5)
    parser.add_argument("--cycle_step", type=int, default=2000)

    args = parser.parse_args()

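    # the image key and image root directories must already exist;
    # the output and eval directories are created below if missing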
    assert os.path.isdir(args.img_keys_path)
    assert os.path.isdir(args.img_root_path)

    os.makedirs(args.output_path, exist_ok=True)
    os.makedirs(args.eval_path, exist_ok=True)

    print(args, file=sys.stderr, flush=True)

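    # use one worker process per visible GPU when running on CUDA,
    # otherwise fall back to a single process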
    proc_num = 1
    if args.device == "cuda":
        proc_num = torch.cuda.device_count()
    print("proc_num={}".format(proc_num), file=sys.stderr, flush=True)
    dist.launch(main, proc_num, 1, 0, args.dist_url, args=(args, ))