def run_dist_training(rank_id: int, world_size: int, task: str,
                      task_cfg: CfgNode, parsed_args, model, dist_url):
    """method to run on distributed process
       passed to multiprocessing.spawn
    
    Parameters
    ----------
    rank_id : int
        rank id, ith spawned process 
    world_size : int
        total number of spawned process
    task : str
        task name (passed to builder)
    task_cfg : CfgNode
        task builder (passed to builder)
    parsed_args : [type]
        parsed arguments from command line
    """
    devs = ["cuda:{}".format(rank_id)]
    # set up distributed
    setup(rank_id, world_size, dist_url)
    dist_utils.synchronize()
    # move model to device before building optimizer.
    # quick fix for resuming of DDP
    # TODO: need to be refined in future
    model.set_device(devs[0])
    # build optimizer
    optimizer = optim_builder.build(task, task_cfg.optim, model)
    # build dataloader with trainer
    with Timer(name="Dataloader building", verbose=True):
        dataloader = dataloader_builder.build(task,
                                              task_cfg.data,
                                              seed=rank_id)
    # build trainer
    trainer = engine_builder.build(task, task_cfg.trainer, "trainer",
                                   optimizer, dataloader)
    trainer.set_device(
        devs
    )  # must be placed after the optimizer is built (potential pytorch issue)
    trainer.resume(parsed_args.resume)
    # trainer.init_train()
    logger.info("Start training")
    while not trainer.is_completed():
        trainer.train()
        if rank_id == 0:
            trainer.save_snapshot()
        dist_utils.synchronize()  # one synchronization per epoch

    if rank_id == 0:
        trainer.save_snapshot(model_param_only=True)
    # clean up distributed
    cleanup()
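
The setup and cleanup helpers called above are not shown in this example. A minimal sketch of what they typically contain, assuming the standard torch.distributed process-group pattern (the backend choice is an assumption, not taken from this source):

import torch.distributed as dist


def setup(rank_id: int, world_size: int, dist_url: str):
    # Assumed implementation: join the default process group so that
    # collectives such as barrier/synchronize work across ranks.
    dist.init_process_group(backend="nccl",  # use "gloo" for CPU-only runs
                            init_method=dist_url,
                            rank=rank_id,
                            world_size=world_size)


def cleanup():
    # Tear down the default process group once training is done.
    dist.destroy_process_group()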
Example #2
def run_dist_training(rank_id: int, world_size: int, task: str,
                      task_cfg: CfgNode, parsed_args, model):
    """method to run on distributed process
       passed to multiprocessing.spawn
    
    Parameters
    ----------
    rank_id : int
        rank id, ith spawned process 
    world_size : int
        total number of spawned process
    task : str
        task name (passed to builder)
    task_cfg : CfgNode
        task builder (passed to builder)
    parsed_args : [type]
        parsed arguments from command line
    """
    # set up distributed
    setup(rank_id, world_size)
    # build model
    # model = model_builder.build(task, task_cfg.model)
    # build optimizer
    optimizer = optim_builder.build(task, task_cfg.optim, model)
    # build dataloader with trainer
    with Timer(name="Dataloader building", verbose=True, logger=logger):
        dataloader = dataloader_builder.build(task,
                                              task_cfg.data,
                                              seed=rank_id)
    # build trainer
    trainer = engine_builder.build(task, task_cfg.trainer, "trainer",
                                   optimizer, dataloader)
    devs = ["cuda:%d" % rank_id]
    trainer.set_device(devs)
    trainer.resume(parsed_args.resume_from_epoch, parsed_args.resume_from_file)
    # trainer.init_train()
    logger.info("Start training")
    while not trainer.is_completed():
        trainer.train()
        if rank_id == 0:
            trainer.save_snapshot()
        dist.barrier()  # one synchronization per epoch

    # clean up distributed
    cleanup()
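
Both variants are written to be handed to torch.multiprocessing.spawn, which supplies rank_id as the first argument. A minimal launcher sketch for the second variant (task, task_cfg, parsed_args and model are assumed to be prepared by the surrounding training script):

import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    # mp.spawn calls run_dist_training(i, *args) for each i in range(nprocs)
    mp.spawn(run_dist_training,
             args=(world_size, task, task_cfg, parsed_args, model),
             nprocs=world_size,
             join=True)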
Example #3
    exp_cfg_path = osp.realpath(parsed_args.config)
    root_cfg.merge_from_file(exp_cfg_path)
    logger.info("Load experiment configuration at: %s" % exp_cfg_path)
    logger.info(
        "Merged with root_cfg imported from videoanalyst.config.config.cfg")
    # resolve config
    root_cfg = root_cfg.train
    task, task_cfg = specify_task(root_cfg)
    task_cfg.data.num_workers = 2
    task_cfg.data.sampler.submodules.dataset.GOT10kDataset.check_integrity = False
    task_cfg.freeze()

    if parsed_args.target == "dataloader":
        logger.info("visualize for dataloader")
        with Timer(name="Dataloader building", verbose=True):
            dataloader = dataloader_builder.build(task, task_cfg.data)

        for batch_training_data in dataloader:
            keys = list(batch_training_data.keys())
            batch_size = len(batch_training_data[keys[0]])
            training_samples = [{
                k: v[[idx]]
                for k, v in batch_training_data.items()
            } for idx in range(batch_size)]
            for training_sample in training_samples:
                target_cfg = task_cfg.data.target
                show_img_FCOS(target_cfg[target_cfg.name], training_sample)
                scan_key()
    elif parsed_args.target == "dataset":
        logger.info("visualize for dataset")
        import numpy as np