def run_dist_training(rank_id: int, world_size: int, task: str,
                      task_cfg: CfgNode, parsed_args, model, dist_url):
    """method to run on distributed process, passed to multiprocessing.spawn

    Parameters
    ----------
    rank_id : int
        rank id, i-th spawned process
    world_size : int
        total number of spawned processes
    task : str
        task name (passed to builder)
    task_cfg : CfgNode
        task configuration (passed to builder)
    parsed_args : argparse.Namespace
        parsed arguments from command line
    """
    devs = ["cuda:{}".format(rank_id)]
    # set up distributed
    setup(rank_id, world_size, dist_url)
    dist_utils.synchronize()
    # move model to device before building optimizer.
    # quick fix for resuming of DDP
    # TODO: need to be refined in future
    model.set_device(devs[0])
    # build optimizer
    optimizer = optim_builder.build(task, task_cfg.optim, model)
    # build dataloader with trainer
    with Timer(name="Dataloader building", verbose=True):
        dataloader = dataloader_builder.build(task, task_cfg.data, seed=rank_id)
    # build trainer
    trainer = engine_builder.build(task, task_cfg.trainer, "trainer", optimizer,
                                   dataloader)
    trainer.set_device(
        devs
    )  # needs to be placed after the optimizer is built (potential pytorch issue)
    trainer.resume(parsed_args.resume)
    # trainer.init_train()
    logger.info("Start training")
    while not trainer.is_completed():
        trainer.train()
        if rank_id == 0:
            trainer.save_snapshot()
        dist_utils.synchronize()  # one synchronization per epoch
    if rank_id == 0:
        trainer.save_snapshot(model_param_only=True)
    # clean up distributed
    cleanup()
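# A minimal launcher sketch (not part of the original file): it assumes the
# usual PyTorch pattern of spawning one process per GPU via
# torch.multiprocessing.spawn and passing a tcp:// init URL down to setup().
# Names such as `_find_free_port` and `launch_training` are hypothetical
# helpers introduced here for illustration, not part of videoanalyst.
import socket

import torch.multiprocessing as mp


def _find_free_port() -> int:
    # hypothetical helper: ask the OS for an unused TCP port
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def launch_training(task, task_cfg, parsed_args, model, world_size: int):
    dist_url = "tcp://127.0.0.1:{}".format(_find_free_port())
    # mp.spawn calls run_dist_training(rank, *args) once per process,
    # with rank ranging over [0, world_size)
    mp.spawn(run_dist_training,
             args=(world_size, task, task_cfg, parsed_args, model, dist_url),
             nprocs=world_size,
             join=True)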
def run_dist_training(rank_id: int, world_size: int, task: str,
                      task_cfg: CfgNode, parsed_args, model):
    """method to run on distributed process, passed to multiprocessing.spawn

    Parameters
    ----------
    rank_id : int
        rank id, i-th spawned process
    world_size : int
        total number of spawned processes
    task : str
        task name (passed to builder)
    task_cfg : CfgNode
        task configuration (passed to builder)
    parsed_args : argparse.Namespace
        parsed arguments from command line
    """
    # set up distributed
    setup(rank_id, world_size)
    # build model
    # model = model_builder.build(task, task_cfg.model)
    # build optimizer
    optimizer = optim_builder.build(task, task_cfg.optim, model)
    # build dataloader with trainer
    with Timer(name="Dataloader building", verbose=True, logger=logger):
        dataloader = dataloader_builder.build(task, task_cfg.data, seed=rank_id)
    # build trainer
    trainer = engine_builder.build(task, task_cfg.trainer, "trainer", optimizer,
                                   dataloader)
    devs = ["cuda:%d" % rank_id]
    trainer.set_device(devs)
    trainer.resume(parsed_args.resume_from_epoch, parsed_args.resume_from_file)
    # trainer.init_train()
    logger.info("Start training")
    while not trainer.is_completed():
        trainer.train()
        if rank_id == 0:
            trainer.save_snapshot()
        dist.barrier()  # one synchronization per epoch
    # clean up distributed
    cleanup()
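# The setup()/cleanup() helpers referenced above are not shown in this
# section. Below is a minimal sketch of the conventional PyTorch DDP versions,
# assuming an NCCL backend; the actual videoanalyst implementation may differ
# (e.g. backend choice or init_method).
import os

import torch.distributed as dist


def setup(rank_id: int, world_size: int, dist_url: str = "env://"):
    # default to environment-variable initialization unless a URL is given
    if dist_url == "env://":
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
        os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl",
                            init_method=dist_url,
                            rank=rank_id,
                            world_size=world_size)


def cleanup():
    # tear down the process group at the end of training
    dist.destroy_process_group()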
exp_cfg_path = osp.realpath(parsed_args.config)
root_cfg.merge_from_file(exp_cfg_path)
logger.info("Load experiment configuration at: %s" % exp_cfg_path)
logger.info(
    "Merged with root_cfg imported from videoanalyst.config.config.cfg")
# resolve config
root_cfg = root_cfg.train
task, task_cfg = specify_task(root_cfg)
task_cfg.data.num_workers = 2
task_cfg.data.sampler.submodules.dataset.GOT10kDataset.check_integrity = False
task_cfg.freeze()

if parsed_args.target == "dataloader":
    logger.info("visualize for dataloader")
    with Timer(name="Dataloader building", verbose=True):
        dataloader = dataloader_builder.build(task, task_cfg.data)
    for batch_training_data in dataloader:
        keys = list(batch_training_data.keys())
        batch_size = len(batch_training_data[keys[0]])
        training_samples = [{
            k: v[[idx]]
            for k, v in batch_training_data.items()
        } for idx in range(batch_size)]
        for training_sample in training_samples:
            target_cfg = task_cfg.data.target
            show_img_FCOS(target_cfg[target_cfg.name], training_sample)
            scan_key()
elif parsed_args.target == "dataset":
    logger.info("visualize for dataset")
    import numpy as np