def _parse_losses(self, losses):
    """Parse the raw outputs (losses) of the network.

    Args:
        losses (dict): Raw output of the network, which usually contains
            losses and other necessary information.

    Returns:
        tuple[Tensor, dict]: (loss, log_vars), where loss is the loss tensor
            which may be a weighted sum of all losses, and log_vars contains
            all the variables to be sent to the logger.
    """
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(
                f'{loss_name} is not a tensor or list of tensors')

    # the total loss is the sum of every entry whose key contains 'loss'
    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)

    log_vars['loss'] = loss
    for loss_name, loss_value in log_vars.items():
        # reduce loss when distributed training
        if dist.is_available() and dist.is_initialized():
            loss_value = loss_value.data.clone()
            dist.all_reduce(loss_value.div_(dist.get_world_size()))
        log_vars[loss_name] = loss_value.item()

    return loss, log_vars
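# Hedged usage sketch (not part of the original code): what a raw `losses`
# dict from a detector's forward pass might look like and what _parse_losses
# returns. The loss keys and the `detector` object are hypothetical.
#
#   losses = {
#       'loss_cls': torch.tensor([0.9, 1.1]),                 # per-sample tensor
#       'loss_bbox': [torch.tensor(0.4), torch.tensor(0.6)],  # list of tensors
#       'acc': torch.tensor(87.5),                            # logged only
#   }
#   loss, log_vars = detector._parse_losses(losses)
#   # loss == 1.0 + 1.0 == 2.0: every entry whose key contains 'loss' is
#   # summed; 'acc' is kept in log_vars but excluded from the total.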
def __init__(self,
             dataset,
             samples_per_gpu=1,
             num_replicas=None,
             rank=None):
    _rank, _num_replicas = dist.get_rank(), dist.get_world_size()
    if num_replicas is None:
        num_replicas = _num_replicas
    if rank is None:
        rank = _rank
    self.dataset = dataset
    self.samples_per_gpu = samples_per_gpu
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    assert hasattr(self.dataset, 'flag')
    self.flag = self.dataset.flag
    self.group_sizes = np.bincount(self.flag)

    self.num_samples = 0
    for size in self.group_sizes:
        # round each group up to a whole number of per-GPU batches per replica
        self.num_samples += int(
            math.ceil(size * 1.0 / self.samples_per_gpu /
                      self.num_replicas)) * self.samples_per_gpu
    self.total_size = self.num_samples * self.num_replicas
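# Worked example (illustrative assumption, not from the original file) of the
# sample-count padding above: with group_sizes = [5, 3], samples_per_gpu = 2
# and num_replicas = 2, each group is rounded up to whole per-GPU batches:
#   group 0: ceil(5 / 2 / 2) * 2 = 4 samples per replica
#   group 1: ceil(3 / 2 / 2) * 2 = 2 samples per replica
# so num_samples = 6 and total_size = 12, i.e. some indices are repeated so
# that every rank draws the same number of same-group batches.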
def get_world_size():
    if run_herring:
        return herring.get_world_size()
    else:
        if not dist.is_available():
            return 1
        if not dist.is_initialized():
            return 1
        return dist.get_world_size()
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce gradients.

    Args:
        params (list[torch.nn.Parameter]): List of parameters of a model.
        coalesce (bool, optional): Whether to allreduce the gradients as a
            whole (coalesced into buckets). Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size))
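# Hedged usage sketch (assumption, not in the original file): this helper is
# typically called after backward() when gradients are averaged manually
# instead of relying on DistributedDataParallel's gradient hooks.
#
#   loss.backward()
#   allreduce_grads(list(model.parameters()), coalesce=True, bucket_size_mb=-1)
#   optimizer.step()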
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if run_herring:
            num_replicas = herring.get_world_size()
        else:
            if not dist.is_available():
                raise RuntimeError(
                    "Requires distributed package to be available")
            num_replicas = dist.get_world_size()
    if rank is None:
        if run_herring:
            rank = herring.get_rank()
        else:
            if not dist.is_available():
                raise RuntimeError(
                    "Requires distributed package to be available")
            rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(
        math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
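# Hedged usage sketch (assumed, mirroring torch.utils.data.DistributedSampler;
# `set_epoch` is the conventional API and is not shown in this snippet):
#
#   sampler = DistributedSampler(train_dataset, shuffle=True)
#   loader = DataLoader(train_dataset, batch_size=2, sampler=sampler)
#   for epoch in range(num_epochs):
#       sampler.set_epoch(epoch)   # keeps shuffling consistent across ranks
#       for batch in loader:
#           ...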
def main():
    configure_logger(constants.MASKRCNN)
    log_start(key=constants.INIT_START)

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int,
                        default=herring.get_local_rank())
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1
    args.local_rank = herring.get_local_rank()

    # if is_main_process:
    #     # Setting logging file parameters for compliance logging
    #     os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(datetime.datetime.now())
    #     constants.LOG_FILE = os.getenv("COMPLIANCE_FILE")
    #     constants._FILE_HANDLER = logging.FileHandler(constants.LOG_FILE)
    #     constants._FILE_HANDLER.setLevel(logging.DEBUG)
    #     constants.LOGGER.addHandler(constants._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)

        # setting seeds - needs to be timed, so after RUN_START
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed, dtype=torch.float32,
                                       device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0, dtype=torch.float32,
                                       device=torch.device("cuda"))

        herring.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    log_event(key=constants.SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(random_number_generator,
                                  herring.get_world_size())

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # Initialise async eval
    init()

    model, success = train(cfg, args.local_rank, args.distributed,
                           random_number_generator)

    if success is not None:
        if success:
            log_end(key=constants.RUN_STOP, metadata={"status": "success"})
        else:
            log_end(key=constants.RUN_STOP, metadata={"status": "aborted"})
def build_dataloader(dataset,
                     samples_per_gpu,
                     workers_per_gpu,
                     num_gpus=1,
                     dist=True,
                     shuffle=True,
                     seed=None,
                     **kwargs):
    """Build PyTorch DataLoader.

    In distributed training, each GPU/process has a dataloader.
    In non-distributed training, there is only one dataloader for all GPUs.

    Args:
        dataset (Dataset): A PyTorch dataset.
        samples_per_gpu (int): Number of training samples on each GPU, i.e.,
            batch size of each GPU.
        workers_per_gpu (int): How many subprocesses to use for data loading
            for each GPU.
        num_gpus (int): Number of GPUs. Only used in non-distributed training.
        dist (bool): Distributed training/test or not. Default: True.
        shuffle (bool): Whether to shuffle the data at every epoch.
            Default: True.
        seed (int, optional): Seed used by the dataloader workers.
            Default: None.
        kwargs: any keyword argument to be used to initialize DataLoader.

    Returns:
        DataLoader: A PyTorch dataloader.
    """
    # the `dist` argument shadows torch.distributed here, so rank and world
    # size are obtained through mmcv's get_dist_info() helper
    rank, world_size = get_dist_info()
    if dist:
        # DistributedGroupSampler will definitely shuffle the data to satisfy
        # that images on each GPU are in the same group
        if shuffle:
            sampler = DistributedGroupSampler(dataset, samples_per_gpu,
                                              world_size, rank)
        else:
            sampler = DistributedSampler(
                dataset, world_size, rank, shuffle=False)
        batch_size = samples_per_gpu
        num_workers = workers_per_gpu
    else:
        sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
        batch_size = num_gpus * samples_per_gpu
        num_workers = num_gpus * workers_per_gpu

    init_fn = partial(
        worker_init_fn, num_workers=num_workers, rank=rank,
        seed=seed) if seed is not None else None

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
        pin_memory=False,
        worker_init_fn=init_fn,
        **kwargs)

    return data_loader
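# Hedged usage sketch (illustrative only; `train_dataset` and the argument
# values are assumptions, not taken from the original file):
#
#   loader = build_dataloader(
#       train_dataset,
#       samples_per_gpu=2,
#       workers_per_gpu=2,
#       dist=True,        # one dataloader per process in distributed training
#       shuffle=True,
#       seed=42)
#   for data_batch in loader:
#       ...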
def reduce_mean(tensor):
    """Average a tensor over all distributed workers."""
    if not (dist.is_available() and dist.is_initialized()):
        return tensor
    tensor = tensor.clone()
    dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
    return tensor
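# Hedged usage sketch (assumption): averaging a scalar metric so that every
# rank logs the same value; `compute_accuracy` is a hypothetical helper.
#
#   acc = compute_accuracy(pred, target)
#   acc = reduce_mean(acc)   # mean over all participating ranks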
def make_data_sampler(dataset, shuffle, distributed):
    if distributed:
        if run_herring:
            return samplers.DistributedSampler(
                dataset,
                shuffle=shuffle,
                num_replicas=herring.get_world_size(),
                rank=herring.get_rank())
        else:
            return samplers.DistributedSampler(dataset, shuffle=shuffle)
    if shuffle:
        sampler = torch.utils.data.sampler.RandomSampler(dataset)
    else:
        sampler = torch.utils.data.sampler.SequentialSampler(dataset)
    return sampler