def forward(self, z_i, z_j):
    device_size = z_i.shape[0]
    batch_size = device_size * comm.get_world_size()
    local_rank = comm.get_rank()
    neg_perm = torch.randperm(batch_size - 1)[:self.K]

    if comm.get_world_size() > 1:
        group = comm._get_global_gloo_group()
        zi_large = [torch.zeros_like(z_i) for _ in range(comm.get_world_size())]
        zj_large = [torch.zeros_like(z_j) for _ in range(comm.get_world_size())]
        dist.all_gather(zi_large, z_i, group=group)
        dist.all_gather(zj_large, z_j, group=group)
        # share a single negative permutation across workers (use rank 0's)
        choices = [
            torch.zeros_like(neg_perm, dtype=torch.int64)
            for _ in range(comm.get_world_size())
        ]
        dist.all_gather(choices, neg_perm, group=group)
        neg_perm = choices[0]
    else:
        zi_large = [z_i]
        zj_large = [z_j]

    # all_gather returns detached tensors; re-insert the local tensor so
    # gradients still flow through this worker's embeddings
    zi_large[local_rank] = z_i
    zi_large = torch.cat(zi_large)
    zj_large = torch.cat(zj_large)

    sim_i_large = self.similarity_f(
        zi_large.unsqueeze(1), zj_large.unsqueeze(0)) / self.temperature
    positive_samples_i = sim_i_large[self.pos_mask_i].reshape(batch_size, 1)
    negative_samples_i = sim_i_large[self.neg_mask_i].reshape(
        batch_size, -1)[:, neg_perm]

    labels_i = torch.zeros(batch_size).to(self.device).long()
    logits_i = torch.cat((positive_samples_i, negative_samples_i), dim=1)

    # EqCo
    loss_i = torch.log(
        torch.exp(positive_samples_i) +
        # self.alpha / negative_samples_i.shape[1] *  # uncomment this when negatives != bs
        torch.exp(negative_samples_i).sum(dim=-1, keepdim=True)
    ) - positive_samples_i
    loss_i = loss_i.sum() / device_size

    acc1, acc5 = accuracy(logits_i, labels_i, topk=(1, 5))
    return loss_i, acc1, acc5
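# A minimal, single-process sketch of the EqCo-adjusted InfoNCE term computed
# above (hypothetical helper, not part of this module). With K negatives and
# margin alpha, the loss is log(exp(pos) + alpha/K * sum_k exp(neg_k)) - pos;
# the forward above omits the alpha/K factor because it uses K == batch size.
import torch


def eqco_infonce_sketch(pos, neg, alpha):
    """pos: (B, 1) positive similarities, neg: (B, K) negative similarities."""
    K = neg.shape[1]
    loss = torch.log(
        torch.exp(pos) + alpha / K * torch.exp(neg).sum(dim=-1, keepdim=True)
    ) - pos
    return loss.mean()


# usage on random similarities (illustrative shapes only):
pos = torch.randn(8, 1)
neg = torch.randn(8, 256)
print(eqco_infonce_sketch(pos, neg, alpha=256.0))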
def forward(self, outputs, targets):
    """
    This performs the loss computation.

    Parameters:
        outputs: dict of tensors, see the output specification of the model for the format
        targets: list of dicts, such that len(targets) == batch_size.
                 The expected keys in each dict depend on the losses applied, see each loss' doc
    """
    outputs_without_aux = {
        k: v for k, v in outputs.items() if k != "aux_outputs"
    }

    # Retrieve the matching between the outputs of the last layer and the targets
    indices = self.matcher(outputs_without_aux, targets)

    # Compute the average number of target boxes across all nodes, for normalization purposes
    num_boxes = sum(len(t["labels"]) for t in targets)
    num_boxes = torch.as_tensor([num_boxes],
                                dtype=torch.float,
                                device=next(iter(outputs.values())).device)
    if comm.get_world_size() > 1:
        torch.distributed.all_reduce(num_boxes)
    num_boxes = torch.clamp(num_boxes / comm.get_world_size(), min=1).item()

    # Compute all the requested losses
    losses = {}
    for loss in self.losses:
        losses.update(
            self.get_loss(loss, outputs, targets, indices, num_boxes))

    # In case of auxiliary losses, we repeat this process with the output of
    # each intermediate layer.
    if "aux_outputs" in outputs:
        for i, aux_outputs in enumerate(outputs["aux_outputs"]):
            indices = self.matcher(aux_outputs, targets)
            for loss in self.losses:
                if loss == "masks":
                    # Intermediate masks losses are too costly to compute, we ignore them.
                    continue
                kwargs = {}
                if loss == "labels":
                    # Logging is enabled only for the last layer
                    kwargs = {"log": False}
                l_dict = self.get_loss(loss, aux_outputs, targets, indices,
                                       num_boxes, **kwargs)
                l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                losses.update(l_dict)

    return losses
def aux_losses(self, gt_classes, pred_class_logits):
    pred_class_logits = cat([
        permute_to_N_HWA_K(x, self.num_classes) for x in pred_class_logits
    ], dim=1).view(-1, self.num_classes)

    gt_classes = gt_classes.flatten()

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    num_foreground = comm.all_reduce(num_foreground) / float(
        comm.get_world_size())

    # logits loss
    loss_cls_aux = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / max(1.0, num_foreground)

    return {"loss_cls_aux": loss_cls_aux}
def adjust_config(cfg):
    base_world_size = int(cfg.SOLVER.IMS_PER_BATCH / cfg.SOLVER.IMS_PER_DEVICE)
    # Batch size, learning rate and max_iter in the original config are tuned for 8 GPUs
    assert base_world_size == 8, "IMS_PER_BATCH/DEVICE in config file is used for 8 GPUs"
    world_size = comm.get_world_size()
    machines_ratio = world_size / base_world_size

    # ------ adjust batch_size ---------- #
    cfg.SOLVER.IMS_PER_BATCH = int(machines_ratio * cfg.SOLVER.IMS_PER_BATCH)
    assert (
        cfg.SOLVER.IMS_PER_BATCH / cfg.SOLVER.IMS_PER_DEVICE == world_size
    ), "IMS_PER_BATCH ({}) not equal to IMS_PER_DEVICE ({}) * world_size ({})".format(
        cfg.SOLVER.IMS_PER_BATCH, cfg.SOLVER.IMS_PER_DEVICE, world_size)
    check_subdivision_config(cfg)

    # ------- adjust scheduler --------- #
    # since we use the new IMS_PER_BATCH value, an epoch-based schedule needs no rescaling
    if cfg.SOLVER.LR_SCHEDULER.MAX_EPOCH is None:
        cfg.SOLVER.LR_SCHEDULER.MAX_ITER = int(
            cfg.SOLVER.LR_SCHEDULER.MAX_ITER / machines_ratio)
        cfg.SOLVER.LR_SCHEDULER.STEPS = [
            int(step / machines_ratio)
            for step in cfg.SOLVER.LR_SCHEDULER.STEPS
        ]
        cfg.SOLVER.CHECKPOINT_PERIOD = int(
            cfg.SOLVER.CHECKPOINT_PERIOD / machines_ratio)
        cfg.TEST.EVAL_PERIOD = int(cfg.TEST.EVAL_PERIOD / machines_ratio)

    if "SGD" in cfg.SOLVER.OPTIMIZER.NAME:
        # adjust learning rate according to the linear scaling rule
        cfg.SOLVER.OPTIMIZER.BASE_LR = machines_ratio * cfg.SOLVER.OPTIMIZER.BASE_LR
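# Worked example of the linear-scaling rule implemented above. The numbers are
# illustrative, not taken from any config in this repo: going from the 8-GPU
# base config to 32 GPUs multiplies batch size and LR by 4 and divides the
# iteration-based schedule by 4.
base_world_size, world_size = 8, 32
machines_ratio = world_size / base_world_size   # 4.0
ims_per_batch = int(machines_ratio * 16)        # 16 -> 64
base_lr = machines_ratio * 0.02                 # 0.02 -> 0.08 (SGD only)
max_iter = int(90000 / machines_ratio)          # 90000 -> 22500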
def forward(self, batched_inputs):
    cur_bs = len(batched_inputs)
    t_inputs = [bi["t"] for bi in batched_inputs]
    p_inputs = [bi["t_prime"] for bi in batched_inputs]

    y1 = self.preprocess_image([bi["image"][0] for bi in t_inputs])
    y2 = self.preprocess_image([bi["image"][0] for bi in p_inputs])

    z1 = self.projector(self.backbone(y1)["linear"])
    z2 = self.projector(self.backbone(y2)["linear"])

    # empirical cross-correlation matrix
    c = self.bn(z1).T @ self.bn(z2)

    # sum the cross-correlation matrix between all gpus
    c.div_(cur_bs * comm.get_world_size())
    torch.distributed.all_reduce(c)

    # use --scale-loss to multiply the loss by a constant factor
    # see the Issues section of the readme
    on_diag = torch.diagonal(c).add_(-1).pow_(2).sum().mul(self.scale_loss)
    off_diag = off_diagonal(c).pow_(2).sum().mul(self.scale_loss)
    loss = on_diag + self.lambd * off_diag

    return dict(loss=loss)
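# Single-process sketch of the Barlow Twins objective computed above, with a
# hypothetical `off_diagonal` helper matching the one this forward relies on;
# the simple standardization stands in for the BN layer, and 0.0051 is the
# lambda value from the Barlow Twins paper.
import torch


def off_diagonal(x):
    # return a flattened view of the off-diagonal elements of a square matrix
    n, m = x.shape
    assert n == m
    return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()


z1, z2 = torch.randn(32, 128), torch.randn(32, 128)
z1 = (z1 - z1.mean(0)) / z1.std(0)   # stand-in for self.bn in forward()
z2 = (z2 - z2.mean(0)) / z2.std(0)
c = z1.T @ z2 / z1.shape[0]          # empirical cross-correlation matrix
on_diag = torch.diagonal(c).add_(-1).pow_(2).sum()
off_diag = off_diagonal(c).pow_(2).sum()
loss = on_diag + 0.0051 * off_diag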
def stage_main(args, cfg, build):
    logger = logging.getLogger(__name__)
    assert comm.get_world_size() == 1, "DEBUG mode only supported for 1 GPU"

    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)
    model = build(cfg)
    optimizer = build_optimizer(cfg, model)
    debug_ckpt = Checkpointer(model, resume=True, optimizer=optimizer)

    ckpt_file = args.ckpt_file
    if ckpt_file is None:
        # find the latest checkpoint in the log dir if ckpt_file is not given
        log_dir = "./log"
        matched_files = [
            os.path.join(log_dir, files) for files in os.listdir(log_dir)
            if re.match("debug_.*.pth", files) is not None
        ]
        ckpt_file = sorted(matched_files, key=os.path.getatime)[-1]

    left_dict = debug_ckpt.load(ckpt_file)
    assert "inputs" in left_dict, "input data not found in checkpoints"
    data = left_dict["inputs"]

    trainer = DebugTrainer(model, data, optimizer)
    logger.info("start running model")
    trainer.run_step()
    logger.info("finish debugging")
def __init__(self, dataset, repeat_thresh, shuffle=True, seed=None):
    """
    Args:
        dataset (Dataset): dataset used for sampling.
        repeat_thresh (float): frequency threshold below which data is repeated.
        shuffle (bool): whether to shuffle the indices or not.
        seed (int): the initial seed of the shuffle. Must be the same across
            all workers. If None, will use a random seed shared among workers
            (require synchronization among all workers).
    """
    self._shuffle = shuffle
    if seed is None:
        seed = comm.shared_random_seed()
    self._seed = int(seed)

    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()

    dataset_dicts = []
    if hasattr(dataset, "datasets"):
        for d in dataset.datasets:
            dataset_dicts += d.dataset_dicts
    else:
        dataset_dicts = dataset.dataset_dicts

    # Get fractional repeat factors and split into whole number (_int_part)
    # and fractional (_frac_part) parts.
    rep_factors = self._get_repeat_factors(dataset_dicts, repeat_thresh)
    self._int_part = torch.trunc(rep_factors)
    self._frac_part = rep_factors - self._int_part
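# Sketch of how the integer/fractional split above is typically consumed per
# epoch, assuming the standard stochastic-rounding scheme for repeat factors:
# each image is repeated floor(r) times, plus one more with probability
# frac(r), so repeats average to r over epochs. Values are illustrative.
import torch

rep_factors = torch.tensor([1.0, 1.3, 2.7])
int_part = torch.trunc(rep_factors)
frac_part = rep_factors - int_part

g = torch.Generator()
g.manual_seed(0)  # the sampler would use self._seed + epoch here
rands = torch.rand(len(frac_part), generator=g)
per_epoch_repeats = (int_part + (rands < frac_part).float()).long()
indices = torch.repeat_interleave(torch.arange(3), per_epoch_repeats)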
def forward(self, input):
    if comm.get_world_size() == 1 or not self.training:
        return super().forward(input)

    assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
    C = input.shape[1]
    mean = torch.mean(input, dim=[0])
    meansqr = torch.mean(input * input, dim=[0])

    vec = torch.cat([mean, meansqr], dim=0)
    vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())

    mean, meansqr = torch.split(vec, C)
    var = meansqr - mean * mean
    self.running_mean += self.momentum * (mean.detach() - self.running_mean)
    self.running_var += self.momentum * (var.detach() - self.running_var)

    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    scale = scale.reshape(1, -1)
    bias = bias.reshape(1, -1)
    return input * scale + bias
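# Single-process sanity check of the scale/bias algebra above: for 2-D input
# (N, C), the affine form `input * scale + bias` should match F.batch_norm
# with the same (biased) batch statistics. A sketch, not a distributed test;
# all names below are local to the example.
import torch
import torch.nn.functional as F

x = torch.randn(16, 8)
weight, bias_p, eps = torch.ones(8), torch.zeros(8), 1e-5
mean, var = x.mean(dim=0), x.var(dim=0, unbiased=False)
invstd = torch.rsqrt(var + eps)
scale = weight * invstd
bias = bias_p - mean * scale
out = x * scale.reshape(1, -1) + bias.reshape(1, -1)
ref = F.batch_norm(x, None, None, weight, bias_p, training=True, eps=eps)
assert torch.allclose(out, ref, atol=1e-5)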
def benchmark_train(args):
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False)
    optimizer = build_optimizer(cfg, model)
    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0
    data_loader = build_detection_train_loader(cfg)
    dummy_data = list(itertools.islice(data_loader, 100))

    def f():
        while True:
            yield from DatasetFromList(dummy_data, copy=False)

    max_iter = 400
    trainer = SimpleTrainer(model, f(), optimizer)
    trainer.register_hooks([
        hooks.IterationTimer(),
        hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])
    ])
    trainer.train(1, max_iter)
def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None):
    """
    Args:
        dataset (Dataset): Dataset used for sampling.
        samples_per_gpu (int): number of samples drawn per GPU in each batch.
        num_replicas (optional): Number of processes participating in
            distributed training.
        rank (optional): Rank of the current process within num_replicas.
    """
    _rank = comm.get_rank()
    _num_replicas = comm.get_world_size()
    if num_replicas is None:
        num_replicas = _num_replicas
    if rank is None:
        rank = _rank
    self.dataset = dataset
    self.samples_per_gpu = samples_per_gpu
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    assert hasattr(self.dataset, 'aspect_ratios')
    self.aspect_ratios = self.dataset.aspect_ratios
    self.group_sizes = np.bincount(self.aspect_ratios)

    # pad each aspect-ratio group so it splits evenly into GPU-sized chunks
    self.num_samples = 0
    for size in self.group_sizes:
        self.num_samples += int(
            math.ceil(size * 1.0 / self.samples_per_gpu /
                      self.num_replicas)) * self.samples_per_gpu
    self.total_size = self.num_samples * self.num_replicas
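# Worked example of the per-group padding arithmetic above (illustrative
# numbers, not from any dataset): two aspect-ratio groups of 30 and 70 images,
# 2 samples per GPU, 4 replicas.
import math

group_sizes, samples_per_gpu, num_replicas = [30, 70], 2, 4
num_samples = 0
for size in group_sizes:
    num_samples += int(
        math.ceil(size * 1.0 / samples_per_gpu / num_replicas)) * samples_per_gpu
# ceil(30/8)*2 + ceil(70/8)*2 = 8 + 18 = 26 samples per replica
total_size = num_samples * num_replicas  # 104 indices drawn overall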
def build_detection_train_loader(cfg):
    """
    A data loader is created by the following steps:

    1. Use the dataset names in config to query :class:`DatasetCatalog`,
       and obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:
        * Map each metadata dict into another format to be consumed by the model.
        * Batch them by simply putting dicts into a list.

    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config

    Returns:
        an infinite iterator of training data
    """
    # To simulate large batch training
    num_devices = comm.get_world_size()
    rank = comm.get_rank()
    # use subdivision batch size
    images_per_minibatch = cfg.SOLVER.IMS_PER_DEVICE // cfg.SOLVER.BATCH_SUBDIVISIONS

    logger = logging.getLogger(__name__)

    transform_gens = build_transform_gen(cfg.INPUT.AUG.TRAIN_PIPELINES)
    logger.info(f"TransformGens used: {transform_gens} in training")
    dataset = build_dataset(cfg,
                            cfg.DATASETS.TRAIN,
                            transforms=transform_gens,
                            is_train=True)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger.info("Using training sampler {}".format(sampler_name))
    assert sampler_name in SAMPLERS, "{} not found in SAMPLERS".format(
        sampler_name)
    if sampler_name == "TrainingSampler":
        sampler = SAMPLERS.get(sampler_name)(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = SAMPLERS.get(sampler_name)(dataset,
                                             cfg.DATALOADER.REPEAT_THRESHOLD)
    elif sampler_name == "DistributedGroupSampler":
        sampler = SAMPLERS.get(sampler_name)(dataset, images_per_minibatch,
                                             num_devices, rank)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=images_per_minibatch,
        sampler=sampler,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        collate_fn=trivial_batch_collator,
        worker_init_fn=worker_init_reset_seed,
    )

    adjust_epoch_and_iter(cfg, data_loader)

    return data_loader
def losses(self, gt_classes, gt_shifts_deltas, pred_class_logits,
           pred_shift_deltas, pred_filtering):
    """
    Args:
        For `gt_classes` and `gt_shifts_deltas` parameters, see
            :meth:`FCOS.get_ground_truth`.
        Their shapes are (N, R) and (N, R, 4), respectively, where R is
        the total number of shifts across levels, i.e. sum(Hi x Wi)
        For `pred_class_logits`, `pred_shift_deltas` and `pred_filtering`,
            see :meth:`FCOSHead.forward`.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a scalar tensor storing the loss.
            Used during training only. The dict keys are:
            "loss_cls" and "loss_box_reg"
    """
    pred_class_logits, pred_shift_deltas, pred_filtering = \
        permute_all_cls_and_box_to_N_HWA_K_and_concat(
            pred_class_logits, pred_shift_deltas, pred_filtering,
            self.num_classes
        )  # Shapes: (N x R, K) and (N x R, 4), respectively.

    gt_classes = gt_classes.flatten()
    gt_shifts_deltas = gt_shifts_deltas.view(-1, 4)

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    num_foreground = comm.all_reduce(num_foreground) / float(
        comm.get_world_size())

    pred_class_logits = pred_class_logits.sigmoid() * pred_filtering.sigmoid()

    # logits loss
    loss_cls = focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / max(1.0, num_foreground)

    # regression loss
    loss_box_reg = iou_loss(
        pred_shift_deltas[foreground_idxs],
        gt_shifts_deltas[foreground_idxs],
        box_mode="ltrb",
        loss_type=self.iou_loss_type,
        reduction="sum",
    ) / max(1.0, num_foreground) * self.reg_weight

    return {
        "loss_cls": loss_cls,
        "loss_box_reg": loss_box_reg,
    }
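# Reference sketch of a focal loss that consumes probabilities, consistent
# with the call above (the sigmoids are applied before focal_loss_jit). This
# is the standard RetinaNet formulation restated for probability inputs; the
# actual jit'd kernel may differ in clamping details.
import torch


def focal_loss_sketch(probs, targets, alpha=0.25, gamma=2.0):
    ce = -(targets * torch.log(probs.clamp(min=1e-6))
           + (1 - targets) * torch.log((1 - probs).clamp(min=1e-6)))
    p_t = probs * targets + (1 - probs) * (1 - targets)
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    return (alpha_t * (1 - p_t) ** gamma * ce).sum()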
def resume_or_load(self, resume=True):
    super().resume_or_load(resume)
    if comm.get_world_size() > 1:
        self.model.module.steps = self.start_iter
        self.model.module.epoch = self.start_epoch
    else:
        self.model.steps = self.start_iter
        self.model.epoch = self.start_epoch
def build_model(cfg):
    cfg.build_backbone = build_backbone
    model = BYOL(cfg)
    if comm.get_world_size() > 1:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    return model
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    adjust_config(cfg)
    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))

    # if comm.is_main_process() and output_dir:
    #     # Note: some of our scripts may expect the existence of
    #     # config.yaml in output directory
    #     path = os.path.join(output_dir, "config.yaml")
    #     with PathManager.open(path, "w") as f:
    #         f.write(cfg.dump())
    #     logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed = seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
    # save seed to config for dump
    cfg.SEED = seed

    # cudnn benchmark has large overhead. It shouldn't be used considering the
    # small size of typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    return cfg, logger
def __init__(self, cfg, model_build_func):
    """
    Args:
        cfg (BaseConfig):
    """
    logger = logging.getLogger("cvpods")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()

    self.start_iter = 0
    data_loader = self.build_train_loader(cfg)
    epoch_iters = adjust_epoch_and_iter(cfg, data_loader)
    self.max_iter = cfg.SOLVER.LR_SCHEDULER.MAX_ITER
    self.max_epoch = cfg.SOLVER.LR_SCHEDULER.MAX_EPOCH

    model = model_build_func(cfg)
    model = maybe_convert_module(model)
    logger.info(f"Model structure: {model}")

    # Assume these objects must be constructed in this order.
    optimizer = self.build_optimizer(cfg, model)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False,
                                        find_unused_parameters=True)

    # TODO: @wangfeng02, `batch_subdivisions`
    super().__init__(model, data_loader, optimizer,
                     cfg.SOLVER.BATCH_SUBDIVISIONS)

    if not cfg.SOLVER.LR_SCHEDULER.get("EPOCH_WISE", False):
        epoch_iters = -1
    self.scheduler = self.build_lr_scheduler(cfg,
                                             optimizer,
                                             epoch_iters=epoch_iters)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def __init__(self, size: int):
    """
    Args:
        size (int): the total number of data of the underlying dataset to sample from
    """
    self._size = size
    assert size > 0
    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()

    shard_size = (self._size - 1) // self._world_size + 1
    begin = shard_size * self._rank
    end = min(shard_size * (self._rank + 1), self._size)
    self._local_indices = range(begin, end)
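# Worked example of the sharding above (illustrative numbers): size=10 over 3
# workers gives shard_size=4 and contiguous, non-overlapping, possibly uneven
# shards that together cover every index exactly once.
size, world_size = 10, 3
shard_size = (size - 1) // world_size + 1            # 4
shards = []
for rank in range(world_size):
    begin = shard_size * rank
    end = min(shard_size * (rank + 1), size)
    shards.append(list(range(begin, end)))
# shards == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]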
def __init__(self, device_size, temperature, alpha, K, device):
    super(NT_Xent, self).__init__()
    self.device_size = device_size
    self.temperature = temperature
    self.alpha = alpha
    self.K = K
    self.device = device

    self.similarity_f = nn.CosineSimilarity(dim=2)
    pos_mask_i, neg_mask_i = \
        self.mask_correlated_samples(comm.get_world_size(), self.device_size)
    self.pos_mask_i = pos_mask_i.to(self.device)
    self.neg_mask_i = neg_mask_i.to(self.device)
def __init__(self, device_size, temperature, device):
    super(NT_Xent, self).__init__()
    self.device_size = device_size
    self.temperature = temperature
    self.device = device

    self.criterion = nn.CrossEntropyLoss(reduction="sum")
    self.similarity_f = nn.CosineSimilarity(dim=2)
    pos_mask_i, pos_mask_j, neg_mask_i, neg_mask_j = \
        self.mask_correlated_samples(
            comm.get_world_size() * self.device_size, self.device_size)
    self.pos_mask_i = pos_mask_i.to(self.device)
    self.neg_mask_i = neg_mask_i.to(self.device)
    self.pos_mask_j = pos_mask_j.to(self.device)
    self.neg_mask_j = neg_mask_j.to(self.device)
def distributed_sinkhorn(Q, nmb_iters):
    with torch.no_grad():
        Q = shoot_infs(Q)
        sum_Q = torch.sum(Q)
        dist.all_reduce(sum_Q)
        Q /= sum_Q
        r = torch.ones(Q.shape[0]).cuda(non_blocking=True) / Q.shape[0]
        c = torch.ones(Q.shape[1]).cuda(
            non_blocking=True) / (comm.get_world_size() * Q.shape[1])
        for it in range(nmb_iters):
            u = torch.sum(Q, dim=1)
            dist.all_reduce(u)
            u = r / u
            u = shoot_infs(u)
            Q *= u.unsqueeze(1)
            Q *= (c / torch.sum(Q, dim=0)).unsqueeze(0)
        return (Q / torch.sum(Q, dim=0, keepdim=True)).t().float()
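# Single-process sketch of the Sinkhorn-Knopp normalization above: when
# world_size == 1 the all_reduce calls drop out, leaving alternating row and
# column rescaling toward target marginals r and c. Shapes are illustrative.
import torch

Q = torch.rand(16, 64)
Q /= Q.sum()
r = torch.ones(16) / 16     # target row marginal
c = torch.ones(64) / 64     # target column marginal
for _ in range(3):
    Q *= (r / Q.sum(dim=1)).unsqueeze(1)   # row normalization
    Q *= (c / Q.sum(dim=0)).unsqueeze(0)   # column normalization
assignments = (Q / Q.sum(dim=0, keepdim=True)).t()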
def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
    """
    Args:
        size (int): the total number of data of the underlying dataset to sample from
        shuffle (bool): whether to shuffle the indices or not
        seed (int): the initial seed of the shuffle. Must be the same across
            all workers. If None, will use a random seed shared among workers
            (require synchronization among all workers).
    """
    self._size = size
    assert size > 0
    self._shuffle = shuffle
    if seed is None:
        seed = comm.shared_random_seed()
    self._seed = int(seed)

    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()
def main(args):
    config.merge_from_list(args.opts)
    cfg = setup(args)

    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if args.eval_only:
        DefaultCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False)

    do_train(cfg, model)
    return do_test(cfg, model)
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        ensure_dir(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            megfile.smart_open(args.config_file, "r").read()))

    adjust_config(cfg)

    # make sure each worker has a different, yet deterministic seed if specified
    seed = seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
    # save seed to config for dump
    cfg.SEED = seed

    # cudnn benchmark has large overhead. It shouldn't be used considering the
    # small size of typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    return cfg
def __init__(self, cfg):
    super(SwAV, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.D = cfg.MODEL.SWAV.D
    self.K = cfg.MODEL.SWAV.K
    self.K_start = cfg.MODEL.SWAV.K_START
    self.P = cfg.MODEL.SWAV.P
    self.T = cfg.MODEL.SWAV.TAU
    self.EPS = cfg.MODEL.SWAV.EPS
    self.SK_ITERS = cfg.MODEL.SWAV.SK_ITERS
    self.improve_numerical_stability = cfg.MODEL.SWAV.NUMERICAL_STABILITY
    self.crops_for_assign = cfg.MODEL.SWAV.CROPS_FOR_ASSIGN
    self.nmb_crops = cfg.MODEL.SWAV.NMB_CROPS

    self.network = resnet_models.__dict__[cfg.MODEL.SWAV.ARCH](
        normalize=True,
        hidden_mlp=cfg.MODEL.SWAV.HIDDEN_MLP,
        output_dim=cfg.MODEL.SWAV.D,
        nmb_prototypes=cfg.MODEL.SWAV.P,
    )

    # create the queue
    self.register_buffer(
        "queue",
        torch.zeros(len(self.crops_for_assign),
                    self.K // comm.get_world_size(), self.D),
    )
    self.use_the_queue = False

    # self.linear_eval = nn.Linear(encoder_dim, 1000)
    # self.loss_evaluator = nn.CrossEntropyLoss()
    self.softmax = nn.Softmax(dim=1)

    self.to(self.device)
def losses(
    self,
    gt_classes,
    gt_shifts_deltas,
    gt_centerness,
    gt_classes_border,
    gt_deltas_border,
    pred_class_logits,
    pred_shift_deltas,
    pred_centerness,
    border_box_cls,
    border_bbox_reg,
):
    """
    Args:
        For `gt_classes`, `gt_shifts_deltas` and `gt_centerness` parameters,
            see :meth:`BorderDet.get_ground_truth`.
        Their shapes are (N, R) and (N, R, 4), respectively, where R is
        the total number of shifts across levels, i.e. sum(Hi x Wi)
        For `pred_class_logits`, `pred_shift_deltas` and `pred_centerness`,
            see :meth:`BorderHead.forward`.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a scalar tensor storing the loss.
            Used during training only. The dict keys are:
            "loss_cls" and "loss_box_reg"
    """
    (
        pred_class_logits,
        pred_shift_deltas,
        pred_centerness,
        border_class_logits,
        border_shift_deltas,
    ) = permute_all_cls_and_box_to_N_HWA_K_and_concat(
        pred_class_logits, pred_shift_deltas, pred_centerness,
        border_box_cls, border_bbox_reg, self.num_classes
    )  # Shapes: (N x R, K) and (N x R, 4), respectively.

    # fcos
    gt_classes = gt_classes.flatten()
    gt_shifts_deltas = gt_shifts_deltas.view(-1, 4)
    gt_centerness = gt_centerness.view(-1, 1)

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    num_foreground = comm.all_reduce(num_foreground) / float(
        comm.get_world_size())
    num_foreground_centerness = gt_centerness[foreground_idxs].sum()
    num_targets = comm.all_reduce(num_foreground_centerness) / float(
        comm.get_world_size())

    # logits loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / max(1.0, num_foreground)

    # regression loss
    loss_box_reg = iou_loss(
        pred_shift_deltas[foreground_idxs],
        gt_shifts_deltas[foreground_idxs],
        gt_centerness[foreground_idxs],
        box_mode="ltrb",
        loss_type=self.iou_loss_type,
        reduction="sum",
    ) / max(1.0, num_targets)

    # centerness loss
    loss_centerness = F.binary_cross_entropy_with_logits(
        pred_centerness[foreground_idxs],
        gt_centerness[foreground_idxs],
        reduction="sum",
    ) / max(1.0, num_foreground)

    # borderdet
    gt_classes_border = gt_classes_border.flatten()
    gt_deltas_border = gt_deltas_border.view(-1, 4)

    valid_idxs_border = gt_classes_border >= 0
    foreground_idxs_border = (gt_classes_border >= 0) & (
        gt_classes_border != self.num_classes)
    num_foreground_border = foreground_idxs_border.sum()

    gt_classes_border_target = torch.zeros_like(border_class_logits)
    gt_classes_border_target[
        foreground_idxs_border,
        gt_classes_border[foreground_idxs_border]] = 1

    num_foreground_border = (comm.all_reduce(num_foreground_border) /
                             float(comm.get_world_size()))
    num_foreground_border = max(num_foreground_border, 1.0)

    loss_border_cls = sigmoid_focal_loss_jit(
        border_class_logits[valid_idxs_border],
        gt_classes_border_target[valid_idxs_border],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / num_foreground_border

    if foreground_idxs_border.numel() > 0:
        loss_border_reg = (
            smooth_l1_loss(border_shift_deltas[foreground_idxs_border],
                           gt_deltas_border[foreground_idxs_border],
                           beta=0,
                           reduction="sum") / num_foreground_border)
    else:
        loss_border_reg = border_shift_deltas.sum()

    return {
        "loss_cls": loss_cls,
        "loss_box_reg": loss_box_reg,
        "loss_centerness": loss_centerness,
        "loss_border_cls": loss_border_cls,
        "loss_border_reg": loss_border_reg,
    }
def _load_instance_annotations(self,
                               image_dir,
                               gt_dir,
                               from_json=True,
                               to_polygons=True):
    """
    Args:
        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
        from_json (bool): whether to read annotations from the raw json file or the png files.
        to_polygons (bool): whether to represent the segmentation as polygons
            (COCO's format) instead of masks (cityscapes's format).

    Returns:
        list[dict]: a list of dicts in cvpods standard format. (See
        `Using Custom Datasets </tutorials/datasets.html>`_ )
    """
    if from_json:
        assert to_polygons, (
            "Cityscapes's json annotations are in polygon format. "
            "Converting to mask format is not supported now.")

    files = []
    for image_file in glob.glob(os.path.join(image_dir, "**/*.png")):
        suffix = "leftImg8bit.png"
        assert image_file.endswith(suffix)
        prefix = image_dir

        instance_file = (gt_dir + image_file[len(prefix):-len(suffix)] +
                         "gtFine_instanceIds.png")
        assert os.path.isfile(instance_file), instance_file

        label_file = (gt_dir + image_file[len(prefix):-len(suffix)] +
                      "gtFine_labelIds.png")
        assert os.path.isfile(label_file), label_file

        json_file = (gt_dir + image_file[len(prefix):-len(suffix)] +
                     "gtFine_polygons.json")
        files.append((image_file, instance_file, label_file, json_file))
    assert len(files), "No images found in {}".format(image_dir)

    logger = logging.getLogger(__name__)
    logger.info("Preprocessing cityscapes annotations ...")
    # This is still not fast: all workers will execute duplicate work and it
    # can take up to 10 minutes on an 8-GPU server.
    pool = mp.Pool(
        processes=max(mp.cpu_count() // comm.get_world_size() // 2, 4))

    ret = pool.map(
        functools.partial(cityscapes_files_to_dict,
                          from_json=from_json,
                          to_polygons=to_polygons),
        files,
    )
    logger.info("Loaded {} images from {}".format(len(ret), image_dir))

    # Map cityscape ids to contiguous ids
    from cityscapesscripts.helpers.labels import labels
    labels = [
        label for label in labels
        if label.hasInstances and not label.ignoreInEval
    ]
    dataset_id_to_contiguous_id = {
        label.id: idx for idx, label in enumerate(labels)
    }
    for dict_per_image in ret:
        for anno in dict_per_image["annotations"]:
            anno["category_id"] = dataset_id_to_contiguous_id[
                anno["category_id"]]
    return ret
def losses(self, gt_classes, gt_shifts_deltas, gt_centerness,
           pred_class_logits, pred_shift_deltas, pred_centerness):
    """
    Args:
        For `gt_classes`, `gt_shifts_deltas` and `gt_centerness` parameters,
            see :meth:`FCOS.get_ground_truth`.
        Their shapes are (N, R) and (N, R, 4), respectively, where R is
        the total number of shifts across levels, i.e. sum(Hi x Wi)
        For `pred_class_logits`, `pred_shift_deltas` and `pred_centerness`,
            see :meth:`FCOSHead.forward`.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a scalar tensor storing the loss.
            Used during training only. The dict keys are:
            "loss_cls" and "loss_box_reg"
    """
    pred_class_logits, pred_shift_deltas, pred_centerness = \
        permute_all_cls_and_box_to_N_HWA_K_and_concat(
            pred_class_logits, pred_shift_deltas, pred_centerness,
            self.num_classes
        )  # Shapes: (N x R, K) and (N x R, 4), respectively.

    gt_classes = gt_classes.flatten()
    gt_shifts_deltas = gt_shifts_deltas.view(-1, 4)
    gt_centerness = gt_centerness.view(-1, 1)

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    num_foreground = comm.all_reduce(num_foreground) / float(
        comm.get_world_size())
    num_foreground_centerness = gt_centerness[foreground_idxs].sum()
    num_targets = comm.all_reduce(num_foreground_centerness) / float(
        comm.get_world_size())

    # logits loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / max(1.0, num_foreground)

    # regression loss
    loss_box_reg = iou_loss(
        pred_shift_deltas[foreground_idxs],
        gt_shifts_deltas[foreground_idxs],
        gt_centerness[foreground_idxs],
        box_mode="ltrb",
        loss_type=self.iou_loss_type,
        reduction="sum",
    ) / max(1.0, num_targets)

    # centerness loss
    loss_centerness = F.binary_cross_entropy_with_logits(
        pred_centerness[foreground_idxs],
        gt_centerness[foreground_idxs],
        reduction="sum",
    ) / max(1, num_foreground)

    loss = {
        "loss_cls": loss_cls,
        "loss_box_reg": loss_box_reg,
        "loss_centerness": loss_centerness,
    }

    # budget loss
    if self.is_dynamic_head and self.budget_loss_lambda != 0:
        soft_cost, used_cost, full_cost = get_module_running_cost(self)
        loss_budget = (soft_cost / full_cost).mean() * self.budget_loss_lambda
        storage = get_event_storage()
        storage.put_scalar("complexity_ratio", (used_cost / full_cost).mean())
        loss.update({"loss_budget": loss_budget})

    return loss
def build_detection_train_loader(cfg):
    """
    A data loader is created by the following steps:

    1. Use the dataset names in config to query :class:`DatasetCatalog`,
       and obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:
        * Map each metadata dict into another format to be consumed by the model.
        * Batch them by simply putting dicts into a list.

    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config

    Returns:
        an infinite iterator of training data
    """
    num_workers = comm.get_world_size()
    rank = comm.get_rank()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    # Adjust batch size according to BATCH_SUBDIVISIONS
    images_per_batch //= cfg.SOLVER.BATCH_SUBDIVISIONS

    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    logger = logging.getLogger(__name__)

    transform_gens = build_transform_gen(cfg.INPUT.AUG.TRAIN_PIPELINES)
    logger.info(f"TransformGens used: {transform_gens} in training")
    dataset = build_dataset(cfg,
                            cfg.DATASETS.TRAIN,
                            transforms=transform_gens,
                            is_train=True)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = SAMPLERS.get("TrainingSampler")(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = SAMPLERS.get("RepeatFactorTrainingSampler")(
            dataset, cfg.DATALOADER.REPEAT_THRESHOLD)
    elif sampler_name == "DistributedGroupSampler":
        sampler = SAMPLERS.get("DistributedGroupSampler")(dataset,
                                                          images_per_worker,
                                                          num_workers, rank)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=images_per_worker,
        sampler=sampler,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        collate_fn=trivial_batch_collator,
        worker_init_fn=worker_init_reset_seed,
    )

    return data_loader
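# Plausible definitions of the two helpers referenced above, sketched after
# detectron2-style conventions; the actual cvpods implementations may differ.
import numpy as np
import torch


def trivial_batch_collator(batch):
    # the model consumes list[dict] directly, so batching is the identity
    return batch


def worker_init_reset_seed(worker_id):
    # give each dataloader worker a distinct, process-unique seed
    seed = np.random.randint(2 ** 31) + worker_id
    np.random.seed(seed)
    torch.manual_seed(seed)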
def losses(self, indices, gt_instances, anchors, pred_class_logits,
           pred_anchor_deltas):
    pred_class_logits = cat(pred_class_logits,
                            dim=1).view(-1, self.num_classes)
    pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
    N = len(anchors)
    # list[Tensor(R, 4)], one for each image
    all_anchors = Boxes.cat(anchors).tensor  # Boxes(Tensor(N*R, 4))
    predicted_boxes = self.box2box_transform.apply_deltas(
        pred_anchor_deltas, all_anchors)
    predicted_boxes = predicted_boxes.reshape(N, -1, 4)

    ious = []
    pos_ious = []
    for i in range(N):
        src_idx, tgt_idx = indices[i]
        iou, _ = box_iou(predicted_boxes[i, ...],
                         gt_instances[i].gt_boxes.tensor)
        if iou.numel() == 0:
            max_iou = iou.new_full((iou.size(0), ), 0)
        else:
            max_iou = iou.max(dim=1)[0]
        a_iou, _ = box_iou(anchors[i].tensor,
                           gt_instances[i].gt_boxes.tensor)
        if a_iou.numel() == 0:
            pos_iou = a_iou.new_full((0, ), 0)
        else:
            pos_iou = a_iou[src_idx, tgt_idx]
        ious.append(max_iou)
        pos_ious.append(pos_iou)
    ious = torch.cat(ious)
    ignore_idx = ious > self.neg_ignore_thresh
    pos_ious = torch.cat(pos_ious)
    pos_ignore_idx = pos_ious < self.pos_ignore_thresh

    src_idx = torch.cat([
        src + idx * anchors[0].tensor.shape[0]
        for idx, (src, _) in enumerate(indices)
    ])
    gt_classes = torch.full(pred_class_logits.shape[:1],
                            self.num_classes,
                            dtype=torch.int64,
                            device=pred_class_logits.device)
    gt_classes[ignore_idx] = -1
    target_classes_o = torch.cat(
        [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)])
    target_classes_o[pos_ignore_idx] = -1
    gt_classes[src_idx] = target_classes_o

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    if comm.get_world_size() > 1:
        dist.all_reduce(num_foreground)
    num_foreground = num_foreground * 1.0 / comm.get_world_size()

    # cls loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )
    # reg loss
    target_boxes = torch.cat(
        [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)],
        dim=0)
    target_boxes = target_boxes[~pos_ignore_idx]
    matched_predicted_boxes = predicted_boxes.reshape(
        -1, 4)[src_idx[~pos_ignore_idx]]
    loss_box_reg = (1 - torch.diag(
        generalized_box_iou(matched_predicted_boxes, target_boxes))).sum()

    return {
        "loss_cls": loss_cls / max(1, num_foreground),
        "loss_box_reg": loss_box_reg / max(1, num_foreground),
    }
def preprocess_image(self, batched_inputs, training):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    bs = len(images)
    images = [self.normalizer(x) for x in images]

    images = ImageList.from_tensors(images,
                                    size_divisibility=0,
                                    pad_ref_long=True)

    # sync image size for all gpus
    comm.synchronize()
    if training and self.iter % self.change_iter == 0:
        if self.iter < self.max_iter - 20000:
            meg = torch.LongTensor(1).to(self.device)
            comm.synchronize()
            if comm.is_main_process():
                size = np.random.choice(self.multi_size)
                meg.fill_(size)
            if comm.get_world_size() > 1:
                comm.synchronize()
                dist.broadcast(meg, 0)
            self.size = meg.item()
            comm.synchronize()
        else:
            self.size = 608

    if training:
        # resize image inputs
        modes = ['bilinear', 'nearest', 'bicubic', 'area']
        mode = modes[random.randrange(4)]
        if mode in ('bilinear', 'bicubic'):
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode,
                                          align_corners=False)
        else:
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        targets = [
            torch.cat([
                instance.gt_classes.float().unsqueeze(-1),
                instance.gt_boxes.tensor
            ], dim=-1) for instance in gt_instances
        ]
        labels = torch.zeros((bs, 100, 5))
        for i, target in enumerate(targets):
            labels[i][:target.shape[0]] = target
        labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
    else:
        labels = None

    self.iter += 1
    return images, labels