def make_data_loader(cfg, stage='train', is_distributed=False, start_iter=0):
    assert stage in ['train', 'val', 'test']
    num_gpus = get_world_size()
    is_train = (stage == 'train')
    if is_train:
        pcs_per_gpu = cfg.SOLVER.PCS_PER_GPU_TRAIN
        shuffle = True
        drop_last = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        pcs_per_gpu = cfg.SOLVER.PCS_PER_GPU_VAL
        shuffle = False
        drop_last = False
        num_iters = None
        start_iter = 0

    paths_catalog = import_file('config.paths_catalog', 'config/path_catalog.py', True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_name = cfg.DATASETS[stage.upper()]
    num_workers = cfg.DATALOADER.NUM_WORKERS

    dataset = build_dataset(cfg, dataset_name, DatasetCatalog, stage)
    sampler = make_data_sampler(dataset, shuffle, is_distributed)
    batch_sampler = make_batch_data_sampler(sampler, pcs_per_gpu, num_iters,
                                            start_iter, drop_last)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=dataset.collate_fn,
        pin_memory=True,
    )
    return data_loader
def fcos_losses(self, instances):
    num_classes = instances.logits_pred.size(1)
    assert num_classes == self.num_classes

    labels = instances.labels.flatten()

    pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
    num_pos_local = pos_inds.numel()
    num_gpus = get_world_size()
    total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
    num_pos_avg = max(total_num_pos / num_gpus, 1.0)

    # prepare one_hot
    class_target = torch.zeros_like(instances.logits_pred)
    class_target[pos_inds, labels[pos_inds]] = 1

    class_loss = sigmoid_focal_loss_jit(
        instances.logits_pred,
        class_target,
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / num_pos_avg

    instances = instances[pos_inds]
    instances.pos_inds = pos_inds

    ctrness_targets = compute_ctrness_targets(instances.reg_targets)
    ctrness_targets_sum = ctrness_targets.sum()
    loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
    instances.gt_ctrs = ctrness_targets

    if pos_inds.numel() > 0:
        reg_loss = self.loc_loss_func(
            instances.reg_pred,
            instances.reg_targets,
            ctrness_targets
        ) / loss_denorm

        ctrness_loss = F.binary_cross_entropy_with_logits(
            instances.ctrness_pred,
            ctrness_targets,
            reduction="sum"
        ) / num_pos_avg
    else:
        reg_loss = instances.reg_pred.sum() * 0
        ctrness_loss = instances.ctrness_pred.sum() * 0

    losses = {
        "loss_fcos_cls": class_loss,
        "loss_fcos_loc": reg_loss,
        "loss_fcos_ctr": ctrness_loss
    }
    extras = {
        "instances": instances,
        "loss_denorm": loss_denorm
    }
    return extras, losses
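# `compute_ctrness_targets` is called above but not defined in this snippet. Below is a
# minimal sketch following the standard FCOS centerness definition; the (l, t, r, b)
# column ordering of `reg_targets` is an assumption.
def compute_ctrness_targets(reg_targets):
    if len(reg_targets) == 0:
        return reg_targets.new_zeros(len(reg_targets))
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    # centerness = sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
              (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(ctrness)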
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, is_for_period=False):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        assert (
            images_per_batch % num_gpus == 0
        ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            images_per_batch, num_gpus)
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert (
            images_per_batch % num_gpus == 0
        ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            images_per_batch, num_gpus)
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0

    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file("configs.paths_catalog", cfg.PATHS_CATALOG, True)
    dataset_catalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASET.TRAIN if is_train else cfg.DATASET.TEST

    transforms = build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, cfg.DATASET.DATA_TYPE, transforms,
                             dataset_catalog, is_train or is_for_period)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping,
                                                images_per_gpu, num_iters, start_iter)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=BatchCollator(),
        )
        data_loaders.append(data_loader)
    if is_train or is_for_period:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
def forward(self, input):
    if comm.get_world_size() == 1 or not self.training:
        return super().forward(input)

    B, C = input.shape[0], input.shape[1]

    mean = torch.mean(input, dim=[0, 2, 3])
    meansqr = torch.mean(input * input, dim=[0, 2, 3])

    if self._stats_mode == "":
        assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
        vec = torch.cat([mean, meansqr], dim=0)
        vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
        mean, meansqr = torch.split(vec, C)
        momentum = self.momentum
    else:
        if B == 0:
            vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
            vec = vec + input.sum()  # make sure there is gradient w.r.t input
        else:
            vec = torch.cat([
                mean,
                meansqr,
                torch.ones([1], device=mean.device, dtype=mean.dtype)
            ], dim=0)
        vec = AllReduce.apply(vec * B)

        total_batch = vec[-1].detach()
        momentum = total_batch.clamp(max=1) * self.momentum  # no update if total_batch is 0
        total_batch = torch.max(total_batch, torch.ones_like(total_batch))  # avoid div-by-zero
        mean, meansqr, _ = torch.split(vec / total_batch, C)

    var = meansqr - mean * mean
    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    scale = scale.reshape(1, -1, 1, 1)
    bias = bias.reshape(1, -1, 1, 1)

    self.running_mean += momentum * (mean.detach() - self.running_mean)
    self.running_var += momentum * (var.detach() - self.running_var)
    return input * scale + bias
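# `AllReduce.apply` above must be a differentiable all-reduce so that gradients flow
# back through the synchronized batch statistics. A minimal sketch of such an
# autograd Function is shown below; it is an illustration, not necessarily the exact
# implementation used by this module.
import torch
import torch.distributed as dist
from torch.autograd import Function

class AllReduce(Function):
    @staticmethod
    def forward(ctx, input):
        # gather the per-rank tensors and sum them locally; using all_gather keeps
        # the summation itself on-device and easy to differentiate
        input_list = [torch.zeros_like(input) for _ in range(dist.get_world_size())]
        dist.all_gather(input_list, input, async_op=False)
        return torch.stack(input_list, dim=0).sum(dim=0)

    @staticmethod
    def backward(ctx, grad_output):
        # the gradient of a sum over ranks is identical on every rank, so a plain
        # all-reduce (sum) of the incoming gradient suffices
        dist.all_reduce(grad_output, async_op=False)
        return grad_output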
def make_epoch_data_loader(cfg, is_train=True, drop_last=True, is_distributed=False, start_iter=0):
    datasets = build_dataset(cfg, is_train)

    num_gpus = get_world_size()
    images_per_batch = cfg.DATALOADER.BSZ
    assert (
        images_per_batch % num_gpus == 0
    ), "DATALOADER.BSZ ({}) must be divisible by the number of GPUs ({}) used.".format(
        images_per_batch, num_gpus)
    images_per_gpu = images_per_batch // num_gpus
    logger = logging.getLogger(__name__)
    logger.info("Experiment with {} images per GPU".format(images_per_gpu))

    if is_train:
        shuffle = True
    else:
        shuffle = False if not is_distributed else True

    data_loaders = []
    for i, dataset in enumerate(datasets):
        sampler = make_data_sampler(dataset, shuffle, is_distributed, is_train, cfg)
        # default collator works!
        num_workers = cfg.DATALOADER.WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            sampler=sampler,
            batch_size=images_per_gpu,
            drop_last=drop_last,
            pin_memory=True,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank 0
    has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
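# A hypothetical training-step fragment showing how `reduce_loss_dict` is typically
# used: gradients are computed from the local (unreduced) losses, while the reduced
# dict is only used for logging on the main process. The argument names below are
# illustrative and not part of the original code.
def training_step(model, images, targets, optimizer, logger):
    loss_dict = model(images, targets)            # e.g. {"loss_cls": ..., "loss_reg": ...}
    losses = sum(loss for loss in loss_dict.values())

    # averaged across ranks, for logging only
    loss_dict_reduced = reduce_loss_dict(loss_dict)
    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
    if is_main_process():
        logger.info("total loss (averaged over GPUs): {:.4f}".format(losses_reduced.item()))

    optimizer.zero_grad()
    losses.backward()
    optimizer.step()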
def inference(
        model,
        data_loader,
        dataset_name,
        mem_active=False,
        output_folder=None,
):
    # convert to a torch.device for efficiency
    device = torch.device("cuda")
    num_devices = get_world_size()
    logger = logging.getLogger("AlphAction.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset ({} videos).".format(dataset_name, len(dataset)))
    start_time = time.time()

    predictions = compute_on_dataset(model, data_loader, device, logger, mem_active)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    return evaluate(
        dataset=dataset,
        predictions=predictions,
        output_folder=output_folder,
    )
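# `_accumulate_predictions_from_multiple_gpus` is referenced above but not shown. A
# common implementation is sketched below, assuming the `all_gather` and
# `is_main_process` helpers already used elsewhere in this file.
def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    # gather the per-rank {video_id: prediction} dicts on every process
    all_predictions = all_gather(predictions_per_gpu)
    if not is_main_process():
        return
    # merge into a single dict and return the predictions ordered by id
    predictions = {}
    for p in all_predictions:
        predictions.update(p)
    ids = sorted(predictions.keys())
    return [predictions[i] for i in ids]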
def make_train_data_loader(datasets, is_distributed=False, start_iter=0):
    num_gpus = get_world_size()
    ims_per_gpu = int(cfg.TRAIN.BATCH_SIZE / num_gpus)
    shuffle = True
    num_iters = cfg.SOLVER.MAX_ITER

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    sampler = make_data_sampler(datasets, shuffle, is_distributed)
    batch_sampler = make_batch_data_sampler(datasets, sampler, aspect_grouping,
                                            ims_per_gpu, num_iters, start_iter)
    collator = BatchCollator(cfg.TRAIN.SIZE_DIVISIBILITY)
    num_workers = cfg.TRAIN.LOADER_THREADS
    data_loader = torch.utils.data.DataLoader(
        datasets,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=collator,
    )
    return data_loader
def compute_on_dataset_1stage(model, data_loader, device):
    # single stage inference, for model without memory features
    cpu_device = torch.device("cpu")
    results_dict = {}
    if get_world_size() == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))
    for batch in tqdm(data_loader, **extra_args):
        slow_clips, fast_clips, boxes, objects, extras, video_ids = batch
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        objects = [None if (box is None) else box.to(device) for box in objects]

        with torch.no_grad():
            output = model(slow_clips, fast_clips, boxes, objects, extras)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {video_id: result for video_id, result in zip(video_ids, output)}
        )

    return results_dict
def inference(model, criterion, data_loader, dataset_name, save_result=False):
    logger = logging.getLogger('eve.' + __name__)
    device = torch.device('cuda')
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset ({} point clouds).".format(
        dataset_name, len(dataset)))

    if get_world_size() == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    start_time = time.time()
    model.eval()
    outputs_per_gpu = {}
    targets_per_gpu = {}
    file_path_per_gpu = {}
    times = []

    with torch.no_grad():
        for batch in tqdm(data_loader, **extra_args):
            locs, feats, targets, metadata = batch
            inputs = ME.SparseTensor(feats, coords=locs).to(device)
            targets = targets.to(device, non_blocking=True).long()

            torch.cuda.synchronize()
            start_time = time.time()
            outputs = model(inputs, y=targets)
            torch.cuda.synchronize()
            end_time = time.time()
            times.append(end_time - start_time)

            arch = cfg.MODEL.ARCHITECTURE
            if arch == 'minkunet4d' or arch == 'minkunet_eve':
                for batch_idx in range(len(metadata)):
                    for time_idx in range(cfg.INPUT.VIDEO.NUM_FRAMES):
                        inv_map = metadata[batch_idx][time_idx]['inverse_map']
                        file_path = metadata[batch_idx][time_idx]['file_path']

                        locs_frame = (locs[:, -1] == batch_idx) & \
                                     (locs[:, -2] == time_idx)
                        one_output, one_target = compute_one_frame(
                            outputs, targets, locs_frame, inv_map)

                        outputs_per_gpu[file_path] = one_output
                        targets_per_gpu[file_path] = one_target
                        file_path_per_gpu[file_path] = file_path
            else:
                # other minknet
                for batch_idx in range(len(metadata)):
                    inv_map = metadata[batch_idx]['inverse_map']
                    file_path = metadata[batch_idx]['file_path']

                    # From MinkowskiEngine v0.3, batch index is on the first column
                    locs_frame = locs[:, -1] == batch_idx
                    one_output, one_target = compute_one_frame(
                        outputs, targets, locs_frame, inv_map)

                    outputs_per_gpu[file_path] = one_output
                    targets_per_gpu[file_path] = one_target
                    file_path_per_gpu[file_path] = file_path

    synchronize()
    logger.info("Total inference time: {}".format(np.sum(times)))

    # NOTE: `all_gather` will lead to CUDA out of memory.
    # We use `scatter_gather` to save the result of each process
    # in LOGS.DIR/tmp, which is cleared after gathering.
    outputs = scatter_gather(outputs_per_gpu)
    targets = scatter_gather(targets_per_gpu)
    file_paths = scatter_gather(file_path_per_gpu)
    if not is_main_process():
        return None

    all_outputs = {k: v.numpy() for o in outputs for k, v in o.items()}
    all_targets = {k: v.numpy() for t in targets for k, v in t.items()}
    all_file_paths = {k: v for f in file_paths for k, v in f.items()}

    assert len(all_outputs) == len(dataset.all_files), \
        '%d vs %d' % (len(all_outputs), len(dataset.all_files))

    if cfg.LOGS.SAVE_RESULT is False:
        all_file_paths = None
    metrics = evaluate(dataset, all_outputs, all_targets, all_file_paths)
    return metrics
def compute_on_dataset_2stage(model, data_loader, device, logger):
    # two stage inference, for model with memory features.
    # first extract features and then do the inference
    cpu_device = torch.device("cpu")
    num_devices = get_world_size()
    dataset = data_loader.dataset
    if num_devices == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    loader_len = len(data_loader)
    person_feature_pool = MemoryPool()
    batch_info_list = [None] * loader_len
    logger.info("Stage 1: extracting clip features.")
    start_time = time.time()

    for i, batch in enumerate(tqdm(data_loader, **extra_args)):
        slow_clips, fast_clips, boxes, objects, extras, video_ids = batch
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        objects = [None if (box is None) else box.to(device) for box in objects]
        movie_ids = [e["movie_id"] for e in extras]
        timestamps = [e["timestamp"] for e in extras]
        with torch.no_grad():
            feature = model(slow_clips, fast_clips, boxes, objects, part_forward=0)
            person_feature = [ft.to(cpu_device) for ft in feature[0]]
            object_feature = [ft.to(cpu_device) for ft in feature[1]]
        # store person features into memory pool
        for movie_id, timestamp, p_ft, o_ft in zip(movie_ids, timestamps,
                                                   person_feature, object_feature):
            person_feature_pool[movie_id, timestamp] = p_ft
        # store other information in list, for further inference
        batch_info_list[i] = (movie_ids, timestamps, video_ids, object_feature)

    # gather feature pools from different ranks
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Stage 1 time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )
    feature_pool = all_gather(person_feature_pool)
    all_feature_pool_p = MemoryPool()
    all_feature_pool_p.update_list(feature_pool)
    del feature_pool, person_feature_pool

    # do the inference
    results_dict = {}
    logger.info("Stage 2: predicting with extracted feature.")
    start_time = time.time()
    for movie_ids, timestamps, video_ids, object_feature in tqdm(batch_info_list, **extra_args):
        current_feat_p = [all_feature_pool_p[movie_id, timestamp].to(device)
                          for movie_id, timestamp in zip(movie_ids, timestamps)]
        current_feat_o = [ft_o.to(device) for ft_o in object_feature]
        extras = dict(
            person_pool=all_feature_pool_p,
            movie_ids=movie_ids,
            timestamps=timestamps,
            current_feat_p=current_feat_p,
            current_feat_o=current_feat_o,
        )
        with torch.no_grad():
            output = model(None, None, None, None, extras=extras, part_forward=1)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {video_id: result for video_id, result in zip(video_ids, output)}
        )

    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Stage 2 time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    return results_dict
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    num_gpus = get_world_size()
    if is_train:
        # for training
        videos_per_batch = cfg.SOLVER.VIDEOS_PER_BATCH
        assert (
            videos_per_batch % num_gpus == 0
        ), "SOLVER.VIDEOS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            videos_per_batch, num_gpus)
        videos_per_gpu = videos_per_batch // num_gpus
        shuffle = True
        drop_last = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        # for testing
        videos_per_batch = cfg.TEST.VIDEOS_PER_BATCH
        assert (
            videos_per_batch % num_gpus == 0
        ), "TEST.VIDEOS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            videos_per_batch, num_gpus)
        videos_per_gpu = videos_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        drop_last = False
        num_iters = None
        start_iter = 0

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    # NOTE: `paths_catalog` was used without being defined in the original snippet;
    # the import below follows the pattern used by the other loaders in this file,
    # and the exact module path is an assumption.
    paths_catalog = import_file("config.paths_catalog", cfg.PATHS_CATALOG, True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    # build dataset
    transforms = build_transforms(cfg, is_train)
    if has_object(cfg.IA_STRUCTURE):
        object_transforms = build_object_transforms(cfg, is_train=is_train)
    else:
        object_transforms = None
    datasets = build_dataset(cfg, dataset_list, transforms, DatasetCatalog,
                             is_train, object_transforms)

    # build sampler and dataloader
    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping,
                                                videos_per_gpu, num_iters, start_iter, drop_last)
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
def __len__(self):
    # pad the reported length to the number of ranks so that every
    # distributed process receives at least one example
    return max(len(self.examples), get_world_size())
def do_infer(
        model,
        data_loader,
        dataset_name,
        device="cuda",
        output_folder=None,
):
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("EfficientDet.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset ({} images).".format(
        dataset_name, len(dataset)))
    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, device, inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices))
    total_infer_time = get_time_str(inference_timer.total_time)
    logger.info(
        "Model inference time: {} ({} s / img per device, on {} devices)".format(
            total_infer_time,
            inference_timer.total_time * num_devices / len(dataset),
            num_devices,
        ))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    coco_results = []
    image_ids = []
    for image_id, prediction in enumerate(predictions):
        original_id = dataset.image_ids[image_id]
        image_ids.append(original_id)
        coco_results.extend([{
            "image_id": original_id,
            "category_id": dataset.return_coco_label(e['class']),
            "bbox": e['bbox'],
            "score": e['score']
        } for e in prediction])

    map_05_09 = 0
    with tempfile.NamedTemporaryFile() as f:
        file_path = f.name
        output_folder = './'
        if output_folder:
            file_path = os.path.join(output_folder, 'bbox_results.json')
        with open(file_path, "w") as w_obj:
            json.dump(coco_results, w_obj)

        # load results in COCO evaluation tool
        coco_true = dataset.coco
        coco_pred = coco_true.loadRes(file_path)

        # run COCO evaluation
        coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
        coco_eval.params.imgIds = image_ids
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        # stats[0] is COCO AP averaged over IoU thresholds 0.50:0.95
        map_05_09 = coco_eval.stats[0]
    return map_05_09
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, partial_size=0):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        assert (
            images_per_batch % num_gpus == 0
        ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            images_per_batch, num_gpus)
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert (
            images_per_batch % num_gpus == 0
        ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            images_per_batch, num_gpus)
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog",
                                cfg.PATHS_CATALOG, True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    transforms = build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train)

    data_loaders = []
    for dataset in datasets:
        if partial_size:
            sampler = PartialSequentialSampler(dataset, partial_size)
        else:
            sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping,
                                                images_per_gpu, num_iters, start_iter)
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders