def __init__(self, assignment: ClusterAssignment, split: str):
    """
    Set up the instance from a cluster assignment and a data split.

    Args:
        assignment: provides the configuration (`config`) and the per-split
            cluster assignments (`cluster_assignments`).
        split: key selecting which split of the assignments and of the
            dataset to use.
    """
    self.config = assignment.config

    # Convert the assignments of the requested split into the
    # cluster -> image mapping produced by the class helper.
    self.cluster_to_image = self._to_cluster_to_image_map(
        assignment.cluster_assignments[split]
    )

    # Build the dataset for the same split and keep a handle on its first
    # data object.
    self.dataset = build_dataset(self.config, split)
    self.data_source = self.dataset.data_objs[0]
def save_cluster_assignment_as_dataset(cls, output_dir: str, assignments: ClusterAssignment):
    """
    Export the cluster assignments as a 'disk_filelist' dataset.

    For every split present in the assignments, two numpy files are written
    to `output_dir`:
    - `<split>_images.npy`: the image paths (the inputs)
    - `<split>_labels.npy`: the index of the cluster assigned to each image
      (the labels)

    Args:
        output_dir: destination folder, created if it does not exist.
        assignments: configuration and per-split cluster assignments.
    """
    os.makedirs(output_dir, exist_ok=True)

    for split in assignments.cluster_assignments:
        # The dataset returns one list of paths per underlying data source;
        # only the single-source case is handled.
        all_paths = build_dataset(assignments.config, split).get_image_paths()
        assert len(all_paths) == 1, "Multi-dataset not supported yet!"
        paths = all_paths[0]

        # Labels are looked up by the position of the image in the path list.
        labels = [
            assignments.cluster_assignments[split][image_id]
            for image_id in range(len(paths))
        ]

        np.save(
            os.path.join(output_dir, f"{split.lower()}_images.npy"),
            np.array(paths),
        )
        np.save(
            os.path.join(output_dir, f"{split.lower()}_labels.npy"),
            np.array(labels),
        )
def _log_throughput(data_iterator, total_images, prefix=""):
    """Consume MAX_ITERS batches from `data_iterator` and log the throughput."""
    timer = Timer()
    for _ in tqdm.trange(MAX_ITERS):
        next(data_iterator)
    time_elapsed = timer.seconds()
    logging.info(
        f"{prefix}iters: {MAX_ITERS}; images: {total_images}; "
        f"time: {time_elapsed} seconds; "
        f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
        f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} "
    )


def benchmark_data(cfg: AttrDict, split: str = "train"):
    """
    Measure and log the data loading throughput for one data split.

    The function builds the dataset and its data loader, consumes
    WARMUP_ITERS warmup batches, then times MAX_ITERS batches once, followed
    by BENCHMARK_ROUNDS additional timed rounds of MAX_ITERS batches each.
    All results are reported through `logging.info`.

    Args:
        cfg: configuration; reads DATA[split], DATA.NUM_DATALOADER_WORKERS,
            MACHINE.DEVICE (optional), SEED_VALUE and MULTI_PROCESSING_METHOD.
        split: name of the data split; upper-cased before it is used as a
            key of `cfg["DATA"]`.

    Raises:
        StopIteration: if the data loader is exhausted before all the
            requested batches have been read.
    """
    split = split.upper()
    total_images = MAX_ITERS * cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
    timer = Timer()
    dataset = build_dataset(cfg, split)

    # Fall back to CUDA when the configuration has no MACHINE.DEVICE entry.
    try:
        device = torch.device("cuda" if cfg.MACHINE.DEVICE == "gpu" else "cpu")
    except AttributeError:
        device = torch.device("cuda")

    # Gives sampler same seed for entire distributed group as per pytorch documentation.
    sampler_seed = cfg.SEED_VALUE
    dataloader = get_loader(
        dataset=dataset,
        dataset_config=cfg["DATA"][split],
        num_dataloader_workers=cfg.DATA.NUM_DATALOADER_WORKERS,
        pin_memory=False,
        multi_processing_method=cfg.MULTI_PROCESSING_METHOD,
        device=device,
        sampler_seed=sampler_seed,
    )

    # Fairstore data sampler would require setting the start iter before it can start.
    if hasattr(dataloader.sampler, "set_start_iter"):
        dataloader.sampler.set_start_iter(0)

    # Warmup: the timer is reset before the iterator is created, so the
    # reported time covers the iterator creation plus exactly WARMUP_ITERS
    # batches, which is what the log message states.
    timer.reset()
    data_iterator = iter(dataloader)
    for _ in range(WARMUP_ITERS):
        next(data_iterator)
    warmup_time = timer.seconds()
    logging.info(f"Warmup time {WARMUP_ITERS} batches: {warmup_time} seconds")

    # measure the number of images per sec over MAX_ITERS iterations.
    _log_throughput(data_iterator, total_images)

    # run benchmark for a few more rounds to catch fluctuations
    for round_idx in range(BENCHMARK_ROUNDS):
        _log_throughput(data_iterator, total_images, prefix=f"round: {round_idx}: ")

    # Drop the local references to the iterator and the loader.
    del data_iterator
    del dataloader
def get_image_paths(cfg: AttrDict, split: str) -> List[str]:
    """
    Get the list of image paths for the provided dataset and split.

    Args:
        cfg: configuration used to build the dataset.
        split: data split for which the dataset is built.

    Returns:
        The image paths of the single data source of the dataset.

    Raises:
        AssertionError: if the dataset does not return a list of paths, or
            returns paths for a number of data sources other than one.
    """
    dataset = build_dataset(cfg=cfg, split=split)
    feature_image_paths = dataset.get_image_paths()

    # due to multi-modality, we get image_paths as a nested list, one for each
    # dataset. Check it's a list and extract images.
    assert isinstance(feature_image_paths, list), "Image paths must be a list"
    assert len(feature_image_paths) == 1, "Multi-modality not supported yet!"
    return feature_image_paths[0]
def build_datasets(self):
    """
    Build one dataset per entry of `self.available_splits`.

    Returns:
        A tuple `(datasets, data_and_label_keys)` where `datasets` maps each
        split to its dataset, and `data_and_label_keys` holds the "input" and
        "target" key names read from the configuration. The key names are
        overwritten at every split, so the values of the last split processed
        are the ones returned.
    """
    datasets = {}
    data_and_label_keys = {}

    for split_name in self.available_splits:
        datasets[split_name] = build_dataset(self.config, split_name)

        split_cfg = self.config.DATA[split_name]
        data_and_label_keys.update(
            input=split_cfg.INPUT_KEY_NAMES,
            target=split_cfg.TARGET_KEY_NAMES,
        )

    return datasets, data_and_label_keys
def get_data_features_and_images(cfg: AttrDict):
    """
    Load the merged features and the matching image paths.

    The data split is read from `cfg.RANKING.FEATURES.DATA_PARTITION` and the
    layer from `cfg.RANKING.FEATURES.LAYER_NAME`. Features are merged from
    the checkpoint folder of `cfg`.

    Args:
        cfg: configuration of the run.

    Returns:
        A tuple `(feature_data, image_paths)`: the output of `merge_features`
        and the image paths of the single data source of the dataset.

    Raises:
        AssertionError: if the dataset does not return a list of paths, or
            returns paths for a number of data sources other than one.
    """
    output_dir = get_checkpoint_folder(cfg)
    split = cfg.RANKING.FEATURES.DATA_PARTITION

    logging.info("Merging features...")
    # merge the features across all nodes/gpus into one
    # (the split name is lower-cased for the feature lookup only)
    feature_data = merge_features(
        output_dir, split.lower(), cfg.RANKING.FEATURES.LAYER_NAME
    )

    logging.info("Getting the image paths...")
    # get the list of image Ids
    dataset = build_dataset(cfg=cfg, split=split)
    feature_image_paths = dataset.get_image_paths()

    # due to multi-modality, we get image_paths as a nested list, one for each
    # dataset. Check it's a list and extract images.
    assert isinstance(feature_image_paths, list), "Image paths must be a list"
    assert len(feature_image_paths) == 1, "Multi-modality not supported yet!"
    return feature_data, feature_image_paths[0]