Example #1
def __init__(self, assignment: ClusterAssignment, split: str):
    self.config = assignment.config
    cluster_assignments = assignment.cluster_assignments[split]
    self.cluster_to_image = self._to_cluster_to_image_map(cluster_assignments)
    self.dataset = build_dataset(self.config, split)
    self.data_source = self.dataset.data_objs[0]
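The constructor calls a _to_cluster_to_image_map helper that is not part of the snippet. A minimal sketch of what such an inversion could look like, assuming cluster_assignments maps image ids to cluster ids; only the name and the call site come from the example, the body below is an assumption:

from collections import defaultdict
from typing import Dict, List

def _to_cluster_to_image_map(cluster_assignments: Dict[int, int]) -> Dict[int, List[int]]:
    # Invert {image_id: cluster_id} into {cluster_id: [image_id, ...]}
    cluster_to_image: Dict[int, List[int]] = defaultdict(list)
    for image_id, cluster_id in cluster_assignments.items():
        cluster_to_image[cluster_id].append(image_id)
    return dict(cluster_to_image)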
Example #2
    @classmethod
    def save_cluster_assignment_as_dataset(cls, output_dir: str,
                                           assignments: ClusterAssignment):
        """
        Create a 'disk_filelist' dataset out of the cluster assignments:
        - the inputs are the images
        - the labels are the index of the cluster assigned to the image
        """
        os.makedirs(output_dir, exist_ok=True)
        for split in assignments.cluster_assignments.keys():
            dataset = build_dataset(assignments.config, split)
            image_paths = dataset.get_image_paths()
            assert len(image_paths) == 1, "Multi-dataset not supported yet!"
            image_paths = image_paths[0]

            image_labels = []
            for image_id in range(len(image_paths)):
                image_labels.append(
                    assignments.cluster_assignments[split][image_id])

            images_file_path = os.path.join(output_dir,
                                            f"{split.lower()}_images.npy")
            labels_file_path = os.path.join(output_dir,
                                            f"{split.lower()}_labels.npy")
            np.save(images_file_path, np.array(image_paths))
            np.save(labels_file_path, np.array(image_labels))
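The files written above form a "disk_filelist" style dataset: one .npy array of image paths and one of cluster labels per split. A short sketch of loading such a pair back with plain NumPy; the train_* file names follow the f-strings above:

import numpy as np

image_paths = np.load("train_images.npy")
image_labels = np.load("train_labels.npy")
assert len(image_paths) == len(image_labels)
for path, label in zip(image_paths[:3], image_labels[:3]):
    print(path, int(label))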
Example #3
def benchmark_data(cfg: AttrDict, split: str = "train"):
    split = split.upper()
    total_images = MAX_ITERS * cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
    timer = Timer()
    dataset = build_dataset(cfg, split)

    # default to CUDA when the config does not define MACHINE.DEVICE
    try:
        device = torch.device("cuda" if cfg.MACHINE.DEVICE == "gpu" else "cpu")
    except AttributeError:
        device = torch.device("cuda")

    # Give the sampler the same seed for the entire distributed group, as per the PyTorch documentation.
    sampler_seed = cfg.SEED_VALUE
    dataloader = get_loader(
        dataset=dataset,
        dataset_config=cfg["DATA"][split],
        num_dataloader_workers=cfg.DATA.NUM_DATALOADER_WORKERS,
        pin_memory=False,
        multi_processing_method=cfg.MULTI_PROCESSING_METHOD,
        device=device,
        sampler_seed=sampler_seed,
    )

    # The Fairstore data sampler requires setting the start iteration before it can start.
    if hasattr(dataloader.sampler, "set_start_iter"):
        dataloader.sampler.set_start_iter(0)

    # initial warmup, measured separately as the warmup time
    timer.reset()
    data_iterator = iter(dataloader)
    for _ in range(WARMUP_ITERS):
        next(data_iterator)
    # the total number of seconds since the start/reset of the timer
    warmup_time = timer.seconds()
    logging.info(f"Warmup time for {WARMUP_ITERS} batches: {warmup_time} seconds")

    # measure the number of images per second over MAX_ITERS iterations
    timer = Timer()
    for _ in tqdm.trange(MAX_ITERS):
        next(data_iterator)
    time_elapsed = timer.seconds()
    logging.info(
        f"iters: {MAX_ITERS}; images: {total_images}; time: {time_elapsed} seconds; "
        f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
        f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")

    # run benchmark for a few more rounds to catch fluctuations
    for round_idx in range(BENCHMARK_ROUNDS):
        timer = Timer()
        for _ in tqdm.trange(MAX_ITERS):
            next(data_iterator)
        time_elapsed = timer.seconds()
        logging.info(
            f"round: {round_idx}: iters: {MAX_ITERS}; images: {total_images}; "
            f"time: {time_elapsed} seconds; "
            f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
            f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")
    del data_iterator
    del dataloader
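benchmark_data relies on the module-level constants WARMUP_ITERS, MAX_ITERS and BENCHMARK_ROUNDS, which are defined elsewhere in the module. A minimal driver sketch with assumed placeholder values for those constants (the numbers below are not the library's):

WARMUP_ITERS = 10       # assumed: batches consumed before timing starts
MAX_ITERS = 1000        # assumed: batches timed per measurement round
BENCHMARK_ROUNDS = 3    # assumed: extra rounds to expose fluctuations

# cfg is the loaded VISSL configuration (an AttrDict); hypothetical call:
# benchmark_data(cfg, split="train")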
Example #4
def get_image_paths(cfg: AttrDict, split: str) -> List[str]:
    """
    Get the list of image paths for the provided dataset and split
    """
    dataset = build_dataset(cfg=cfg, split=split)
    feature_image_paths = dataset.get_image_paths()
    # due to multi-modality, we get image_paths as a nested list, one for each
    # dataset. Check it's a list and extract images.
    assert isinstance(feature_image_paths, list), "Image paths must be a list"
    assert len(feature_image_paths) == 1, "Multi-modality not supported yet!"
    return feature_image_paths[0]
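Since get_image_paths returns a flat list of file paths, a quick sanity check is easy to bolt on. A hypothetical helper (not part of the library) that reports paths missing on disk:

import os
from typing import List

def report_missing_paths(paths: List[str]) -> List[str]:
    # Return the subset of paths that do not exist on disk
    missing = [p for p in paths if not os.path.isfile(p)]
    if missing:
        print(f"{len(missing)} of {len(paths)} image paths are missing")
    return missing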
Example #5
def build_datasets(self):
    """
    Get the datasets for the data splits we will use in the training.
    The available_splits attribute (set via set_available_splits)
    determines which splits are used in the training.
    """
    datasets, data_and_label_keys = {}, {}
    for split in self.available_splits:
        datasets[split] = build_dataset(self.config, split)
        # overwritten on every split: only the last split's key names are returned
        data_and_label_keys["input"] = self.config.DATA[split].INPUT_KEY_NAMES
        data_and_label_keys["target"] = self.config.DATA[split].TARGET_KEY_NAMES
    return datasets, data_and_label_keys
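A hypothetical driver for the method above; "task" stands in for the trainer object that owns it, and the name is an assumption:

def describe_datasets(task):
    # task is any object exposing build_datasets() as defined above
    datasets, keys = task.build_datasets()
    for split, dataset in datasets.items():
        print(f"{split}: {len(dataset)} samples, "
              f"input keys {keys['input']}, target keys {keys['target']}")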
Example #6
def get_data_features_and_images(cfg: AttrDict):
    output_dir = get_checkpoint_folder(cfg)
    split = cfg.RANKING.FEATURES.DATA_PARTITION
    logging.info("Merging features...")
    # merge the features across all nodes/gpus into one
    feature_data = merge_features(output_dir, split.lower(),
                                  cfg.RANKING.FEATURES.LAYER_NAME)

    logging.info("Getting the image paths...")
    # get the list of image Ids
    dataset = build_dataset(cfg=cfg, split=split)
    feature_image_paths = dataset.get_image_paths()
    # due to multi-modality, we get image_paths as a nested list, one for each
    # dataset. Check it's a list and extract images.
    assert isinstance(feature_image_paths, list), "Image paths must be a list"
    assert len(feature_image_paths) == 1, "Multi-modality not supported yet!"
    return feature_data, feature_image_paths[0]
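The two return values are aligned by row, so features can be paired with their source images. A sketch of consuming them, assuming merge_features returns a dict with a "features" array; the key name is an assumption:

# cfg: the loaded VISSL configuration (an AttrDict)
feature_data, image_paths = get_data_features_and_images(cfg)
features = feature_data["features"]  # assumed key; actual layout depends on merge_features
assert len(features) == len(image_paths)
print(f"{len(image_paths)} images, feature shape {features.shape[1:]}")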