Example #1
def merge_features(output_dir, split, layer, cfg):
    """
    For multi-gpu feature extraction, each gpu saves features corresponding to its
    share of the data. We can merge the features across all gpus to get the features
    for the full data.

    The features are saved along with the data indexes and label. The data indexes can
    be used to sort the data and ensure the uniqueness.

    We organize the features, targets corresponding to the data index of each feature,
    ensure the uniqueness and return.

    Args:
        output_dir (str): input path where the features are dumped
        split (str): whether the features are train or test data features
        layer (str): the features correspond to what layer of the model
        cfg (AttrDict): the input configuration specified by user

    Returns:
        output (Dict): contains features, targets, inds as the keys
    """
    logging.info(f"Merging features: {split} {layer}")
    output_feats, output_targets = {}, {}
    for local_rank in range(0, cfg.DISTRIBUTED.NUM_PROC_PER_NODE):
        for node_id in range(0, cfg.DISTRIBUTED.NUM_NODES):
            dist_rank = cfg.DISTRIBUTED.NUM_PROC_PER_NODE * node_id + local_rank
            feat_file = f"{output_dir}/rank{dist_rank}_{split}_{layer}_features.npy"
            targets_file = f"{output_dir}/rank{dist_rank}_{split}_{layer}_targets.npy"
            inds_file = f"{output_dir}/rank{dist_rank}_{split}_{layer}_inds.npy"
            logging.info(f"Loading:\n{feat_file}\n{targets_file}\n{inds_file}")
            feats = load_file(feat_file)
            targets = load_file(targets_file)
            indices = load_file(inds_file)
            num_samples = feats.shape[0]
            for idx in range(num_samples):
                index = indices[idx]
                if index not in output_feats:
                    output_feats[index] = feats[idx]
                    output_targets[index] = targets[idx]
    output_feats = dict(sorted(output_feats.items()))
    output_targets = dict(sorted(output_targets.items()))
    feats = np.array(list(output_feats.values()))
    N = feats.shape[0]
    output = {
        "features": feats.reshape(N, -1),
        "targets": np.array(list(output_targets.values())),
        "inds": np.array(list(output_feats.keys())),
    }
    logging.info(f"Features: {output['features'].shape}")
    logging.info(f"Targets: {output['targets'].shape}")
    logging.info(f"Indices: {output['inds'].shape}")
    return output
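
The dedup-and-sort step is the heart of the merge. As a point of reference, the same logic can be reproduced on toy shards with plain numpy (no VISSL dependencies; the shard values below are made up):

import numpy as np

# Two hypothetical GPU shards with one overlapping index (3), as happens
# when a distributed sampler pads the dataset to equal shard sizes.
shard_feats = [np.arange(6).reshape(3, 2), np.arange(6, 12).reshape(3, 2)]
shard_targets = [np.array([0, 1, 0]), np.array([1, 1, 0])]
shard_inds = [np.array([0, 1, 3]), np.array([3, 2, 4])]

merged_feats, merged_targets = {}, {}
for feats, targets, inds in zip(shard_feats, shard_targets, shard_inds):
    for feat, target, index in zip(feats, targets, inds):
        # the first occurrence of an index wins; duplicates are dropped
        merged_feats.setdefault(index, feat)
        merged_targets.setdefault(index, target)

inds = np.array(sorted(merged_feats))
features = np.stack([merged_feats[i] for i in inds])
targets = np.array([merged_targets[i] for i in inds])
assert features.shape == (5, 2) and list(inds) == [0, 1, 2, 3, 4]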
Example #2
 def load_feature_shard(
         cls, paths: ExtractedFeaturesShardPaths) -> ExtractedFeatures:
     """
     Load a shard of the extracted features and return its content:
     features, targets and indices.
     """
     logging.info(
         f"Loading:\n{paths.feature_file}\n{paths.targets_file}\n{paths.indices_file}"
     )
     return ExtractedFeatures(
         features=load_file(paths.feature_file),
         targets=load_file(paths.targets_file),
         indices=load_file(paths.indices_file),
     )
Example #3
 def load_input_data(self, data_file, targets_file):
     """
     Given the input data (features) and targets (labels) files, load the
     features of shape N x D and labels of shape (N,)
     """
     assert g_pathmgr.exists(data_file), "Data file not found. Abort!"
     assert g_pathmgr.exists(targets_file), "Targets file not found. Abort!"
     # load the features and the targets
     logging.info("loading features and targets...")
     targets = load_file(targets_file)
     features = np.array(load_file(data_file)).astype(np.float64)
     assert features.shape[0] == targets.shape[0], "Mismatched #images"
     logging.info(
         f"Loaded features: {features.shape} and targets: {targets.shape}")
     return features, targets
Example #4
    def _load_data(self, path):
        if self.data_source == "disk_filelist":
            if self.cfg["DATA"][self.split].MMAP_MODE:
                self.image_dataset = load_file(path, mmap_mode="r")
            else:
                self.image_dataset = load_file(path)
        elif self.data_source == "disk_folder":
            self.image_dataset = ImageFolder(path)
            logging.info(
                f"Loaded {len(self.image_dataset)} samples from folder {path}")

            # mark as initialized.
            # Creating ImageFolder dataset can be expensive because of repeated os.listdir calls
            # Avoid creating it over and over again.
            self.is_initialized = True
Example #5
def generate_places_low_shot_samples(targets, k_values, sample_inds,
                                     output_path, images_data_file):
    logging.info("Generating low-shot samples for places data...")
    k_values = [int(val) for val in k_values]

    logging.info(f"Loading images data file: {images_data_file}")
    images = load_file(images_data_file)
    # get the maximum and minimum number of positives per class
    num_pos = find_num_positives(targets)
    logging.info(f"min #num_pos: {min(num_pos)}, max #num_pos: {max(num_pos)}")

    # start sampling now. The sampling works as follows: for each independent
    # sample and a given k value, we create an output targets vector with the
    # same shape as the input targets, initialized with -1 (the ignore value).
    # We sample k positives for each class and set their entries in the vector
    # to the class number. The resulting vector thus has (k * num_classes)
    # sampled entries and the rest are ignored.
    for idx in sample_inds:
        for k in k_values:
            if k > min(num_pos):
                logging.info(f"Skip k: {k} min #pos: {min(num_pos)}")
                continue
            logging.info(f"Sampling: {idx} time for k-value: {k}")
            out_imgs, out_lbls = sample_places_data(images, targets, k)
            out_img_file = f"{output_path}/train_images_sample{idx}_k{k}.npy"
            out_lbls_file = f"{output_path}/train_labels_sample{idx}_k{k}.npy"
            logging.info(f"Saving imgs file: {out_img_file} {len(out_imgs)}")
            logging.info(f"Saving lbls file: {out_lbls_file} {len(out_lbls)}")
            save_file(out_lbls, out_lbls_file)
            save_file(out_imgs, out_img_file)
    logging.info("Done!!")
Example #6
def main():
    parser = argparse.ArgumentParser(
        description="Sample Low-shot data for Places/VOC")
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help="Choose between places | voc. These are also valid choices if your dataset follows a similar format",
    )
    parser.add_argument(
        "--layername",
        type=str,
        default=None,
        help="Layer for which low shot is being general. Valid for voc07 only",
    )
    parser.add_argument(
        "--targets_data_file",
        type=str,
        default=None,
        help="Numpy file containing image labels",
    )
    parser.add_argument(
        "--images_data_file",
        type=str,
        default=None,
        help="Numpy file containing images information",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        help="path where low-shot samples should be saved",
    )
    parser.add_argument(
        "--k_values",
        type=str,
        default="1,2,4,8,16,32,64,96",
        help="Low-shot k-values for svm testing.",
    )
    parser.add_argument("--num_samples",
                        type=int,
                        default=5,
                        help="Number of independent samples.")
    opts = parser.parse_args()

    assert PathManager.exists(
        opts.targets_data_file), "Target file not found. Abort"
    targets = load_file(opts.targets_data_file)
    sample_ids = list(range(1, 1 + opts.num_samples))

    generate_low_shot_samples(
        opts.dataset_name,
        targets,
        opts.k_values,
        sample_ids,
        opts.output_path,
        opts.layername,
        opts.images_data_file,
    )
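
As a quick sanity check of the interface, a trimmed-down parser with the same flags shows how the comma-separated --k_values string is consumed (the argv list below is hypothetical):

import argparse

parser = argparse.ArgumentParser(description="Sample Low-shot data for Places/VOC")
parser.add_argument("--dataset_name", type=str, default=None)
parser.add_argument("--k_values", type=str, default="1,2,4,8,16,32,64,96")
parser.add_argument("--num_samples", type=int, default=5)

opts = parser.parse_args(["--dataset_name", "places", "--k_values", "1,2,4"])
k_values = [int(val) for val in opts.k_values.split(",")]
assert k_values == [1, 2, 4] and opts.num_samples == 5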
Example #7
    def load_config(self, dir_main, dataset):
        # load imlist, qimlist, and gnd into cfg as a dict
        gnd_fname = f"{dir_main}/{dataset}/gnd_{dataset}.pkl"
        cfg = load_file(gnd_fname)
        cfg["gnd_fname"] = gnd_fname

        return cfg
Example #8
def generate_output_json_data(
    model_name, predictions_file, pred_confidence_scores_file, pred_img_indices_file
):
    my_model_in22k_subset_dollar_street_full_finetuned = {
        "model_name": model_name,
        "image_paths": "https://dl.fbaipublicfiles.com/vissl/fairness/dollarstreet_in22k_cls_overlapped_images.npy",
        "targets": "https://dl.fbaipublicfiles.com/vissl/fairness//subset_dollarstreet_in22k_cls_overlapped_labels.npy",
        "metadata": "https://dl.fbaipublicfiles.com/vissl/fairness/metadata_full_dollar_street.json",
        "id_to_label_map": "https://dl.fbaipublicfiles.com/vissl/fairness/in22k_cls_idx_to_dollar_street_labels_map.json",
        "predictions": predictions_file,
        "pred_img_indices": pred_img_indices_file,
        "pred_confidence_scores": pred_confidence_scores_file,
    }

    (
        my_model_output_attributes_acc_map,
        my_model_output_metadata_map,
    ) = generate_dollar_street_analysis(
        my_model_in22k_subset_dollar_street_full_finetuned,
        topk=PRED_TOPK,
        confidence_threshold=PRED_CONFIDENCE_THRESHOLD,
    )
    print(list(my_model_output_metadata_map.values())[:10])
    output_dir = "/tmp/dollar_street_models"
    output_file = f"{output_dir}/my_model_output_metadata_map.json"
    save_file(my_model_output_metadata_map, output_file)
    test_data_save = load_file(output_file)
    print(len(test_data_save))
    return output_dir
Example #9
def launch_benchmark_suite_scheduler(config_file):
    assert g_pathmgr.exists(
        config_file), "Slurm evaluator config file must exist"

    user_config = load_file(config_file)
    config = _DEFAULT_CONFIG.copy()
    recursive_dict_merge(config, user_config)

    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler)
    executor = submitit.AutoExecutor(
        folder=benchmark_suite_scheduler.evaluation_dir())

    assert "slurm_options" in config, "slurm_options must be specified"
    assert (
        "PARTITION" in config["slurm_options"]
    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

    slurm_options = AttrDict(config["slurm_options"])
    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")
Example #10
    def __init__(self, dataset: str, dir_main: str, num_samples=None):
        # Credits: https://github.com/filipradenovic/revisitop/blob/master/python/dataset.py#L6     # NOQA

        self.DATASETS = ["roxford5k", "rparis6k"]
        dataset = dataset.lower()
        assert is_revisited_dataset(dataset), f"Unknown dataset: {dataset}!"

        # load imlist, qimlist, and gnd into cfg as a dict
        gnd_fname = f"{dir_main}/{dataset}/gnd_{dataset}.pkl"
        cfg = load_file(gnd_fname)
        cfg["gnd_fname"] = gnd_fname
        cfg["ext"] = ".jpg"
        cfg["qext"] = ".jpg"

        cfg["dir_data"] = f"{dir_main}/{dataset}"
        cfg["dir_images"] = f"{cfg['dir_data']}/jpg"

        cfg["n"] = len(cfg["imlist"])
        cfg["nq"] = len(cfg["qimlist"])

        cfg["dataset"] = dataset

        self.cfg = cfg

        self.N_images = self.cfg["n"]
        self.N_queries = self.cfg["nq"]

        if num_samples is not None:
            self.N_queries = min(self.N_queries, num_samples)
            self.N_images = min(self.N_images, num_samples)

        logging.info(f"Dataset: {dataset}, images: {self.get_num_images()}, "
                     f"queries: {self.get_num_query_images()}")
Example #11
def train_voc07_low_shot(
    k_values: List[int],
    sample_inds: List[int],
    output_dir: str,
    layername: str,
    cfg: AttrDict,
):
    dataset_name = cfg["SVM"]["low_shot"]["dataset_name"]
    low_shot_trainer = SVMLowShotTrainer(cfg["SVM"],
                                         layer=layername,
                                         output_dir=output_dir)
    train_data = merge_features(output_dir, "train", layername)
    train_features, train_targets = train_data["features"], train_data["targets"]
    test_data = merge_features(output_dir, "test", layername)
    test_features, test_targets = test_data["features"], test_data["targets"]
    # now we want to create the low-shot samples based on the kind of dataset.
    # We only create low-shot samples for training. We test on the full dataset.
    generate_low_shot_samples(dataset_name, train_targets, k_values,
                              sample_inds, output_dir, layername)
    # Now, we train and test the low-shot SVM for every sample and k-value.
    for sample_num in sample_inds:
        for low_shot_kvalue in k_values:
            train_targets = load_file(
                f"{output_dir}/{layername}_sample{sample_num}_k{low_shot_kvalue}.npy"
            )
            low_shot_trainer.train(train_features, train_targets, sample_num,
                                   low_shot_kvalue)
            low_shot_trainer.test(test_features, test_targets, sample_num,
                                  low_shot_kvalue)
    # now we aggregate the stats across all independent samples and for each
    # k-value and report mean/min/max/std stats
    results = low_shot_trainer.aggregate_stats(k_values, sample_inds)
    logging.info("All Done!")
    return results
Example #12
 def process_train_image(i, out_dir):
     if i % LOG_FREQUENCY == 0:
         logging.info(f"Train Image: {i}"),
     fname_out = f"{out_dir}/{i}.npy"
     if PathManager.exists(fname_out):
         feat = load_file(fname_out)
         train_features.append(feat)
     else:
         fname_in = train_dataset.get_filename(i)
         if is_revisited_dataset(train_dataset_name):
             img = image_helper.load_and_prepare_revisited_image(fname_in)
         elif is_whiten_dataset(train_dataset_name):
             img = image_helper.load_and_prepare_whitening_image(fname_in)
         else:
             img = image_helper.load_and_prepare_image(fname_in, roi=None)
         v = torch.autograd.Variable(img.unsqueeze(0))
         vc = v.cuda()
         # the model output is a list always.
         activation_map = model(vc)[0].cpu()
         # once we have the features,
         # we can perform: rmac | gem pooling | l2 norm
         if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "rmac":
             descriptors = get_rmac_descriptors(activation_map,
                                                spatial_levels)
         else:
             descriptors = activation_map
         save_file(descriptors.data.numpy(), fname_out)
         train_features.append(descriptors.data.numpy())
Example #13
    def _load_training_config(self):
        # Load training yaml config.
        self.training_config = load_file(self.training_checkpoint_file)
        self.training_config = AttrDict(self.training_config)

        logging.info(
            f"Loaded training checkpoint config from: { self.training_checkpoint_file }"
        )
Example #14
 def _load_perms(self):
     assert PathManager.exists(
         self.perm_file), f"Permutation file NOT found: {self.perm_file}"
     logging.info(f"Loading permutation: {self.perm_file}")
     self.perms = load_file(self.perm_file)
     if np.min(self.perms) == 1:
         self.perms = self.perms - 1
     logging.info(f"Loaded perm: {self.perms.shape}")
     self.perm_loaded = True
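
The "- 1" adjustment converts permutation files that are stored 1-indexed into the 0-indexed arrays the code expects; a toy round trip:

import numpy as np

perms = np.array([[1, 3, 2], [2, 1, 3]])  # hypothetical 1-indexed permutations
if np.min(perms) == 1:
    perms = perms - 1
assert perms.min() == 0 and perms.max() == 2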
Example #15
def generate_openimages_disentangle_analysis(
    common_map,
    openimages_miap_predictions,
    topk=1,
    confidence_threshold=0.0,
    DISPARATE_THRESHOLD=0.8,
    DISPARATE_LABELS_LIST=None,
    LABEL_ASSOC_MAPPING=None,
):
    DISPARATE_LABELS_LIST = DISPARATE_LABELS_LIST or []
    LABEL_ASSOC_MAPPING = LABEL_ASSOC_MAPPING or {}
    openimages_miap_predictions.update(common_map)
    print(
        f"======================== {openimages_miap_predictions['model_name']} ============================"
    )
    class_to_label_name_map = load_file(
        "https://dl.fbaipublicfiles.com/vissl/fairness/label_association/in22k_cls_name_to_label_name_map.json"
    )
    in22k_subset_label_name_map = {
        key: value[0]
        for key, value in class_to_label_name_map.items()
    }
    (
        output_metadata,
        label_to_id,
    ) = load_in22k_subset_opeinimages_miap_labels_images_preds_metadata(
        openimages_miap_predictions, confidence_threshold=confidence_threshold)

    (
        sorted_output_attributes_pred_freq_map,
        output_attributes_count_map,
        output_attributes_pred_rate_difference_map,
        output_attributes_img_map,
        attribute_label_assoc_map,
        output_attributes_confidence_score_map,
        output_mean_attributes_confidence_score_map,
        output_attributes_label_assoc_conf_scores_map,
        output_attribute_disparate_label_map,
    ) = get_per_attribute_predictions_freq(
        output_metadata,
        label_to_id,
        LABEL_ASSOC_MAPPING,
        DISPARATE_LABELS_LIST,
        DISPARATE_THRESHOLD,
        in22k_subset_label_name_map,
        topk=topk,
    )

    _ = convert_and_print_dataframe(
        attribute_label_assoc_map,
        openimages_miap_predictions["model_name"],
        label_assoc_mapping=LABEL_ASSOC_MAPPING,
        threshold=confidence_threshold,
        topk=topk,
    )
    return output_attributes_img_map, output_metadata, attribute_label_assoc_map
Example #16
def get_queries_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_queries = []
    num_queries = eval_dataset.get_num_query_images()
    if cfg.IMG_RETRIEVAL.DEBUG_MODE:
        num_queries = 50
    logging.info(f"Getting features for queries: {num_queries}")
    q_fname_out_dir = "{}/{}_S{}_q".format(temp_dir, eval_dataset_name,
                                           resize_img)
    makedir(q_fname_out_dir)

    for idx in range(num_queries):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Query: {idx}"),
        q_fname_in = eval_dataset.get_query_filename(idx)
        roi = eval_dataset.get_query_roi(idx)
        q_fname_out = f"{q_fname_out_dir}/{idx}.npy"
        if PathManager.exists(q_fname_out):
            query_feature = load_file(q_fname_out)
        else:
            query_feature = process_eval_image(
                cfg,
                q_fname_in,
                roi,
                q_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
            )
        features_queries.append(query_feature)

    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
        # GeM pool the features and apply the PCA
        gem_out_fname = f"{q_fname_out_dir}/{eval_dataset_name}_GeM.npy"
        features_queries = torch.tensor(np.concatenate(features_queries))
        features_queries = gem_pool_and_save_features(
            features_queries,
            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
            add_bias=True,
            gem_out_fname=gem_out_fname,
        )
        features_queries = pca.apply(features_queries)
    features_queries = np.vstack(features_queries)
    logging.info(f"features queries: {features_queries.shape}")
    return features_queries
Example #17
    def process_train_image(i, out_dir, verbose=False):
        if i % LOG_FREQUENCY == 0:
            logging.info(f"Train Image: {i}"),

        fname_out = None
        if out_dir:
            fname_out = f"{out_dir}/{i}.npy"

        if fname_out and PathManager.exists(fname_out):
            feat = load_file(fname_out)
            train_features.append(feat)
        else:
            fname_in = train_dataset.get_filename(i)
            if is_revisited_dataset(train_dataset_name):
                img = image_helper.load_and_prepare_revisited_image(fname_in,
                                                                    roi=None)
            elif is_whiten_dataset(train_dataset_name):
                img = image_helper.load_and_prepare_whitening_image(fname_in)
            else:
                img = image_helper.load_and_prepare_image(fname_in, roi=None)
            v = torch.autograd.Variable(img.unsqueeze(0))
            vc = v.cuda()
            # the model output is a list always.
            activation_map = model(vc)[0].cpu()

            if verbose:
                print(
                    f"Train Image raw activation map shape: { activation_map.shape }"
                )

            # once we have the features,
            # we can perform: rmac | gem pooling | l2 norm
            if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "rmac":
                descriptors = get_rmac_descriptors(
                    activation_map,
                    spatial_levels,
                    normalize=cfg.IMG_RETRIEVAL.NORMALIZE_FEATURES,
                )
            elif cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
                descriptors = gem(
                    activation_map,
                    p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
                    add_bias=True,
                )
            else:
                descriptors = activation_map

            # Optionally l2 normalize the features.
            if (cfg.IMG_RETRIEVAL.NORMALIZE_FEATURES
                    and cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE != "rmac"):
                # RMAC performs normalization within the algorithm, hence we skip it here.
                descriptors = l2n(descriptors, dim=1)

            if fname_out:
                save_file(descriptors.data.numpy(), fname_out, verbose=False)
            train_features.append(descriptors.data.numpy())
Example #18
def gem_pool_and_save_features(features, p, add_bias, gem_out_fname):
    if PathManager.exists(gem_out_fname):
        logging.info("Loading train GeM features...")
        features = load_file(gem_out_fname)
    else:
        logging.info(f"GeM pooling features: {features.shape}")
        features = l2n(gem(features, p=p, add_bias=add_bias))
        save_file(features, gem_out_fname)
        logging.info(f"Saved GeM features to: {gem_out_fname}")
    return features
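
Here gem is VISSL's generalized-mean pooling. For reference, a minimal GeM over an N x C x H x W activation map can be sketched as below; this is the standard formulation, not VISSL's exact implementation, and it omits the add_bias handling:

import torch

def gem_sketch(x: torch.Tensor, p: float = 3.0, eps: float = 1e-6) -> torch.Tensor:
    # Generalized mean pooling: ((1/HW) * sum(x^p))^(1/p) per channel.
    # p=1 recovers average pooling; p -> inf approaches max pooling.
    return x.clamp(min=eps).pow(p).mean(dim=(-2, -1)).pow(1.0 / p)

x = torch.rand(2, 512, 7, 7)
assert gem_sketch(x).shape == (2, 512)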
Example #19
def get_queries_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_queries = []
    num_queries = eval_dataset.get_num_query_images()

    num_queries = (num_queries if cfg.IMG_RETRIEVAL.NUM_QUERY_SAMPLES == -1
                   else cfg.IMG_RETRIEVAL.NUM_QUERY_SAMPLES)

    logging.info(f"Getting features for queries: {num_queries}")
    q_fname_out_dir = None
    if temp_dir:
        q_fname_out_dir = f"{temp_dir}/{eval_dataset_name}_S{resize_img}_q"
        makedir(q_fname_out_dir)

    for idx in range(num_queries):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Query: {idx}"),
        q_fname_in = eval_dataset.get_query_filename(idx)
        # Optionally crop the query by the region-of-interest (ROI).
        roi = (eval_dataset.get_query_roi(idx)
               if cfg.IMG_RETRIEVAL.CROP_QUERY_ROI else None)

        q_fname_out = None
        if q_fname_out_dir:
            q_fname_out = f"{q_fname_out_dir}/{idx}.npy"

        if q_fname_out and PathManager.exists(q_fname_out):
            query_feature = load_file(q_fname_out)
        else:
            query_feature = process_eval_image(
                cfg,
                q_fname_in,
                roi,
                q_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
                verbose=(idx == 0),
            )
        features_queries.append(query_feature)

    features_queries = np.vstack(features_queries)
    logging.info(f"Queries Features Size: {features_queries.shape}")
    return features_queries
Example #20
def generate_in22k_subset_cc_face_crops_disentangle_analysis(
    common_map,
    cc_face_crops_predictions,
    topk=1,
    confidence_threshold=0.0,
    DISPARATE_THRESHOLD=0.8,
    disparate_labels_list=None,
    label_assoc_mapping=None,
):
    disparate_labels_list = disparate_labels_list or []
    label_assoc_mapping = label_assoc_mapping or {}
    cc_face_crops_predictions.update(common_map)
    print(
        f"============================================================================ {cc_face_crops_predictions['model_name']} ============================================================================"
    )
    class_to_label_name_map = load_file(
        "https://dl.fbaipublicfiles.com/vissl/fairness/label_association/in22k_cls_name_to_label_name_map.json"
    )
    in22k_subset_label_name_map = {
        key: value[0]
        for key, value in class_to_label_name_map.items()
    }
    (output_metadata,
     label_to_id) = load_cc_face_crops_labels_images_preds_metadata(
         cc_face_crops_predictions, confidence_threshold=confidence_threshold)

    (
        sorted_output_attributes_pred_freq_map,
        output_attributes_count_map,
        output_attributes_pred_rate_difference_map,
        output_attributes_img_map,
        attribute_label_assoc_map,
        output_attributes_confidence_score_map,
        output_mean_attributes_confidence_score_map,
        output_attributes_label_assoc_conf_scores_map,
        output_attribute_disparate_label_map,
    ) = get_per_attribute_predictions_freq(
        output_metadata,
        label_to_id,
        label_assoc_mapping,
        disparate_labels_list,
        DISPARATE_THRESHOLD,
        in22k_subset_label_name_map,
        topk=topk,
    )

    _ = convert_and_print_dataframe(
        attribute_label_assoc_map,
        cc_face_crops_predictions["model_name"],
        label_assoc_mapping=label_assoc_mapping,
        threshold=confidence_threshold,
        topk=topk,
    )
    return output_attributes_img_map, output_metadata, attribute_label_assoc_map
Example #21
def load_labels_metadata_predictions_images(input_data, confidence_threshold=0.0):
    # load the image paths
    image_paths = load_file(input_data["image_paths"])
    print(f"Number of image_paths: {image_paths.shape}\n")

    # load the predictions
    predictions = load_file(input_data["predictions"])
    print(f"Number of predictions: {predictions.shape}\n")

    # load the indices
    indices = load_file(input_data["pred_img_indices"])
    print(f"Number of indices: {indices.shape}\n")

    # load the targets
    targets = load_file(input_data["targets"])
    print(f"Number of targets: {targets.shape}\n")

    # load the metadata
    metadata = load_file(input_data["metadata"])
    if isinstance(metadata, list):
        print(f"metadata: {len(metadata)}")
        print(f"metadata keys: {metadata[0].keys()}")
    else:
        print(f"metadata: {list(metadata.values())[0].keys()}")

    # load the label id map
    id_to_label = load_file(input_data["id_to_label_map"])
    print(f"Loaded label_to_id and generated id_to_label map: {len(id_to_label)}")

    # load the confidence scores if provided
    filtered_confidence_scores = []
    if "pred_confidence_scores" in input_data:
        confidence_scores = load_file(input_data["pred_confidence_scores"])
        filtered_confidence_scores, out_predictions = [], []
        for img_idx in range(len(predictions)):
            img_predictions, img_scores = [], []
            for pred_idx in predictions[img_idx]:
                if confidence_scores[img_idx][pred_idx] >= confidence_threshold:
                    img_predictions.append(pred_idx)
                    img_scores.append(
                        str(round(confidence_scores[img_idx][pred_idx], 5))
                    )
            filtered_confidence_scores.append(img_scores)
            out_predictions.append(img_predictions)
        predictions = out_predictions
        print(f"Confidence scores: {len(filtered_confidence_scores)}\n")

    return (
        image_paths,
        predictions,
        indices,
        targets,
        metadata,
        id_to_label,
        filtered_confidence_scores,
    )
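
The confidence filtering keeps, per image, only the predicted class ids whose score clears the threshold; on toy data:

import numpy as np

predictions = [[0, 2], [1]]  # top-k predicted class ids per image
confidence_scores = np.array([[0.9, 0.1, 0.4], [0.2, 0.7, 0.1]])
confidence_threshold = 0.5

out_predictions = []
for img_idx, img_preds in enumerate(predictions):
    out_predictions.append(
        [p for p in img_preds
         if confidence_scores[img_idx][p] >= confidence_threshold]
    )
assert out_predictions == [[0], [1]]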
Example #22
def get_dataset_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_dataset = []
    num_images = eval_dataset.get_num_images()
    logging.info(f"Getting features for dataset images: {num_images}")
    db_fname_out_dir = "{}/{}_S{}_db".format(temp_dir, eval_dataset_name,
                                             resize_img)
    makedir(db_fname_out_dir)

    for idx in range(num_images):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Dataset Image: {idx}"),
        db_fname_in = eval_dataset.get_filename(idx)
        db_fname_out = f"{db_fname_out_dir}/{idx}.npy"
        if PathManager.exists(db_fname_out):
            db_feature = load_file(db_fname_out)
        else:
            db_feature = process_eval_image(
                cfg,
                db_fname_in,
                None,
                db_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
            )
        features_dataset.append(db_feature)

    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
        # GeM pool the features and apply the PCA
        gem_out_fname = f"{db_fname_out_dir}/{eval_dataset_name}_GeM.npy"
        features_dataset = torch.tensor(np.concatenate(features_dataset))
        features_dataset = gem_pool_and_save_features(
            features_dataset,
            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
            add_bias=True,
            gem_out_fname=gem_out_fname,
        )
        features_dataset = pca.apply(features_dataset)
    features_dataset = np.vstack(features_dataset)
    logging.info(f"features dataset: {features_dataset.shape}")
    return features_dataset
Example #23
    def _load_data(self, path):
        if self.data_source == "disk_filelist":
            if self.cfg["DATA"][self.split].MMAP_MODE:
                self.image_dataset = load_file(path, mmap_mode="r")
            else:
                self.image_dataset = load_file(path)
        elif self.data_source == "disk_folder":
            self.image_dataset = ImageFolder(path)
            logging.info(
                f"Loaded {len(self.image_dataset)} samples from folder {path}")

            # mark as initialized.
            # Creating ImageFolder dataset can be expensive because of repeated os.listdir calls
            # Avoid creating it over and over again.
            self.is_initialized = True
        elif self.data_source == "disk_roi_annotations":
            # we load the annotations and then parse the image paths and the image roi
            self.image_dataset, self.image_roi_bbox = [], []
            json_annotations = load_file(path)
            self.image_dataset = [item["path"] for item in json_annotations]
            self.image_roi_bbox = [item["bbox"] for item in json_annotations]
Example #24
    def _load_evaluation_results_checkpoint(self):
        default_checkpoint = os.path.join(self.evaluation_dir(),
                                          "evaluation_metrics.json")
        checkpoint_file = (default_checkpoint
                           if self.autoload_slurm_evaluator_checkpoint else
                           self.slurm_evaluator_checkpoint)

        evaluation_config = load_file(checkpoint_file)

        logging.info(
            f"Loaded evaluation results checkpoint from: { checkpoint_file }")

        return evaluation_config
Example #25
 def load_feature_shard(cls,
                        paths: ExtractedFeaturesShardPaths,
                        verbose=True,
                        allow_pickle=False) -> ExtractedFeatures:
     """
     Load a shard of the extracted features and return its content:
     features, targets and indices.
     """
     if verbose:
         logging.info(
             f"Loading:\n{paths.feature_file}\n{paths.targets_file}\n{paths.indices_file}"
         )
     return ExtractedFeatures(
         features=load_file(paths.feature_file,
                            verbose=verbose,
                            allow_pickle=allow_pickle),
         targets=load_file(paths.targets_file,
                           verbose=verbose,
                           allow_pickle=allow_pickle),
         indices=load_file(paths.indices_file,
                           verbose=verbose,
                           allow_pickle=allow_pickle),
     )
Example #26
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__, output_dir=get_checkpoint_folder(config))

    # print the configuration used
    print_cfg(config)

    assert config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON, (
        "Feature eval mode is not ON. Can't run train_svm. "
        "Set config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True "
        "in your config or from command line.")

    # extract the features
    if not config.SVM_FEATURES_PATH:
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.SVM_FEATURES_PATH = get_checkpoint_folder(config)

    # Get the names of the layers for which features were extracted. If the
    # user doesn't specify the features to evaluate, we take the full model
    # output and freeze both head and trunk as a precaution.
    layers = get_trunk_output_feature_names(config.MODEL)
    if len(layers) == 0:
        layers = ["heads"]

    output_dir = get_checkpoint_folder(config)
    running_tasks = [
        mp.Process(target=train_svm, args=(config, output_dir, layer))
        for layer in layers
    ]
    for running_task in running_tasks:
        running_task.start()
    for running_task in running_tasks:
        running_task.join()

    # collect the mAP stats for all the layers and report
    output_mAP = []
    for layer in layers:
        try:
            ap_file = f"{output_dir}/{layer}/test_ap.npy"
            output_mAP.append(round(100.0 * np.mean(load_file(ap_file)), 3))
        except Exception:
            output_mAP.append(-1)
    logging.info(f"AP for various layers:\n {layers}: {output_mAP}")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #27
def get_dataset_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_dataset = []
    num_images = eval_dataset.get_num_images()
    logging.info(f"Getting features for dataset images: {num_images}")

    db_fname_out_dir = None
    if temp_dir:
        db_fname_out_dir = f"{temp_dir}/{eval_dataset_name}_S{resize_img}_db"
        makedir(db_fname_out_dir)

    for idx in range(num_images):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Dataset Image: {idx}"),
        db_fname_in = eval_dataset.get_filename(idx)

        db_fname_out = None
        if db_fname_out_dir:
            db_fname_out = f"{db_fname_out_dir}/{idx}.npy"

        if db_fname_out and PathManager.exists(db_fname_out):
            db_feature = load_file(db_fname_out)
        else:
            db_feature = process_eval_image(
                cfg,
                db_fname_in,
                None,
                db_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
                verbose=(idx == 0),
            )
        features_dataset.append(db_feature)

    features_dataset = np.vstack(features_dataset)
    logging.info(f"Dataset Features Size: {features_dataset.shape}")
    return features_dataset
Example #28
def extract_low_shot_features(args: Namespace, cfg: AttrDict, output_dir: str):
    dataset_name = cfg["SVM"]["low_shot"]["dataset_name"]
    k_values = cfg["SVM"]["low_shot"]["k_values"]
    sample_inds = cfg["SVM"]["low_shot"]["sample_inds"]
    if "voc" in dataset_name:
        # extract the features. In case of voc07 low-shot, we extract the
        # features on the full train and test sets; each set has about 5K
        # images.
        launch_distributed(
            cfg,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
    elif "places" in dataset_name:
        # in case of places, since the features can become large, we extract
        # features on smaller subsamples instead
        data_paths, label_paths = dataset_catalog.get_data_files(
            split="TRAIN", dataset_config=cfg["DATA"])
        targets = load_file(label_paths[0])
        logging.info("Generating low-shot samples for Places205...")
        generate_places_low_shot_samples(targets, k_values, sample_inds,
                                         output_dir, data_paths[0])

        test_features_extracted = False
        for idx in sample_inds:
            for k in k_values:
                out_img_file = f"{output_dir}/train_images_sample{idx}_k{k}.npy"
                out_lbls_file = f"{output_dir}/train_labels_sample{idx}_k{k}.npy"
                cfg.DATA.TRAIN.DATA_PATHS = [out_img_file]
                cfg.DATA.TRAIN.LABEL_PATHS = [out_lbls_file]
                cfg.CHECKPOINT.DIR = f"{output_dir}/sample{idx}_k{k}"
                logging.info(
                    f"Extracting features for places low shot: sample{idx}_k{k}"
                )
                # we want to extract the test features only once since the test
                # features are shared by all low-shot setups.
                if test_features_extracted:
                    cfg.TEST_MODEL = False
                launch_distributed(
                    cfg,
                    args.node_id,
                    engine_name="extract_features",
                    hook_generator=default_hook_generator,
                )
                test_features_extracted = True
        # set the test model to true again after feature extraction is done
        cfg.TEST_MODEL = True
    else:
        raise RuntimeError(f"Dataset not recognised: {dataset_name}")
Example #29
 def _load_perms(self):
     if is_url(self.perm_file):
         temp_cache_dir = tempfile.mkdtemp()
         cache_dir = os.path.join(temp_cache_dir, "perm_file_cache")
         cached_url_path = cache_url(url=self.perm_file,
                                     cache_dir=cache_dir)
         self.perm_file = cached_url_path
     assert PathManager.exists(
         self.perm_file), f"Permutation file NOT found: {self.perm_file}"
     logging.info(f"Loading permutation: {self.perm_file}")
     self.perms = load_file(self.perm_file)
     if np.min(self.perms) == 1:
         self.perms = self.perms - 1
     logging.info(f"Loaded perm: {self.perms.shape}")
     self.perm_loaded = True
Example #30
 def get_best_cost_value(self):
     """
     During the SVM training, we write the cross vaildation
     AP value for training at each class and cost value
     combination. We load the AP values and for each
     class, determine the cost value that gives the maximum
     AP. We return the chosen cost values for each class as a
     numpy matrix.
     """
     crossval_ap_file = f"{self.output_dir}/crossval_ap.npy"
     chosen_cost_file = f"{self.output_dir}/chosen_cost.npy"
     if PathManager.exists(crossval_ap_file) and PathManager.exists(
             chosen_cost_file):
         self.chosen_cost = load_file(chosen_cost_file)
         self.train_ap_matrix = load_file(crossval_ap_file)
         return self.chosen_cost
     if self.train_ap_matrix is None:
         num_classes = len(self.cls_list)
         self.train_ap_matrix = np.zeros(
             (num_classes, len(self.costs_list)))
         for cls_num in range(num_classes):
             for cost_idx in range(len(self.costs_list)):
                 cost = self.costs_list[cost_idx]
                 _, ap_out_file = self._get_svm_model_filename(
                     cls_num, cost)
                 self.train_ap_matrix[cls_num][cost_idx] = float(
                     load_file(ap_out_file)[0])
     argmax_cls = np.argmax(self.train_ap_matrix, axis=1)
     chosen_cost = [self.costs_list[idx] for idx in argmax_cls]
     logging.info(f"chosen_cost: {chosen_cost}")
     save_file(np.array(self.train_ap_matrix), crossval_ap_file)
     save_file(np.array(chosen_cost), chosen_cost_file)
     logging.info(f"saved crossval_ap AP to file: {crossval_ap_file}")
     logging.info(f"saved chosen costs to file: {chosen_cost_file}")
     self.chosen_cost = chosen_cost
     return np.array(chosen_cost)
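
The cost selection reduces to a per-class argmax over the cross-validation AP matrix; on a toy 2-class x 3-cost example:

import numpy as np

costs_list = [0.01, 0.1, 1.0]
train_ap_matrix = np.array([
    [0.50, 0.65, 0.60],  # class 0 peaks at cost 0.1
    [0.70, 0.72, 0.80],  # class 1 peaks at cost 1.0
])
argmax_cls = np.argmax(train_ap_matrix, axis=1)
chosen_cost = [costs_list[idx] for idx in argmax_cls]
assert chosen_cost == [0.1, 1.0]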