Example 1
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__, output_dir=get_checkpoint_folder(config))

    # print the configuration used
    print_cfg(config)

    assert config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON, (
        "Feature eval mode is not ON. Can't run train_svm. "
        "Set config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True "
        "in your config or from command line.")

    # extract the features
    if not config.SVM_FEATURES_PATH:
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.SVM_FEATURES_PATH = get_checkpoint_folder(config)

    # Get the names of the layers for which we extracted features. If the user doesn't
    # specify the features to evaluate, we use the full model output and freeze
    # both head and trunk as a precaution.
    layers = get_trunk_output_feature_names(config.MODEL)
    if len(layers) == 0:
        layers = ["heads"]

    output_dir = get_checkpoint_folder(config)
    running_tasks = [
        mp.Process(target=train_svm, args=(config, output_dir, layer))
        for layer in layers
    ]
    for running_task in running_tasks:
        running_task.start()
    for running_task in running_tasks:
        running_task.join()

    # collect the mAP stats for all the layers and report
    output_mAP = []
    for layer in layers:
        try:
            ap_file = f"{output_dir}/{layer}/test_ap.npy"
            output_mAP.append(round(100.0 * np.mean(load_file(ap_file)), 3))
        except Exception:
            output_mAP.append(-1)
    logging.info(f"AP for various layers:\n {layers}: {output_mAP}")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example 2
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the configuration used
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Extract the features if no path to the extracted features is provided
    if not config.NEAREST_NEIGHBOR.FEATURES.PATH:
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # Run KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the filehandlers
    shutdown_logging()
Example 3
    def test_run(self, config_file_path: str):
        """
        Instantiate and run all the test tasks

        Arguments:
            config_file_path {str} -- path to the config for the task to be run
        """
        logger.info(f"Loading {config_file_path}")
        cfg = SSLHydraConfig.from_configs([config_file_path])
        args, config = convert_to_attrdict(cfg.default_cfg)
        checkpoint_folder = get_checkpoint_folder(config)

        # Complete the data localization at runtime
        config.DATA.TRAIN.DATA_PATHS = [
            pkg_resources.resource_filename(__name__, "test_data")
        ]

        if torch.distributed.is_initialized():
            # Destroy process groups as torch may be initialized with NCCL, which
            # is incompatible with test_cpu_regnet_moco.yaml
            torch.distributed.destroy_process_group()

        # run training and make sure no exception is raised
        dist_run_id = get_dist_run_id(config, config.DISTRIBUTED.NUM_NODES)
        train_main(
            config,
            dist_run_id=dist_run_id,
            checkpoint_path=None,
            checkpoint_folder=checkpoint_folder,
            local_rank=0,
            node_id=0,
            hook_generator=default_hook_generator,
        )
Example 4
    def test_run(self, config_file_path: str):
        """
        Instantiate and run all the test tasks

        Arguments:
            config_file_path {str} -- path to the config for the task to be run
        """
        logger.info(f"Loading {config_file_path}")
        cfg = SSLHydraConfig.from_configs([config_file_path])
        args, config = convert_to_attrdict(cfg.default_cfg)
        checkpoint_folder = get_checkpoint_folder(config)

        # Complete the data localization at runtime
        config.DATA.TRAIN.DATA_PATHS = [
            pkg_resources.resource_filename(__name__, "test_data")
        ]

        # run training and make sure no exception is raised
        dist_run_id = get_dist_run_id(config, config.DISTRIBUTED.NUM_NODES)
        train_main(
            config,
            dist_run_id=dist_run_id,
            checkpoint_path=None,
            checkpoint_folder=checkpoint_folder,
            local_rank=0,
            node_id=0,
            hook_generator=default_hook_generator,
        )
Example 5
def main(args: Namespace, cfg: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the cfg
    print_cfg(cfg)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    output_dir = get_checkpoint_folder(cfg)

    assert cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON, (
        "Feature eval mode is not ON. Can't run train_svm. "
        "Set config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True "
        "in your config or from command line.")
    extract_low_shot_features(args, cfg, output_dir)

    # Get the names of the layers for which we extracted features. If the user doesn't
    # specify the features to evaluate, we use the full model output and freeze
    # both head and trunk as a precaution.
    layers = get_trunk_output_feature_names(cfg.MODEL)
    if len(layers) == 0:
        layers = ["heads"]

    # train low shot svm for each layer.
    output = {}
    for layer in layers:
        results = train_svm_low_shot(cfg, output_dir, layer)
        output[layer] = results
    logging.info(f"Results: {output}")

    # close the logging streams including the filehandlers
    shutdown_logging()
Example 6
def cluster_features_and_label(args: Namespace, cfg: AttrDict):
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    cluster_backend = cfg.CLUSTERFIT.CLUSTER_BACKEND
    num_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    data_split = cfg.CLUSTERFIT.FEATURES.DATA_PARTITION
    data_name = cfg.CLUSTERFIT.FEATURES.DATASET_NAME
    n_iter = cfg.CLUSTERFIT.N_ITER
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Clustering Features: {features.shape}")

    ########### Step 3: L2 normalize features ###################
    # TODO: we could support PCA here if needed in future.
    logging.info("L2 normalizing the features now...")
    feat_norm = np.linalg.norm(features, axis=1) + 1e-5
    features = features / feat_norm[:, np.newaxis]

    ########### Step 4: Cluster the features ###################
    logging.info("Clustering the features now...")
    assert cluster_backend == "faiss", "Only faiss clustering is supported currently"
    kmeans = faiss.Kmeans(features.shape[1], num_clusters, niter=n_iter, verbose=True)
    kmeans.train(features)
    centroids = kmeans.centroids

    ########### Step 5: Get the cluster assignment for the features ############
    logging.info("Getting cluster label assignment now...")
    distances, hard_cluster_labels = kmeans.index.search(features, 1)

    #### Step 6: Save clustering data and hard cluster labels for the images ###
    data_split = data_split.lower()
    clustering_output_dict = {
        "hard_labels": hard_cluster_labels,
        "centroids": centroids,
        "distances": distances,
    }
    cluster_output_filepath = (
        f"{output_dir}/{data_name}_{data_split}_N{num_clusters}_{cluster_backend}.pkl"
    )
    hard_labels_output_filepath = (
        f"{output_dir}/"
        f"{data_name}_{data_split}_N{num_clusters}_{cluster_backend}_lbls.npy"
    )
    out_hard_labels = np.array(hard_cluster_labels.tolist(), dtype=np.int64).reshape(-1)
    save_file(clustering_output_dict, cluster_output_filepath)
    save_file(out_hard_labels, hard_labels_output_filepath)
    logging.info("All Done!")
Example 7
def main(args: Namespace, cfg: AttrDict):
    setup_logging(__name__, output_dir=get_checkpoint_folder(cfg))

    # Extract the features if feature extraction is enabled
    if cfg.CLUSTERFIT.FEATURES.EXTRACT:

        # We cannot have automatic extraction with more than 1 node, otherwise
        # we would have to run this script on several nodes and thus end up with
        # several parallel clusterings of the features. The automatic extraction
        # is only there as a shortcut when running on a single node.
        assert (cfg.DISTRIBUTED.NUM_NODES == 1
                ), "Automatic extraction can only work with 1 node"

        # Make sure to dump the features at the desired path
        cfg.CHECKPOINT.DIR = cfg.CLUSTERFIT.FEATURES.PATH
        cfg.CHECKPOINT.APPEND_DISTR_RUN_ID = False

        # Run the extraction of features
        set_env_vars(local_rank=0, node_id=0, cfg=cfg)
        logging.info("Setting seed....")
        set_seeds(cfg, args.node_id)
        launch_distributed(
            cfg,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    # Otherwise, set up the path manager (set_env_vars already takes care of
    # this in the feature extraction branch above)
    else:
        setup_path_manager()

    cluster_features(cfg)
    shutdown_logging()
Example 8
def extract_clusters(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the cluster extraction workflow on one node
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We also set the CUDA device here, as a safe way to ensure that all downstream
    # `torch.cuda.current_device()` calls return the correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Build the SSL trainer to set up distributed training and then
    # extract the cluster assignments for all entries in the dataset
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    cluster_assignments = trainer.extract_clusters()

    # Save the cluster assignments in the output folder
    if dist_rank == 0:
        ClusterAssignmentLoader.save_cluster_assignment(
            output_dir=get_checkpoint_folder(cfg),
            assignments=ClusterAssignment(
                config=cfg, cluster_assignments=cluster_assignments),
        )

    # close the logging streams including the file handlers
    logging.info("All Done!")
    shutdown_logging()
Example 9
    def _save_label_cls_idx_map(self, cls_idx_map: Dict[str, int], split: str):
        local_rank, dist_rank = get_machine_local_and_dist_rank()
        if dist_rank == 0:
            checkpoint_folder = get_checkpoint_folder(self.cfg)
            class_idx_file_path = (
                f"{checkpoint_folder}/{split.lower()}_label_to_index_map.json")
            if not g_pathmgr.exists(class_idx_file_path):
                save_file(cls_idx_map,
                          class_idx_file_path,
                          append_to_json=False)
Example 10
def main(args: Namespace, config: AttrDict):
    config = validate_and_infer_config(config)
    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # setup the logging
    checkpoint_folder = get_checkpoint_folder(config)
    setup_logging(__name__, output_dir=checkpoint_folder)

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)
    # close the logging streams including the filehandlers
    shutdown_logging()
Example 11
def get_tensorboard_dir(cfg):
    """
    Get the output directory where the tensorboard events will be written.

    Args:
        cfg (AttrDict): User specified config file containing the settings for the
                        tensorboard as well like log directory, logging frequency etc

    Returns:
        tensorboard_dir (str): output directory path

    """
    checkpoint_folder = get_checkpoint_folder(cfg)
    tensorboard_dir = f"{checkpoint_folder}/tb_logs"
    logging.info(f"Tensorboard dir: {tensorboard_dir}")
    makedir(tensorboard_dir)
    return tensorboard_dir
Example 12
def get_data_features_and_images(cfg: AttrDict):
    output_dir = get_checkpoint_folder(cfg)
    split = cfg.RANKING.FEATURES.DATA_PARTITION
    logging.info("Merging features...")
    # merge the features across all nodes/gpus into one
    feature_data = merge_features(output_dir, split.lower(),
                                  cfg.RANKING.FEATURES.LAYER_NAME)

    logging.info("Getting the image paths...")
    # get the list of image Ids
    dataset = build_dataset(cfg=cfg, split=split)
    feature_image_paths = dataset.get_image_paths()
    # due to multi-modality, we get image_paths as a nested list, one for each
    # dataset. Check it's a list and extract images.
    assert type(feature_image_paths) == list, "Image paths must be a list"
    assert len(feature_image_paths) == 1, "Multi-modality not supported yet!"
    return feature_data, feature_image_paths[0]
Example 13
def main(args: Namespace, config: AttrDict, node_id=0):
    config = validate_and_infer_config(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=node_id, cfg=config)

    # setup the logging
    checkpoint_folder = get_checkpoint_folder(config)
    setup_logging(__name__,
                  output_dir=checkpoint_folder,
                  rank=os.environ["RANK"])

    if (config.IMG_RETRIEVAL.USE_FEATURE_EXTRACTION_ENGINE
            and not config.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR):
        # extract the train/database features.
        config = adapt_train_database_extract_config(config, checkpoint_folder)

        logging.info("Beginning extract features for database set.")
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

        # extract the query features.
        config = adapt_query_extract_config(config, checkpoint_folder)

        logging.info("Beginning extract features for query set.")

        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)
    logging.info(f"Performance time breakdow:\n{PERF_STATS.report_str()}")

    # close the logging streams including the filehandlers
    shutdown_logging()
Example 14
def extract_features_and_run_knn(node_id: int, config: AttrDict):
    setup_logging(__name__)
    print_cfg(config)
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Extract the features if no path to the extracted features is provided
    if not config.NEAREST_NEIGHBOR.FEATURES.PATH:
        launch_distributed(
            config,
            node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # Run KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the file handlers
    shutdown_logging()
Example 15
def infer_losses_config(cfg):
    """
    Infer settings for various self-supervised losses. Takes care of setting various loss
    parameters correctly, such as world size, batch size per GPU, effective global batch
    size, collator, etc.
    Each loss has an additional set of parameters that can be inferred to ensure smooth
    training in case the user forgets to adjust all the parameters.
    """
    # some inference for the Info-NCE loss.
    if "simclr_info_nce_loss" in cfg.LOSS.name:
        cfg.LOSS[cfg.LOSS.name]["buffer_params"]["world_size"] = (
            cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE)

        world_size = cfg.LOSS[cfg.LOSS.name]["buffer_params"]["world_size"]
        batch_size = cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA
        num_positives = 2  # simclr uses 2 copies per image
        cfg.LOSS[cfg.LOSS.name]["buffer_params"]["effective_batch_size"] = (
            num_positives * batch_size * world_size)

    # bce_logits_multiple_output_single_target
    if cfg.LOSS.name == "bce_logits_multiple_output_single_target":
        world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
        cfg.LOSS.bce_logits_multiple_output_single_target.world_size = world_size

    # multicrop version of simclr loss
    if cfg.LOSS.name == "multicrop_simclr_info_nce_loss":
        world_size = cfg.LOSS.multicrop_simclr_info_nce_loss.buffer_params.world_size
        batch_size = cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA
        total_num_crops = cfg.DATA.TRAIN.TRANSFORMS[0]["total_num_crops"]
        cfg.LOSS.multicrop_simclr_info_nce_loss.buffer_params.world_size = world_size
        cfg.LOSS.multicrop_simclr_info_nce_loss.buffer_params.effective_batch_size = (
            batch_size * world_size)
        cfg.LOSS.multicrop_simclr_info_nce_loss.num_crops = total_num_crops
        cfg.DATA.TRAIN.COLLATE_FUNCTION = "multicrop_collator"

    # some inference for the DeepCluster-v2 loss.
    if cfg.LOSS.name == "deepclusterv2_loss":
        cfg.LOSS.deepclusterv2_loss.DROP_LAST = cfg.DATA.TRAIN.DROP_LAST
        cfg.LOSS.deepclusterv2_loss.BATCHSIZE_PER_REPLICA = (
            cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA)
        cfg.LOSS.deepclusterv2_loss.num_crops = cfg.DATA.TRAIN.TRANSFORMS[0][
            "total_num_crops"]
        cfg.DATA.TRAIN.COLLATE_FUNCTION = "multicrop_collator"

    # some inference for the SwAV loss.
    if cfg.LOSS.name == "swav_loss":
        assert len(cfg.MODEL.HEAD.PARAMS) == 1
        assert cfg.MODEL.HEAD.PARAMS[0][0] == "swav_head"
        assert cfg.DATA.TRAIN.COLLATE_FUNCTION in [
            "multicrop_collator",
            "multicrop_mixup_collator",
            "cutmixup_collator",
        ], ("for swav loss, use either a collator from "
            "[multicrop_collator, multicrop_mixup_collator]")
        cfg.LOSS.swav_loss.num_prototypes = cfg.MODEL.HEAD.PARAMS[0][1][
            "num_clusters"]
        cfg.LOSS.swav_loss.embedding_dim = cfg.MODEL.HEAD.PARAMS[0][1]["dims"][
            -1]
        cfg.LOSS.swav_loss.num_crops = cfg.DATA.TRAIN.TRANSFORMS[0][
            "total_num_crops"]
        from vissl.utils.checkpoint import get_checkpoint_folder

        cfg.LOSS.swav_loss.output_dir = get_checkpoint_folder(cfg)
        world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
        batch_size = cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA
        batch_size *= world_size
        queue_length = cfg.LOSS.swav_loss.queue.queue_length
        queue_length -= queue_length % batch_size
        cfg.LOSS.swav_loss.queue.queue_length = queue_length
        cfg.LOSS.swav_loss.queue.local_queue_length = queue_length // world_size

    # some inference for the SwAV momentum loss.
    if cfg.LOSS.name == "swav_momentum_loss":
        assert len(cfg.MODEL.HEAD.PARAMS) == 1
        assert cfg.MODEL.HEAD.PARAMS[0][0] == "swav_head"
        cfg.LOSS.swav_momentum_loss.num_prototypes = cfg.MODEL.HEAD.PARAMS[0][
            1]["num_clusters"]
        cfg.LOSS.swav_momentum_loss.embedding_dim = cfg.MODEL.HEAD.PARAMS[0][
            1]["dims"][-1]
        cfg.LOSS.swav_momentum_loss.num_crops = cfg.DATA.TRAIN.TRANSFORMS[0][
            "total_num_crops"]
        cfg.DATA.TRAIN.COLLATE_FUNCTION = "multicrop_collator"
        world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
        batch_size = cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA
        batch_size *= world_size
        queue_length = cfg.LOSS.swav_momentum_loss.queue.queue_length
        queue_length -= queue_length % batch_size
        cfg.LOSS.swav_momentum_loss.queue.queue_length = queue_length
        cfg.LOSS.swav_momentum_loss.queue.local_queue_length = (queue_length //
                                                                world_size)
    return cfg
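
To make the inferred quantities above concrete, here is a small self-contained sketch of the arithmetic (the node/GPU counts, batch size, and queue length are illustrative assumptions, not VISSL defaults):

# Illustrative arithmetic for the loss inference above (all values are assumptions).
num_nodes, procs_per_node = 2, 8            # DISTRIBUTED.NUM_NODES, NUM_PROC_PER_NODE
batch_size_per_replica = 32                 # DATA.TRAIN.BATCHSIZE_PER_REPLICA
world_size = num_nodes * procs_per_node     # 16

# simclr_info_nce_loss: 2 positives (augmented copies) per image
effective_batch_size = 2 * batch_size_per_replica * world_size   # 1024

# swav_loss: queue length is rounded down to a multiple of the global batch size
global_batch_size = batch_size_per_replica * world_size          # 512
queue_length = 3840
queue_length -= queue_length % global_batch_size                 # 3584
local_queue_length = queue_length // world_size                  # 224
print(effective_batch_size, queue_length, local_queue_length)
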
Example 16
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    The utility function that prepares all the hooks that will be used in training
    based on user selection. Some basic hooks are used by default.

    Optional hooks:
        - Tensorboard hook,
        - loss specific hooks (swav loss, deepcluster loss, moco loss) used only when the
          loss is being used
        - model complexity hook (if user wants to compute model flops, activations, params)
          enable the hook via HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True

    Returns:
        hooks (List[ClassyHook]): list containing the hooks that will be used
    """
    hooks = []

    # conditionally add hooks based on use-case
    if cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS:
        perf_stat_freq = (
            cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY
            if cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY > 0
            else None
        )
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))

    # add the loss hooks based on the loss being used
    hooks = add_loss_hooks(hooks, cfg.LOSS, cfg)

    if cfg.HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.extend([SSLModelComplexityHook()])
    if cfg.HOOKS.LOG_GPU_STATS:
        hooks.extend([LogGpuStatsHook()])
    if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        hooks.extend([LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM)])
    if cfg.HOOKS.MEMORY_SUMMARY.DUMP_MEMORY_ON_EXCEPTION:
        hooks.append(DumpMemoryOnException())
    if cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install tensorboard using:"
            "If pip environment: `pip install tensorboard` "
            "If using conda and you prefer conda install of tensorboard: "
            "`conda install -c conda-forge tensorboard`"
        )
        tb_hook = get_tensorboard_hook(cfg)
        hooks.extend([tb_hook])
    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        hooks.extend(
            [
                GradClipHook(
                    norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE,
                    max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM,
                )
            ]
        )

    # hooks that are used irrespective of workflow type
    rolling_btime_freq = (
        cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ
        if cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ > 0
        else None
    )

    if CudaSynchronizeHook.is_enabled(cfg.MODEL):
        hooks.append(CudaSynchronizeHook())

    if ProfilingHook.is_enabled(cfg.PROFILING):
        hooks.append(ProfilingHook(profiling_config=cfg.PROFILING))

    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    checkpoint_folder = get_checkpoint_folder(cfg)

    hooks.extend(
        [
            SetDataSamplerEpochHook(),
            FreezeParametersHook(),
            LogLossMetricsCheckpointHook(world_size),
            LogLossLrEtaHook(checkpoint_folder, rolling_btime_freq),
        ]
    )

    if cfg.METERS.model_output_mask:
        hooks.extend([ModelOutputMaskHook()])

    if cfg.HOOKS.CHECK_NAN:
        hooks.extend([CheckNanLossHook(), CheckNanModelOutputHook(world_size)])

    return hooks
Example 17
def infer_and_assert_hydra_config(cfg):
    """
    Infer the values of a few parameters in the config file using the values of other
    config parameters:
    1. Infer the losses.
    2. Auto scale the learning rate if the user has enabled auto scaling.
    3. Infer meter names (model layer name being evaluated) since we support list meters
       that have multiple outputs and the same target. This is very common in self-supervised
       learning where we want to evaluate a metric for several layers of the model. VISSL
       supports running evaluation for multiple model layers in a single training run.
    4. Support multi-gpu DDP eval models by attaching a dummy parameter. This is particularly
       helpful for multi-gpu feature extraction, especially when the dataset for which
       features are being extracted is large.
    5. Infer what kind of labels are being used. If the user has specified a label source, we
       set LABEL_TYPE to "standard" (also the vissl default); otherwise, if no label is
       specified, we set LABEL_TYPE to "sample_index".
    """
    cfg = infer_losses_config(cfg)
    cfg = infer_learning_rate(cfg)

    # pass the seed to cfg["MODEL"] so that model init on different nodes can
    # use the same seed.
    # TODO (Min): once FSDP supports sync'ing weights from rank 0, we don't need
    #             this anymore.
    cfg["MODEL"]["_MODEL_INIT_SEED"] = cfg.SEED_VALUE

    # in case of linear evaluation, we often evaluate several layers at a time. For each
    # layer, there's a separate accuracy meter. In such cases, we want to output the layer
    # name in the meters output to make it easy to interpret the results. This is
    # currently only supported for cases where we have linear evaluation.
    if cfg.METERS is not None:
        from vissl.models import is_feature_extractor_model

        meter_name = cfg.METERS.get("name", "")
        valid_meters = ["accuracy_list_meter", "mean_ap_list_meter"]
        if meter_name:
            if meter_name in valid_meters and is_feature_extractor_model(cfg.MODEL):
                cfg.METERS[meter_name]["num_meters"] = len(
                    cfg.MODEL.FEATURE_EVAL_SETTINGS.LINEAR_EVAL_FEAT_POOL_OPS_MAP
                )
                cfg.METERS[meter_name]["meter_names"] = [
                    item[0]
                    for item in cfg.MODEL.FEATURE_EVAL_SETTINGS.LINEAR_EVAL_FEAT_POOL_OPS_MAP
                ]

    # in feature evaluation mode, we freeze the trunk. Feature evaluation mode
    # is also used for feature extraction of the trunk. VISSL supports distributed feature
    # extraction to speed up the extraction time. Since the model needs to be DDP for the
    # distributed extraction, we need some dummy parameters in the model, otherwise the
    # model can't be converted to DDP. So we attach a dummy head to the model.
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    if (
        cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON
        and cfg.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_ONLY
        and cfg.MODEL.FEATURE_EVAL_SETTINGS.EXTRACT_TRUNK_FEATURES_ONLY
        and world_size > 1
        and len(cfg.MODEL.HEAD.PARAMS) == 0
    ):
        cfg.MODEL.HEAD.PARAMS = [["mlp", {"dims": [2048, 1000]}]]

    # in SSL pre-training we don't want to use annotated labels, and during feature
    # extraction we may not have annotated labels for some datasets. In such cases, we set
    # the label type to be just the image index in the dataset, unless the
    # user has specifically provided "zero" as the label type, which is
    # necessary when the CutMixUp collator is being used for self-supervised
    # training.
    if len(cfg.DATA.TRAIN.LABEL_SOURCES) == 0 and cfg.DATA.TRAIN.LABEL_TYPE != "zero":
        cfg.DATA.TRAIN.LABEL_TYPE = "sample_index"
    if len(cfg.DATA.TEST.LABEL_SOURCES) == 0 and cfg.DATA.TEST.LABEL_TYPE != "zero":
        cfg.DATA.TEST.LABEL_TYPE = "sample_index"

    # if the user has specified the model initialization from a params_file, we check if
    # the params_file is a url. If it is, we download the file to a local cache directory
    # and use that instead
    from vissl.utils.checkpoint import get_checkpoint_folder
    from vissl.utils.io import cache_url, is_url

    if is_url(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        checkpoint_dir = get_checkpoint_folder(cfg)
        cache_dir = f"{checkpoint_dir}/params_file_cache/"
        cached_url_path = cache_url(
            url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE, cache_dir=cache_dir
        )
        cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cached_url_path

    # ZeRO2: Infer the settings for ShardedDDP which shards the optimizer state
    # and the model weights. For ShardedDDP, we must use the OSS optimizer,
    # set the right task name, use the PyTorch AMP if AMP is used.
    if cfg.MODEL.SHARDED_DDP_SETUP.USE_SDP:
        cfg.OPTIMIZER.use_zero = True
        cfg.TRAINER.TASK_NAME = "self_supervision_sdp_task"
        if cfg.MODEL.AMP_PARAMS.USE_AMP:
            cfg.MODEL.AMP_PARAMS.AMP_TYPE = "pytorch"

    # if we use a zero optimizer, we nest the optimizer related settings under the
    # base_optimizer.
    if cfg.OPTIMIZER.use_zero:
        cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
        cfg.OPTIMIZER.name = "zero"
        del cfg.OPTIMIZER.base_optimizer["param_schedulers"]
        del cfg.OPTIMIZER.base_optimizer["regularize_bn"]
        del cfg.OPTIMIZER.base_optimizer["regularize_bias"]
        del cfg.OPTIMIZER.base_optimizer["num_epochs"]
        del cfg.OPTIMIZER.base_optimizer["use_zero"]
        del cfg.OPTIMIZER.base_optimizer["head_optimizer_params"]

    # inference for the FSDP settings. Conditions are:
    # 1) use the FSDP task
    # 2) use a single param group in the optimizer
    # 3) if AMP is used, it must be PyTorch AMP
    # 4) if training SwAV, we automatically set the head to the SwAV FSDP head
    # 5) infer the FSDP parameters to ensure good convergence
    if cfg.MODEL.FSDP_CONFIG.AUTO_SETUP_FSDP:
        cfg.TRAINER.TASK_NAME = "self_supervision_fsdp_task"
        cfg.OPTIMIZER.construct_single_param_group_only = True

        # safely set flatten_parameters=True for FSDP trainings.
        cfg["MODEL"]["FSDP_CONFIG"]["flatten_parameters"] = True
        # recommended FSDP settings below for good convergence
        cfg["MODEL"]["FSDP_CONFIG"]["compute_dtype"] = "float32"

        # Inference of optimizer configuration
        if cfg["OPTIMIZER"]["use_larc"]:
            cfg["OPTIMIZER"]["name"] = "sgd_fsdp"

        # AMP based inference
        if cfg["MODEL"]["AMP_PARAMS"]["USE_AMP"]:
            cfg["MODEL"]["AMP_PARAMS"]["AMP_TYPE"] = "pytorch"
            cfg["MODEL"]["FSDP_CONFIG"]["mixed_precision"] = True
            cfg["MODEL"]["FSDP_CONFIG"]["fp32_reduce_scatter"] = True
        else:
            # if not using AMP, we can't use mixed_precision as it requires PyTorch AMP
            cfg["MODEL"]["FSDP_CONFIG"]["mixed_precision"] = False
            # if mixed_precision=False, FSDP mandates setting fp32_reduce_scatter=False
            cfg["MODEL"]["FSDP_CONFIG"]["fp32_reduce_scatter"] = False

        # Inference of the head in case of training with FSDP
        for i, head_param in enumerate(cfg.MODEL.HEAD.PARAMS):
            if head_param[0] == "swav_head":
                cfg.MODEL.HEAD.PARAMS[i][0] = "swav_head_fsdp"
            if head_param[0] == "eval_mlp":
                cfg.MODEL.HEAD.PARAMS[i][0] = "eval_mlp_fsdp"
            if head_param[0] == "mlp":
                cfg.MODEL.HEAD.PARAMS[i][0] = "mlp_fsdp"

        # Inference of the trunk in case of training with FSDP
        if cfg.MODEL.TRUNK.NAME == "regnet":
            cfg.MODEL.TRUNK.NAME = "regnet_fsdp"

        # Profiling the communication requires some setup for FSDP
        if cfg.PROFILING.MEMORY_PROFILING.TRACK_BY_LAYER_MEMORY:
            cfg["MODEL"]["FSDP_CONFIG"]["_TRACK_COMMUNICATIONS"] = True

        logging.info(f"Using the FSDP config: {cfg.MODEL.FSDP_CONFIG}")

    # Delete the AUTO_SETUP_FSDP key since we send the FSDP_CONFIG
    # to FSDP from fairscale which doesn't know about AUTO_SETUP_FSDP
    del cfg.MODEL.FSDP_CONFIG["AUTO_SETUP_FSDP"]
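
The ZeRO re-nesting above can be illustrated with a plain dict standing in for the AttrDict config; the field values in this sketch are assumptions and only the structure mirrors the code:

# Plain-dict sketch of the ZeRO re-nesting above (values are assumptions).
optimizer_cfg = {
    "name": "sgd", "use_zero": True, "num_epochs": 100,
    "param_schedulers": {}, "regularize_bn": False,
    "regularize_bias": True, "head_optimizer_params": {},
}
if optimizer_cfg["use_zero"]:
    optimizer_cfg["base_optimizer"] = dict(optimizer_cfg)   # nest a copy of the settings
    optimizer_cfg["name"] = "zero"                          # the wrapper optimizer is "zero"
    for key in ("param_schedulers", "regularize_bn", "regularize_bias",
                "num_epochs", "use_zero", "head_optimizer_params"):
        del optimizer_cfg["base_optimizer"][key]            # top-level-only keys are dropped
print(optimizer_cfg["name"], optimizer_cfg["base_optimizer"])  # zero {'name': 'sgd'}
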
Example 18
def infer_and_assert_hydra_config(cfg, engine_name: str):
    """
    Infer the values of a few parameters in the config file using the values of other
    config parameters:
    1. Infer the losses.
    2. Auto scale the learning rate if the user has enabled auto scaling.
    3. Infer meter names (model layer name being evaluated) since we support list meters
       that have multiple outputs and the same target. This is very common in self-supervised
       learning where we want to evaluate a metric for several layers of the model. VISSL
       supports running evaluation for multiple model layers in a single training run.
    4. Support multi-gpu DDP eval models by attaching a dummy parameter. This is particularly
       helpful for multi-gpu feature extraction, especially when the dataset for which
       features are being extracted is large.
    5. Infer what kind of labels are being used. If the user has specified a label source, we
       set LABEL_TYPE to "standard" (also the vissl default); otherwise, if no label is
       specified, we set LABEL_TYPE to "sample_index".
    """
    cfg = infer_losses_config(cfg)
    cfg = infer_learning_rate(cfg)
    assert_transforms(cfg)

    # pass the seed to cfg["MODEL"] so that model init on different nodes can
    # use the same seed.
    # TODO (Min): once FSDP supports sync'ing weights from rank 0, we don't need
    #             this anymore.
    cfg["MODEL"]["_MODEL_INIT_SEED"] = cfg.SEED_VALUE

    # in case of linear evaluation, we often evaluate several layers at a time. For each
    # layer, there's a separate accuracy meter. In such cases, we want to output the layer
    # name in the meters output to make it easy to interpret the results. This is
    # currently only supported for cases where we have linear evaluation.
    if cfg.METERS is not None:
        from vissl.models import is_feature_extractor_model

        # Ensure backwards compatibility of cfg.METERS.name.
        meter_name = cfg.METERS.get("name", "")
        if meter_name:
            meter_names = set(cfg.METERS.get("names", []))
            meter_names.add(meter_name)
            cfg.METERS.names = list(meter_names)

        meter_names = cfg.METERS.get("names", [])
        valid_meters = [
            "accuracy_list_meter",
            "mean_ap_list_meter",
            "precision_at_k_list_meter",
            "recall_at_k_list_meter",
        ]

        for meter_name in meter_names:
            if meter_name in valid_meters:
                feat_eval_ops_map = (cfg.MODEL.FEATURE_EVAL_SETTINGS.
                                     LINEAR_EVAL_FEAT_POOL_OPS_MAP)
                all_meter_names = [item[0] for item in feat_eval_ops_map]
                if is_feature_extractor_model(cfg.MODEL):
                    cfg.METERS[meter_name]["num_meters"] = len(
                        feat_eval_ops_map)
                    cfg.METERS[meter_name]["meter_names"] = all_meter_names
                elif engine_name == "extract_label_predictions":
                    if len(feat_eval_ops_map) > 0:
                        cfg.METERS[meter_name]["num_meters"] = len(
                            feat_eval_ops_map)
                        cfg.METERS[meter_name]["meter_names"] = all_meter_names
                    else:
                        # if user is not extracting from multiple layers, we assume
                        # the model head is being used.
                        cfg.METERS[meter_name]["num_meters"] = 1

    # in SSL pre-training we don't want to use annotated labels, and during feature
    # extraction we may not have annotated labels for some datasets. In such cases, we set
    # the label type to be just the image index in the dataset, unless the
    # user has specifically provided "zero" as the label type, which is
    # necessary when the CutMixUp collator is being used for self-supervised
    # training.
    if len(cfg.DATA.TRAIN.LABEL_SOURCES
           ) == 0 and cfg.DATA.TRAIN.LABEL_TYPE != "zero":
        cfg.DATA.TRAIN.LABEL_TYPE = "sample_index"
    if len(cfg.DATA.TEST.LABEL_SOURCES
           ) == 0 and cfg.DATA.TEST.LABEL_TYPE != "zero":
        cfg.DATA.TEST.LABEL_TYPE = "sample_index"

    # if the user has specified the model initialization from a params_file, we check if
    # the params_file is a url. If it is, we download the file to a local cache directory
    # and use that instead
    from vissl.utils.checkpoint import get_checkpoint_folder
    from vissl.utils.io import cache_url, is_url

    if is_url(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        checkpoint_dir = get_checkpoint_folder(cfg)
        cache_dir = f"{checkpoint_dir}/params_file_cache/"
        cached_url_path = cache_url(url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE,
                                    cache_dir=cache_dir)
        cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cached_url_path

    # ZeRO2: Infer the settings for ShardedDDP which shards the optimizer state
    # and the model weights. For ShardedDDP, we must use the OSS optimizer,
    # set the right task name, use the PyTorch AMP if AMP is used.
    if cfg.MODEL.SHARDED_DDP_SETUP.USE_SDP:
        cfg.OPTIMIZER.use_zero = True
        cfg.TRAINER.TASK_NAME = "self_supervision_sdp_task"
        if cfg.MODEL.AMP_PARAMS.USE_AMP:
            cfg.MODEL.AMP_PARAMS.AMP_TYPE = "pytorch"

    # if we use a zero optimizer, we nest the optimizer related settings under the
    # base_optimizer.
    if cfg.OPTIMIZER.use_zero:
        cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
        cfg.OPTIMIZER.name = "zero"
        del cfg.OPTIMIZER.base_optimizer["param_schedulers"]
        del cfg.OPTIMIZER.base_optimizer["regularize_bn"]
        del cfg.OPTIMIZER.base_optimizer["regularize_bias"]
        del cfg.OPTIMIZER.base_optimizer["num_epochs"]
        del cfg.OPTIMIZER.base_optimizer["use_zero"]
        del cfg.OPTIMIZER.base_optimizer["head_optimizer_params"]

    # Infer fsdp settings
    cfg = infer_fsdp_setup(cfg)

    if cfg.DATA.TRAIN.BASE_DATASET == "generic_ssl":
        assert (
            cfg.DATA.TRAIN.get("TRAIN_PHASES_PER_EPOCH", 1) == 1
        ), "When using the generic_ssl, we must set TRAIN_PHASES_PER_EPOCH = 1."

    if cfg.METERS.model_output_mask:
        assert (
            len(cfg.DATA.TEST.DATA_SOURCES) > 0
        ), "Model output mask is only applicable when there is a test dataset."

        assert (cfg.DATA.TEST.BASE_DATASET == "generic_ssl"
                ), "Model output mask is only supported with ssl dataset."

        # Remove CHECK_NAN hooks, as model output masking casts the logits
        # to -inf, which will throw an error from the CHECK_NAN hooks.
        cfg.HOOKS.CHECK_NAN = False

    if cfg.HOOKS.EMA_MODEL.ENABLE_EMA_METERS:
        assert cfg.METERS.get("name", "") or cfg.METERS.get(
            "names", []
        ), "Please specify METER.name or METER.names if you are enabling the EMA_MODEL hook."
Example 19
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus, according to the cfg

    Args:
        cfg  -- VISSL yaml configuration
        node_id -- node_id for this node
        engine_name -- what engine to run: train or extract_features
        hook_generator -- Callback to generate all the ClassyVision hooks for this engine
    """
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    copy_to_local(cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(
            f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to load from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
            symlink_checkpoint_path):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder)

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )

    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: ", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        cleanup_local_dir(cfg)

    logging.info("All Done!")
Example 20
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    The utility function that prepares all the hooks that will be used in training
    based on user selection. Some basic hooks are used by default.

    Optional hooks:
        - Tensorboard hook,
        - loss specific hooks (swav loss, deepcluster loss, moco loss) used only when the
          loss is being used
        - model complexity hook (if user wants to compute model flops, activations, params)
          enable the hook via HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True

    Returns:
        hooks (List[ClassyHook]): list containing the hooks that will be used
    """
    hooks = []

    # conditionally add hooks based on use-case
    if cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS:
        perf_stat_freq = (cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY
                          if cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY > 0 else
                          None)
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))
    if cfg.LOSS.name == "swav_loss":
        hooks.extend([SwAVUpdateQueueScoresHook(), NormalizePrototypesHook()])
    if cfg.LOSS.name == "swav_momentum_loss":
        hooks.extend([
            SwAVMomentumHook(
                cfg.LOSS["swav_momentum_loss"]["momentum"],
                cfg.LOSS["swav_momentum_loss"]
                ["momentum_eval_mode_iter_start"],
                cfg.LOSS["swav_momentum_loss"]["crops_for_assign"],
            ),
            SwAVMomentumNormalizePrototypesHook(),
        ])
    if cfg.LOSS.name == "deepclusterv2_loss":
        hooks.extend([InitMemoryHook(), ClusterMemoryHook()])
    if cfg.LOSS.name == "moco_loss":
        hooks.extend([
            MoCoHook(
                cfg.LOSS["moco_loss"]["momentum"],
                shuffle_batch=(
                    not cfg.MODEL.SYNC_BN_CONFIG.CONVERT_BN_TO_SYNC_BN),
            )
        ])
    if cfg.HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.extend([SSLModelComplexityHook()])
    if cfg.HOOKS.LOG_GPU_STATS:
        hooks.extend([LogGpuStatsHook()])
    if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        hooks.extend(
            [LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM)])
    if cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install tensorboard using:"
            "If pip environment: `pip install tensorboard` "
            "If using conda and you prefer conda install of tensorboard: "
            "`conda install -c conda-forge tensorboard`")
        tb_hook = get_tensorboard_hook(cfg)
        hooks.extend([tb_hook])
    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        hooks.extend([
            GradClipHook(
                norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE,
                max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM,
            )
        ])

    # hooks that are used irrespective of workflow type
    rolling_btime_freq = (cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ
                          if cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ > 0 else
                          None)

    if ProfilingHook.is_enabled(cfg.PROFILING):
        hooks.append(ProfilingHook(profiling_config=cfg.PROFILING))

    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    checkpoint_folder = get_checkpoint_folder(cfg)
    hooks.extend([
        CheckNanLossHook(),
        SetDataSamplerEpochHook(),
        FreezeParametersHook(),
        UpdateBatchesSeenHook(),
        UpdateTrainBatchTimeHook(),
        UpdateTestBatchTimeHook(),
        UpdateTrainIterationNumHook(),
        LogLossMetricsCheckpointHook(world_size),
        LogLossLrEtaHook(checkpoint_folder, rolling_btime_freq),
    ])
    return hooks
Example 21
def instance_retrieval_test(args, cfg):
    # We require 1-gpu for feature extraction. Hence check CUDA is available.
    assert torch.cuda.is_available(), "CUDA not available, Exit!"

    train_dataset_name = cfg.IMG_RETRIEVAL.TRAIN_DATASET_NAME
    eval_dataset_name = cfg.IMG_RETRIEVAL.EVAL_DATASET_NAME
    spatial_levels = cfg.IMG_RETRIEVAL.SPATIAL_LEVELS
    resize_img = cfg.IMG_RETRIEVAL.RESIZE_IMG
    eval_binary_path = cfg.IMG_RETRIEVAL.EVAL_BINARY_PATH
    root_dataset_path = cfg.IMG_RETRIEVAL.DATASET_PATH
    save_features = cfg.IMG_RETRIEVAL.SAVE_FEATURES

    temp_dir = None
    if save_features:
        temp_dir = os.path.join(get_checkpoint_folder(cfg), "features")
        logging.info(f"Temp directory: {temp_dir}")

    ############################################################################
    # Step 1: Prepare the train/eval datasets, create model and load weights
    # We only create the train dataset if we need PCA/whitening, otherwise
    # train_dataset is None
    train_dataset = get_train_dataset(cfg, root_dataset_path,
                                      train_dataset_name, eval_binary_path)

    # create the eval dataset. INSTRE data evaluation requires whitening.
    eval_dataset = get_eval_dataset(cfg, root_dataset_path, eval_dataset_name,
                                    eval_binary_path)

    # Setup the data transforms (basic) that we apply on the train/eval dataset.
    transforms = get_transforms(cfg, eval_dataset_name)

    # Create the image helper
    image_helper = InstanceRetrievalImageLoader(S=resize_img,
                                                transforms=transforms)

    # Build the model on gpu and set in the eval mode
    model = build_retrieval_model(cfg)
    model = copy_model_to_gpu(model)

    logging.info("Freezing the model.....")
    model.eval()
    model.freeze_head_and_trunk()

    ############################################################################
    # Step 2: Extract the features for the train dataset, calculate PCA or
    # whitening and save
    if cfg.IMG_RETRIEVAL.TRAIN_PCA_WHITENING:
        logging.info("Extracting training features...")
        # the features are already processed based on type: rmac | GeM | l2 norm
        train_features = get_train_features(
            cfg,
            temp_dir,
            train_dataset_name,
            resize_img,
            spatial_levels,
            image_helper,
            train_dataset,
            model,
        )
        ########################################################################
        # Train PCA on the train features
        pca_out_fname = None
        if temp_dir:
            pca_out_fname = f"{temp_dir}/{train_dataset_name}_S{resize_img}_PCA.pickle"
        if pca_out_fname and PathManager.exists(pca_out_fname):
            logging.info("Loading PCA...")
            pca = load_pca(pca_out_fname)
        else:
            logging.info("Training and saving PCA...")
            pca = train_and_save_pca(train_features, cfg.IMG_RETRIEVAL.N_PCA,
                                     pca_out_fname)
    else:
        pca = None

    ############################################################################
    # Step 4: Extract db_features and q_features for the eval dataset
    logging.info("Extracting Queries features...")
    features_queries = get_queries_features(
        cfg,
        temp_dir,
        eval_dataset_name,
        resize_img,
        spatial_levels,
        image_helper,
        eval_dataset,
        model,
        pca,
    )
    logging.info("Extracting Dataset features...")
    features_dataset = get_dataset_features(
        cfg,
        temp_dir,
        eval_dataset_name,
        resize_img,
        spatial_levels,
        image_helper,
        eval_dataset,
        model,
        pca,
    )

    ############################################################################
    # Step 5: Compute similarity, score, and save results
    logging.info("Calculating similarity and score...")
    sim = features_queries.dot(features_dataset.T)
    logging.info(f"Similarity tensor: {sim.shape}")
    results = eval_dataset.score(sim, temp_dir)

    ############################################################################
    # Step 6: save results and cleanup the temp directory
    if cfg.IMG_RETRIEVAL.SAVE_RETRIEVAL_RANKINGS_SCORES:
        checkpoint_folder = get_checkpoint_folder(cfg)

        # Save the rankings
        sim = sim.T
        ranks = np.argsort(-sim, axis=0)
        save_file(ranks.T.tolist(),
                  os.path.join(checkpoint_folder, "rankings.json"))

        # Save the similarity scores
        save_file(
            sim.tolist(),
            os.path.join(checkpoint_folder, "similarity_scores.json"),
        )

        # Save the result metrics
        save_file(results, os.path.join(checkpoint_folder, "metrics.json"))

    logging.info("All done!!")
Example 22
def run_knn_at_layer(cfg: AttrDict, layer_name: str = "heads"):
    """
    Run the Nearest Neighbour benchmark at the layer "layer_name"
    """
    temperature = cfg.NEAREST_NEIGHBOR.SIGMA
    num_neighbors = cfg.NEAREST_NEIGHBOR.TOPK
    feature_dir = cfg.NEAREST_NEIGHBOR.FEATURES.PATH
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Testing with sigma: {temperature}, topk neighbors: {num_neighbors}")

    ############################################################################
    # Step 1: get train and test features
    train_out = ExtractedFeaturesLoader.load_features(
        feature_dir, "train", layer_name, flatten_features=True
    )
    train_features, train_labels = train_out["features"], train_out["targets"]
    test_out = ExtractedFeaturesLoader.load_features(
        feature_dir, "test", layer_name, flatten_features=True
    )
    test_features, test_labels = test_out["features"], test_out["targets"]
    train_features = torch.from_numpy(train_features).float()
    test_features = torch.from_numpy(test_features).float()
    train_labels = torch.LongTensor(train_labels)
    num_classes = train_labels.max() + 1

    ###########################################################################
    # Step 2: calculate the nearest neighbor and the metrics
    accuracies = Accuracies()
    if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
        train_features = nn.functional.normalize(train_features, dim=1, p=2)
        test_features = nn.functional.normalize(test_features, dim=1, p=2)

    # put train features and labels on gpu and transpose train features
    if cfg.NEAREST_NEIGHBOR.USE_CUDA:
        train_features = train_features.cuda().t()
        test_features = test_features.cuda()
        train_labels = train_labels.cuda()
    else:
        train_features = train_features.t()

    num_test_images, num_chunks = test_labels.shape[0], 100
    imgs_per_chunk = num_test_images // num_chunks
    output_targets, output_predicted_label, output_inds = [], [], []
    with torch.no_grad():
        for idx in range(0, num_test_images, imgs_per_chunk):
            # get the features for test images and normalize the features if needed
            features = test_features[
                idx : min((idx + imgs_per_chunk), num_test_images), :
            ]
            targets = test_labels[idx : min((idx + imgs_per_chunk), num_test_images), :]
            batch_size = targets.shape[0]
            targets = torch.LongTensor(targets)
            if cfg.NEAREST_NEIGHBOR.USE_CUDA:
                targets = torch.LongTensor(targets).cuda()

            # calculate the dot product and compute top-k neighbors
            similarity = torch.mm(features, train_features)
            distances, indices = similarity.topk(
                num_neighbors, largest=True, sorted=True
            )
            candidates = train_labels.view(1, -1).expand(batch_size, -1)
            retrieved_neighbors = torch.gather(candidates, 1, indices)

            retrieval_one_hot = torch.zeros(batch_size * num_neighbors, num_classes)
            if cfg.NEAREST_NEIGHBOR.USE_CUDA:
                retrieval_one_hot = retrieval_one_hot.cuda()
            retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1)
            predictions = _get_sorted_predictions(
                batch_size, num_classes, distances, retrieval_one_hot, temperature
            )

            # find the predictions that match the target
            accuracies = accuracies + Accuracies.from_batch(predictions, targets)

            # get the predictions, nearest neighbors, inds to save
            output_inds.extend(range(idx, min((idx + imgs_per_chunk), num_test_images)))
            output_predicted_label.append(predictions.data.cpu().numpy())
            output_targets.append(targets.data.cpu().numpy())

    _save_knn_results(
        output_dir, layer_name, output_inds, output_predicted_label, output_targets
    )
    accuracies.log(layer_name)
    return accuracies.top_1, accuracies.top_5, accuracies.total
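
The helper `_get_sorted_predictions` is not shown in this snippet. A common form of this temperature-weighted kNN vote (a sketch of what such a helper typically does, not necessarily the exact VISSL implementation) is:

import torch

def weighted_knn_vote(batch_size, num_classes, distances, retrieval_one_hot, temperature):
    # distances: (batch_size, k) similarities of the top-k neighbors
    # retrieval_one_hot: (batch_size * k, num_classes) one-hot labels of those neighbors
    weights = (distances.clone() / temperature).exp()
    probs = torch.sum(
        retrieval_one_hot.view(batch_size, -1, num_classes) * weights.view(batch_size, -1, 1),
        dim=1,
    )                                                      # (batch_size, num_classes) scores
    _, predictions = probs.sort(dim=1, descending=True)    # classes sorted by score
    return predictions
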
Example 23
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus of the current node according to the cfg.

    If more than 1 node is needed for training, this function should be called on each
    of the different nodes, each time with a unique node_id in the range [0..N-1], where N
    is the total number of nodes taking part in training.

    Alternatively, you can use SLURM or any cluster management system to run this function
    for you.

    Configures the node_id and dist_run_id, and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """

    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)

    # If using gpus, check that the user has not requested more gpus than are available on this system.
    if cfg.MACHINE.DEVICE == "gpu":
        assert cfg.DISTRIBUTED.NUM_PROC_PER_NODE <= torch.cuda.device_count(
        ), (f"User system doesn't have requested {cfg.DISTRIBUTED.NUM_PROC_PER_NODE} gpus "
            f"available. Number of gpus found on user system={torch.cuda.device_count()}. "
            "Please set the DISTRIBUTED.NUM_PROC_PER_NODE properly.")

    # set the environment variables including local rank, node id etc.
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)

    # Given the checkpoint folder, check whether a final checkpoint already exists.
    # If it does and the user has not opted to ignore it, the training is considered
    # finished and we exit early.
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(
            f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint where to resume from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and g_pathmgr.exists(
            symlink_checkpoint_path):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder)

    # assert that if the user set the PARAMS_FILE, it must exist and be valid.
    # we only use the PARAMS_FILE init if the checkpoint doesn't exist for the
    # given training. This ensures that if the same training resumes, then it
    # resumes from the checkpoint and not the weight init
    if checkpoint_path is None and cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]:
        params_file = cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]
        error_message = f"Specified PARAMS_FILE does NOT exist: {params_file}"
        assert g_pathmgr.exists(params_file), error_message

    # copy the data to local if user wants. This can speed up dataloading.
    _copy_to_local(cfg)

    try:
        torch.multiprocessing.spawn(
            _distributed_worker,
            nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
            args=(
                cfg,
                node_id,
                dist_run_id,
                engine_name,
                checkpoint_path,
                checkpoint_folder,
                hook_generator,
            ),
            daemon=False,
        )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: ", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()
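A minimal single-node invocation sketch for launch_distributed; the config object cfg and the hook generator are assumed to come from the caller (illustrative only, not a verified entry point):

# Hypothetical usage: node_id=0 for a single machine, running the training engine.
launch_distributed(
    cfg=cfg,                                # a fully composed VISSL AttrDict config
    node_id=0,
    engine_name="train",                    # or "extract_features"
    hook_generator=default_hook_generator,  # assumed: VISSL's standard hook factory
)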
Ejemplo n.º 24
0
def instance_retrieval_test(args, cfg):
    if (cfg.IMG_RETRIEVAL.USE_FEATURE_EXTRACTION_ENGINE
            and not cfg.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR):
        # Feature extraction requires a GPU, hence check that CUDA is available.
        # If FEATURE_EXTRACTION_DIR is provided, the features have already been
        # extracted and no GPU is required.
        assert torch.cuda.is_available(), "CUDA not available, Exit!"

    train_dataset_name = cfg.IMG_RETRIEVAL.TRAIN_DATASET_NAME
    eval_dataset_name = cfg.IMG_RETRIEVAL.EVAL_DATASET_NAME
    spatial_levels = cfg.IMG_RETRIEVAL.SPATIAL_LEVELS
    resize_img = cfg.IMG_RETRIEVAL.RESIZE_IMG
    eval_binary_path = cfg.IMG_RETRIEVAL.EVAL_BINARY_PATH
    root_dataset_path = cfg.IMG_RETRIEVAL.DATASET_PATH
    save_features = cfg.IMG_RETRIEVAL.SAVE_FEATURES
    use_feature_extractor = cfg.IMG_RETRIEVAL.USE_FEATURE_EXTRACTION_ENGINE

    temp_dir = None

    if save_features:
        temp_dir = os.path.join(get_checkpoint_folder(cfg), "features")
        logging.info(f"Temp directory: {temp_dir}")

    ############################################################################
    # Step 1: Prepare the train/eval datasets, create model and load weights
    # We only create the train dataset if we need PCA/whitening; otherwise
    # train_dataset is None.
    train_dataset = get_train_dataset(cfg, root_dataset_path,
                                      train_dataset_name, eval_binary_path)

    # create the eval dataset. INSTRE data evaluation requires whitening.
    eval_dataset = get_eval_dataset(cfg, root_dataset_path, eval_dataset_name,
                                    eval_binary_path)

    # Setup the data transforms (basic) that we apply on the train/eval dataset.
    transforms = get_transforms(cfg, eval_dataset_name)

    # Create the image helper
    image_helper = InstanceRetrievalImageLoader(
        S=resize_img,
        transforms=transforms,
        center_crop=cfg.IMG_RETRIEVAL.CENTER_CROP)

    model = None
    if not use_feature_extractor:
        # Build the model on gpu and set in the eval mode
        model = build_retrieval_model(cfg)
        model = copy_model_to_gpu(model)

        logging.info("Freezing the model.....")
        model.eval()
        model.freeze_head_and_trunk()

    ############################################################################
    # Step 2: Extract the features for the train dataset, calculate PCA or
    # whitening and save
    if cfg.IMG_RETRIEVAL.TRAIN_PCA_WHITENING:
        logging.info("Extracting training features...")
        # the features are already processed based on type: rmac | GeM | l2 norm
        with PerfTimer("get_train_features", PERF_STATS):
            # TODO: encapsulate the approach "WithFeatureExtractor" from the other one.
            if use_feature_extractor:
                input_dir = (cfg.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR
                             or get_checkpoint_folder(cfg))
                input_dir = os.path.join(input_dir, "train_database")
                train_features = load_and_process_features(
                    cfg, input_dir, "train")

            else:
                train_features = extract_train_features(
                    cfg,
                    temp_dir,
                    train_dataset_name,
                    resize_img,
                    spatial_levels,
                    image_helper,
                    train_dataset,
                    model,
                )

            train_features = np.vstack(
                [x.reshape(-1, x.shape[-1]) for x in train_features])

        ########################################################################
        # Train PCA on the train features
        pca_out_fname = None
        if temp_dir:
            pca_out_fname = f"{temp_dir}/{train_dataset_name}_S{resize_img}_PCA.pickle"
        if pca_out_fname and g_pathmgr.exists(pca_out_fname):
            logging.info("Loading PCA...")
            pca = load_pca(pca_out_fname)
        else:
            logging.info("Training and saving PCA...")
            pca = train_and_save_pca(train_features, cfg.IMG_RETRIEVAL.N_PCA,
                                     pca_out_fname)
    else:
        pca = None

    ############################################################################
    # Step 3: Extract db_features and q_features for the eval dataset
    with PerfTimer("get_query_features", PERF_STATS):
        logging.info("Extracting Queries features...")
        # TODO: encapsulate the approach "WithFeatureExtractor" from the other one.
        if use_feature_extractor:
            input_dir = (cfg.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR
                         or get_checkpoint_folder(cfg))
            input_dir = os.path.join(input_dir, "query")
            features_queries = load_and_process_features(
                cfg, input_dir, "test", pca)

        else:
            features_queries = get_queries_features(
                cfg,
                temp_dir,
                eval_dataset_name,
                resize_img,
                spatial_levels,
                image_helper,
                eval_dataset,
                model,
                pca,
            )

        features_queries = np.vstack(features_queries)

    with PerfTimer("get_dataset_features", PERF_STATS):
        logging.info("Extracting Dataset features...")
        # TODO: encapsulate the approach "WithFeatureExtractor" from the other one.
        if use_feature_extractor:
            input_dir = (cfg.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR
                         or get_checkpoint_folder(cfg))
            input_dir = os.path.join(input_dir, "train_database")
            features_dataset = load_and_process_features(
                cfg, input_dir, "test", pca)
        else:
            features_dataset = get_dataset_features(
                cfg,
                temp_dir,
                eval_dataset_name,
                resize_img,
                spatial_levels,
                image_helper,
                eval_dataset,
                model,
                pca,
            )

        features_dataset = np.vstack(features_dataset)

    ############################################################################
    # Step 4: Compute similarity, score, and save results
    with PerfTimer("scoring_results", PERF_STATS):
        logging.info("Calculating similarity and score...")

        if cfg.IMG_RETRIEVAL.SIMILARITY_MEASURE == "cosine_similarity":
            sim = features_queries.dot(features_dataset.T)
        elif cfg.IMG_RETRIEVAL.SIMILARITY_MEASURE == "l2":
            sim = -compute_l2_distance_matrix(features_queries,
                                              features_dataset)
        else:
            raise ValueError(
                f"{ cfg.IMG_RETRIEVAL.SIMILARITY_MEASURE } not supported.")

        logging.info(f"Similarity tensor: {sim.shape}")
        results = eval_dataset.score(sim, temp_dir)

    ############################################################################
    # Step 5: Save results and cleanup the temp directory
    if cfg.IMG_RETRIEVAL.SAVE_RETRIEVAL_RANKINGS_SCORES:
        checkpoint_folder = get_checkpoint_folder(cfg)

        # Save the rankings
        sim = sim.T
        ranks = np.argsort(-sim, axis=0)
        save_file(ranks.T.tolist(),
                  os.path.join(checkpoint_folder, "rankings.json"))

        # Save the similarity scores
        save_file(sim.tolist(),
                  os.path.join(checkpoint_folder, "similarity_scores.json"))
        # Save the result metrics
        save_file(
            results,
            os.path.join(checkpoint_folder, "metrics.json"),
            append_to_json=False,
        )

    logging.info("All done!!")
Ejemplo n.º 25
0
def rank_features(args: Namespace, cfg: AttrDict):
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu")
    import faiss

    ranking_backend = cfg.RANKING.RANKING_BACKEND
    data_split = cfg.RANKING.FEATURES.DATA_PARTITION
    data_name = cfg.RANKING.FEATURES.DATASET_NAME
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Ranking Features: {features.shape}")

    ########### Step 3: Optionally apply PCA and/or L2 normalize features ###################
    if cfg.RANKING.APPLY_PCA:
        logging.info("L2 normalizing the features before PCA...")
        feat_norm = np.linalg.norm(features, axis=1) + 1e-5
        features = features / feat_norm[:, np.newaxis]
        logging.info(f"Projecting down to {cfg.RANKING.PCA_DIM} dims ...")
        features = PCA(
            n_components=cfg.RANKING.PCA_DIM).fit_transform(features)
        logging.info(f"PCA features: {features.shape}")

    if cfg.RANKING.NORMALIZE_FEATS:
        logging.info("L2 normalizing the features now...")
        feat_norm = np.linalg.norm(features, axis=1) + 1e-5
        features = features / feat_norm[:, np.newaxis]

    ########### Step 4: Build the L2 index on the features ###################
    logging.info(
        "Building the L2 index and searching nearest neighbor with faiss now..."
    )
    assert ranking_backend == "faiss", "Only faiss clustering is supported currently"
    if cfg.RANKING.USE_GPU:
        logging.info("Using gpu for faiss indexing...")
        index = faiss.GpuIndexFlatL2(
            faiss.StandardGpuResources(),
            features.shape[1],
        )
    else:
        logging.info("Using CPU for faiss indexing...")
        index = faiss.IndexFlatL2(features.shape[1])
    index.add(features)
    logging.info("Doing the nearest neighbor search now...")
    # Num. neighbors here is 2, so for a given point we find that same point at
    # distance 0, and its nearest neighbor
    distances, nn_indices = index.search(features, 2)
    # Remove distance to self, which is always 0
    distances = [d[1] for d in distances]

    ########### Step 5: Sorting the distances now ############
    logging.info("Sorting and ranking based on the L2 distance now...")
    img_paths_and_distances = zip(image_paths, distances)
    img_paths_and_distances = sorted(img_paths_and_distances,
                                     key=lambda x: x[1],
                                     reverse=True)
    paths, distances = [x[0] for x in img_paths_and_distances
                        ], [x[1] for x in img_paths_and_distances]

    #### Step 6: Save image paths and distances... ###
    data_split = data_split.lower()
    ranking_output_dict = {
        "img_paths": paths,
        "distances": distances,
    }
    ranking_output_filepath = (
        f"{output_dir}/ranking_{data_name}_{data_split}_{ranking_backend}.pkl")
    save_file(ranking_output_dict, ranking_output_filepath)
    logging.info("All Done!")
Ejemplo n.º 26
0
def extract_main(cfg: AttrDict,
                 dist_run_id: str,
                 local_rank: int = 0,
                 node_id: int = 0):
    """
    Sets up and executes feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. Starts from 0. Relevant for multi-node training.
    """

    # setup logging
    setup_logging(__name__)
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)

    # setup the multiprocessing start method (e.g. forkserver) as specified in the config.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg)

    # print the training settings and system settings
    local_rank, _ = get_machine_local_and_dist_rank()
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    output_dir = get_checkpoint_folder(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split in features.keys():
        logging.info(f"============== Split: {split} =======================")
        layers = features[split].keys()
        for layer in layers:
            out_feat_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_features.npy")
            out_target_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_targets.npy")
            out_inds_file = f"{output_dir}/rank{local_rank}_{split}_{layer}_inds.npy"
            logging.info("Saving extracted features: {} {} to: {}".format(
                layer, features[split][layer]["features"].shape,
                out_feat_file))
            save_file(features[split][layer]["features"], out_feat_file)
            logging.info("Saving extracted targets: {} to: {}".format(
                features[split][layer]["targets"].shape, out_target_file))
            save_file(features[split][layer]["targets"], out_target_file)
            logging.info("Saving extracted indices: {} to: {}".format(
                features[split][layer]["inds"].shape, out_inds_file))
            save_file(features[split][layer]["inds"], out_inds_file)
    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
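Assuming save_file writes standard NumPy arrays for the .npy paths above (an assumption, since its implementation is not shown here), the per-rank outputs can be read back with plain NumPy for downstream evaluation:

import numpy as np

# Hypothetical paths following the naming scheme used in extract_main
output_dir = "/path/to/checkpoint_folder"
split, layer, rank = "train", "heads", 0

features = np.load(f"{output_dir}/rank{rank}_{split}_{layer}_features.npy")
targets = np.load(f"{output_dir}/rank{rank}_{split}_{layer}_targets.npy")
inds = np.load(f"{output_dir}/rank{rank}_{split}_{layer}_inds.npy")
print(features.shape, targets.shape, inds.shape)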
Ejemplo n.º 27
0
def assert_hydra_conf(cfg):
    """
    Infer the values of a few config parameters from the values of other config parameters:
    1. Inferring losses
    2. Auto scale learning rate if user has specified auto scaling to be True.
    3. Infer meter names (model layer name being evaluated) since we support list meters
       that have multiple output and same target. This is very common in self-supervised
       learning where we want to evaluate metric for several layers of the models. VISSL
       supports running evaluation for multiple model layers in a single training run.
    4. Support multi-gpu DDP eval model by attaching a dummy parameter. This is particularly
       helpful for the multi-gpu feature extraction especially when the dataset is large for
       which features are being extracted.
    5. Infer what kind of labels are being used. If user has specified a labels source, we set
       LABEL_TYPE to "standard" (also vissl default), otherwise if no label is specified, we
       set the LABEL_TYPE to "sample_index".
    """
    cfg = infer_losses_config(cfg)
    cfg = infer_learning_rate(cfg)

    # in case of linear evaluation, we often evaluate several layers at a time. For each
    # layer, there's a separate accuracy meter. In such case, we want to output the layer
    # name in the meters output to make it easy to interpret the results. This is
    # currently only supported for cases where we have linear evaluation.
    if cfg.METERS is not None:
        from vissl.models import is_feature_extractor_model

        meter_name = cfg.METERS.get("name", "")
        valid_meters = ["accuracy_list_meter", "mean_ap_list_meter"]
        if meter_name:
            if meter_name in valid_meters and is_feature_extractor_model(
                    cfg.MODEL):
                cfg.METERS[meter_name]["num_meters"] = len(
                    cfg.MODEL.FEATURE_EVAL_SETTINGS.
                    LINEAR_EVAL_FEAT_POOL_OPS_MAP)
                cfg.METERS[meter_name]["meter_names"] = [
                    item[0] for item in cfg.MODEL.FEATURE_EVAL_SETTINGS.
                    LINEAR_EVAL_FEAT_POOL_OPS_MAP
                ]

    # in case of feature evaluation mode, we freeze the trunk. The Feature evaluation mode
    # is used for the feature extraction of trunk as well. VISSL supports distributed feature
    # extraction to speed up the extraction time. Since the model needs to be DDP for the
    # distributed extraction, we need some dummy parameters in the model otherwise model
    # can't be converted to DDP. So we attach some dummy head to the model.
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    if (cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON
            and cfg.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_ONLY
            and cfg.MODEL.FEATURE_EVAL_SETTINGS.EXTRACT_TRUNK_FEATURES_ONLY
            and world_size > 1 and len(cfg.MODEL.HEAD.PARAMS) == 0):
        cfg.MODEL.HEAD.PARAMS = [["mlp", {"dims": [2048, 1000]}]]

    # in SSL, during pre-training we don't want to use annotated labels or during feature
    # extraction, we don't have annotated labels for some datasets. In such cases, we set
    # the label type to be just the image index in the dataset, unless the
    # user has specifically provided "zero" as the label type, which is
    # necessary when the CutMixUp collator is being used for self-supervised
    # training.
    if len(cfg.DATA.TRAIN.LABEL_SOURCES
           ) == 0 and cfg.DATA.TRAIN.LABEL_TYPE != "zero":
        cfg.DATA.TRAIN.LABEL_TYPE = "sample_index"
    if len(cfg.DATA.TEST.LABEL_SOURCES
           ) == 0 and cfg.DATA.TEST.LABEL_TYPE != "zero":
        cfg.DATA.TEST.LABEL_TYPE = "sample_index"

    # if the user has specified the model initialization from a params_file, we check if
    # the params_file is a url. If it is, we download the file to a local cache directory
    # and use that instead
    from vissl.utils.checkpoint import get_checkpoint_folder
    from vissl.utils.io import cache_url, is_url

    if is_url(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        checkpoint_dir = get_checkpoint_folder(cfg)
        cache_dir = f"{checkpoint_dir}/params_file_cache/"
        cached_url_path = cache_url(url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE,
                                    cache_dir=cache_dir)
        cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cached_url_path

    # if we use a zero optimizer, we nest the optimizer related settings under the
    # base_optimizer.
    if cfg.OPTIMIZER.use_zero:
        cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
        cfg.OPTIMIZER.name = "zero"
        del cfg.OPTIMIZER.base_optimizer["param_schedulers"]
        del cfg.OPTIMIZER.base_optimizer["regularize_bn"]
        del cfg.OPTIMIZER.base_optimizer["regularize_bias"]
        del cfg.OPTIMIZER.base_optimizer["num_epochs"]
        del cfg.OPTIMIZER.base_optimizer["use_zero"]
        del cfg.OPTIMIZER.base_optimizer["head_optimizer_params"]
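A toy illustration of the ZeRO nesting performed in the last block, using a plain dictionary in place of the AttrDict config (illustrative only):

optimizer = {
    "name": "sgd",
    "use_zero": True,
    "num_epochs": 100,
    "regularize_bn": False,
    "regularize_bias": True,
    "param_schedulers": {},
    "head_optimizer_params": {},
}

# Nest the original settings under "base_optimizer", mark the wrapper as "zero",
# and drop the keys that only make sense at the top level.
optimizer["base_optimizer"] = dict(optimizer)
optimizer["name"] = "zero"
for key in ("param_schedulers", "regularize_bn", "regularize_bias",
            "num_epochs", "use_zero", "head_optimizer_params"):
    del optimizer["base_optimizer"][key]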
Ejemplo n.º 28
0
def run_knn_at_layer_low_memory(cfg: AttrDict, layer_name: str = "heads"):
    """
    Alternate implementation of kNN which scales to bigger features
    and bigger "train" splits
    """
    if cfg.NEAREST_NEIGHBOR.USE_CUDA:
        logging.warning(
            "config.NEAREST_NEIGHBOR.USE_CUDA is not available when "
            "config.NEAREST_NEIGHBOR.OPTIMIZE_MEMORY is set to True, "
            "using CPU instead"
        )

    temperature = cfg.NEAREST_NEIGHBOR.SIGMA
    num_neighbors = cfg.NEAREST_NEIGHBOR.TOPK
    feature_dir = cfg.NEAREST_NEIGHBOR.FEATURES.PATH
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Testing with sigma: {temperature}, topk neighbors: {num_neighbors}")

    # Step 1: get the test features (the train features might not fit in memory)
    test_out = ExtractedFeaturesLoader.load_features(
        feature_dir, "test", layer_name, flatten_features=True
    )
    test_features, test_labels = test_out["features"], test_out["targets"]
    test_features = torch.from_numpy(test_features).float()
    test_feature_num = test_features.shape[0]

    # Step 2: normalize the features if needed
    if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
        test_features = nn.functional.normalize(test_features, dim=1, p=2)

    # Step 3: collect the similarity score of each test feature
    # to all the train features, making sure:
    # - never to load all the train features at once to avoid OOM
    # - to keep just the 'num_neighbors' best similarity scores
    shard_paths = ExtractedFeaturesLoader.get_shard_file_names(
        input_dir=feature_dir, split="train", layer=layer_name
    )
    similarity_queue = MaxSimilarityPriorityQueue(max_size=num_neighbors)
    num_classes = 0
    for shard_path in shard_paths:
        shard_content = ExtractedFeaturesLoader.load_feature_shard(shard_path)
        train_features = torch.from_numpy(shard_content.features)
        train_features = train_features.float().reshape((train_features.shape[0], -1))
        if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
            train_features = nn.functional.normalize(train_features, dim=1, p=2)
        train_features = train_features.t()

        train_labels = torch.LongTensor(shard_content.targets).squeeze(-1)
        num_classes = max(num_classes, train_labels.max().item() + 1)
        similarities = torch.mm(test_features, train_features)
        if similarities.shape[0] > num_neighbors:
            distances, indices = similarities.topk(
                num_neighbors, largest=True, sorted=True
            )
        else:
            distances, indices = torch.sort(similarities, descending=True)
        closest_labels = train_labels[indices]
        similarity_queue.push_all(distances, closest_labels)

    # Step 4: collect the samples with the closest similarities
    # for each test sample, and assemble it in a matrix with
    # shape (num_test_samples, num_neighbors)
    topk_distances, topk_labels = similarity_queue.pop_all()

    # Step 5: go through each of the test samples, batch by batch,
    # to compute the label of each test sample based on the top k
    # nearest neighbors and their corresponding labels
    accuracies = Accuracies()
    output_targets, output_predicted_label, output_inds = [], [], []

    batch_size = 100
    num_test_images = test_feature_num
    for idx in range(0, num_test_images, batch_size):
        min_idx = idx
        max_idx = min(idx + batch_size, num_test_images)

        distances = topk_distances[min_idx:max_idx, ...]
        retrieved_neighbors = topk_labels[min_idx:max_idx, ...]
        targets = torch.LongTensor(test_labels[min_idx:max_idx])
        # use the actual chunk size: the last chunk may be smaller than batch_size
        current_batch_size = targets.shape[0]

        retrieval_one_hot = torch.zeros(current_batch_size * num_neighbors, num_classes)
        retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1)
        predictions = _get_sorted_predictions(
            current_batch_size, num_classes, distances, retrieval_one_hot, temperature
        )

        # find the predictions that match the target
        accuracies = accuracies + Accuracies.from_batch(predictions, targets)

        # get the predictions, nearest neighbors, inds to save
        output_inds.extend(range(min_idx, max_idx))
        output_predicted_label.append(predictions.data.cpu().numpy())
        output_targets.append(targets.data.cpu().numpy())

    _save_knn_results(
        output_dir, layer_name, output_inds, output_predicted_label, output_targets
    )
    accuracies.log(layer_name)
    return accuracies.top_1, accuracies.top_5, accuracies.total
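MaxSimilarityPriorityQueue is not shown in this listing. The core idea it is assumed to implement, keeping only the num_neighbors best similarity scores per test sample while streaming over train shards, can be sketched with plain torch.topk (hypothetical and simplified to scores only; the real helper also tracks the matching labels):

import torch

def merge_topk(running_scores, new_scores, k):
    # running_scores: (num_test, <=k) best similarities seen so far, or None
    # new_scores:     (num_test, m) similarities against the current train shard
    if running_scores is None:
        merged = new_scores
    else:
        merged = torch.cat([running_scores, new_scores], dim=1)
    topk = min(k, merged.shape[1])
    best, _ = merged.topk(topk, dim=1, largest=True, sorted=True)
    return best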
Ejemplo n.º 29
0
def nearest_neighbor_test(cfg: AttrDict, layer_name: str = "heads"):
    temperature = cfg.NEAREST_NEIGHBOR.SIGMA
    num_neighbors = cfg.NEAREST_NEIGHBOR.TOPK
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Testing with sigma: {temperature}, topk neighbors: {num_neighbors}")

    ############################################################################
    # Step 1: get train and test features
    train_out = merge_features(output_dir, "train", layer_name, cfg)
    train_features, train_labels = train_out["features"], train_out["targets"]
    # put train features and labels on gpu and transpose train features
    train_features = torch.from_numpy(train_features).float().cuda().t()
    train_labels = torch.LongTensor(train_labels).cuda()
    num_classes = train_labels.max() + 1
    if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
        train_features = nn.functional.normalize(train_features, dim=0, p=2)

    test_out = merge_features(output_dir, "test", layer_name, cfg)
    test_features, test_labels = test_out["features"], test_out["targets"]

    ###########################################################################
    # Step 2: calculate the nearest neighbor and the metrics
    top1, top5, total = 0.0, 0.0, 0
    num_test_images, num_chunks = test_labels.shape[0], 100
    imgs_per_chunk = num_test_images // num_chunks
    with torch.no_grad():
        retrieval_one_hot = torch.zeros(num_neighbors, num_classes).cuda()
        for idx in range(0, num_test_images, imgs_per_chunk):
            # get the features for test images and normalize the features if needed
            features = test_features[
                idx : min((idx + imgs_per_chunk), num_test_images), :
            ]
            targets = test_labels[idx : min((idx + imgs_per_chunk), num_test_images), :]
            batch_size = targets.shape[0]
            features = torch.from_numpy(features).float().cuda()
            targets = torch.LongTensor(targets).cuda()
            if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
                features = nn.functional.normalize(features, dim=1, p=2)

            # calculate the dot product and compute top-k neighbors
            similarity = torch.mm(features, train_features)
            distances, indices = similarity.topk(
                num_neighbors, largest=True, sorted=True
            )
            candidates = train_labels.view(1, -1).expand(batch_size, -1)
            retrieved_neighbors = torch.gather(candidates, 1, indices)

            retrieval_one_hot.resize_(batch_size * num_neighbors, num_classes).zero_()
            retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1)
            distances_transform = distances.clone().div_(temperature).exp_()
            probs = torch.sum(
                torch.mul(
                    retrieval_one_hot.view(batch_size, -1, num_classes),
                    distances_transform.view(batch_size, -1, 1),
                ),
                1,
            )
            _, predictions = probs.sort(1, True)

            # find the predictions that match the target
            correct = predictions.eq(targets.data.view(-1, 1))
            top1 = top1 + correct.narrow(1, 0, 1).sum().item()
            # guard against num_neighbors < 5 when accumulating top-5 correctness
            top5 = top5 + correct.narrow(1, 0, min(5, num_neighbors)).sum().item()
            total += targets.size(0)
    top1 = top1 * 100.0 / total
    top5 = top5 * 100.0 / total
    logging.info(f"Total images: {total}, Top1: {top1}, Top5: {top5}")
    return top1, top5
Ejemplo n.º 30
0
def geolocalization_test(cfg: AttrDict, layer_name: str = "heads", topk: int = 1):
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Output dir: {output_dir} ...")

    ############################################################################
    # Step 1: Load the mapping file and partition it
    # Also load the test images and targets (latitude/longitude)
    # lastly, load the model predictions
    logging.info(
        f"Loading the label partitioning file: {cfg.GEO_LOCALIZATION.TRAIN_LABEL_MAPPING}"
    )
    partitioning = Partitioning(cfg.GEO_LOCALIZATION.TRAIN_LABEL_MAPPING)

    data_files, label_files = get_data_files("TEST", cfg.DATA)
    test_image_paths = load_file(data_files[0])
    target_lat_long = load_file(label_files[0])
    logging.info(
        f"Loaded val image paths: {test_image_paths.shape}, "
        f"ground truth latitude/longitude: {target_lat_long.shape}"
    )

    prediction_image_indices_filepath = f"{output_dir}/rank0_test_{layer_name}_inds.npy"
    predictions_filepath = f"{output_dir}/rank0_test_{layer_name}_predictions.npy"
    predictions = load_file(predictions_filepath)
    predictions_inds = load_file(prediction_image_indices_filepath)
    logging.info(
        f"Loaded predictions: {predictions.shape}, inds: {predictions_inds.shape}"
    )

    ############################################################################
    # Step 2: Convert the predicted classes to latitude/longitude and compute
    # accuracy at different km thresholds.
    gt_latitudes, gt_longitudes, predicted_lats, predicted_longs = [], [], [], []
    output_metadata = {}
    num_images = len(test_image_paths)
    num_images = min(num_images, len(predictions))
    for idx in range(num_images):
        img_index = predictions_inds[idx]
        inp_img_path = test_image_paths[img_index]
        gt_latitude = float(target_lat_long[img_index][0])
        gt_longitude = float(target_lat_long[img_index][1])
        pred_cls = int(predictions[idx][:topk])
        pred_lat, pred_long = partitioning.get_lat_lng(pred_cls)
        output_metadata[inp_img_path] = {
            "target_lat": gt_latitude,
            "target_long": gt_longitude,
            "pred_lat": pred_lat,
            "pred_long": pred_long,
            "pred_cls": pred_cls,
        }
        gt_latitudes.append(gt_latitude)
        gt_longitudes.append(gt_longitude)
        predicted_lats.append(pred_lat)
        predicted_longs.append(pred_long)

    predicted_lats = torch.tensor(predicted_lats, dtype=torch.float)
    predicted_longs = torch.tensor(predicted_longs, dtype=torch.float)
    gt_latitudes = torch.tensor(gt_latitudes, dtype=torch.float)
    gt_longitudes = torch.tensor(gt_longitudes, dtype=torch.float)
    distances = vectorized_gc_distance(
        predicted_lats,
        predicted_longs,
        gt_latitudes,
        gt_longitudes,
    )

    # accuracy for all distances (in km)
    acc_dict = gcd_threshold_eval(
        distances, thresholds=cfg.GEO_LOCALIZATION.ACC_KM_THRESHOLDS
    )
    gcd_dict = {}
    for gcd_thres, acc in acc_dict.items():
        gcd_dict[f"{gcd_thres}"] = round(acc * 100.0, 4)
    logging.info(f"acc dist in percentage: {gcd_dict}")
    save_file(
        output_metadata,
        f"{output_dir}/output_metadata_predictions.json",
        append_to_json=False,
    )
    save_file(
        gcd_dict,
        f"{output_dir}/metrics.json",
        append_to_json=False,
    )
    return output_metadata, acc_dict
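vectorized_gc_distance is defined elsewhere in VISSL; a minimal sketch of a vectorized great-circle (haversine) distance in kilometers, which is assumed to be what it computes given the km thresholds used above:

import torch

def haversine_km(lat1, lon1, lat2, lon2, earth_radius_km=6371.0):
    # All inputs are float tensors of equal shape, in degrees.
    lat1, lon1, lat2, lon2 = map(torch.deg2rad, (lat1, lon1, lat2, lon2))
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = (torch.sin(dlat / 2) ** 2
         + torch.cos(lat1) * torch.cos(lat2) * torch.sin(dlon / 2) ** 2)
    return 2 * earth_radius_km * torch.asin(torch.sqrt(a))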