def main(args: Namespace, config: AttrDict):
    """
    Extract features if needed, train one SVM per layer and log the AP per layer.

    Args:
        args (Namespace): command line arguments; only `args.node_id` is read here
        config (AttrDict): run configuration. `config.SVM_FEATURES_PATH` is set to
            the checkpoint folder when the features are extracted by this call.

    Raises:
        AssertionError: if `config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON` is falsy
    """
    # setup logging
    setup_logging(__name__, output_dir=get_checkpoint_folder(config))

    # print the configuration used
    print_cfg(config)

    assert config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON, (
        "Feature eval mode is not ON. Can't run train_svm. "
        "Set config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True "
        "in your config or from command line."
    )

    # extract the features only when no feature path was provided
    if not config.SVM_FEATURES_PATH:
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.SVM_FEATURES_PATH = get_checkpoint_folder(config)

    # Get the names of the features that we extracted features for. If user doesn't
    # specify the features to evaluate, we get the full model output and freeze
    # head/trunk both as caution.
    layers = get_trunk_output_feature_names(config.MODEL)
    if len(layers) == 0:
        layers = ["heads"]

    # train one SVM per layer, each in its own process, and wait for all of them
    output_dir = get_checkpoint_folder(config)
    running_tasks = [
        mp.Process(target=train_svm, args=(config, output_dir, layer))
        for layer in layers
    ]
    for running_task in running_tasks:
        running_task.start()
    for running_task in running_tasks:
        running_task.join()

    # collect the mAP stats for all the layers and report
    output_mAP = []
    for layer in layers:
        ap_file = f"{output_dir}/{layer}/test_ap.npy"
        try:
            output_mAP.append(round(100.0 * np.mean(load_file(ap_file)), 3))
        except Exception as e:
            # Best effort: a layer whose results cannot be read is reported as -1,
            # but the reason is logged so the failure is not silent.
            logging.warning(
                f"Could not read AP for layer {layer} from {ap_file}: {e}"
            )
            output_mAP.append(-1)
    logging.info(f"AP for various layers:\n {layers}: {output_mAP}")

    # close the logging streams including the filehandlers
    shutdown_logging()
def main(args: Namespace, config: AttrDict):
    """
    Run the nearest neighbor evaluation, extracting the features first when
    `config.NEAREST_NEIGHBOR.FEATURES.PATH` is empty.

    Args:
        args (Namespace): command line arguments; only `args.node_id` is read here
        config (AttrDict): run configuration
    """
    # logging first, then dump the configuration in use
    setup_logging(__name__)
    print_cfg(config)

    # environment variables for this process
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Without a path to already extracted features, extract them now and
    # point the config at the checkpoint folder.
    features_missing = not config.NEAREST_NEIGHBOR.FEATURES.PATH
    if features_missing:
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the filehandlers
    shutdown_logging()
def test_run(self, config_file_path: str):
    """
    Load the given config and run training with it; the test passes when
    no exception is raised.

    Arguments:
        config_file_path {str} -- path to the config for the task to be run
    """
    logger.info(f"Loading {config_file_path}")
    hydra_cfg = SSLHydraConfig.from_configs([config_file_path])
    _, config = convert_to_attrdict(hydra_cfg.default_cfg)
    checkpoint_folder = get_checkpoint_folder(config)

    # Complete the data localization at runtime
    test_data_dir = pkg_resources.resource_filename(__name__, "test_data")
    config.DATA.TRAIN.DATA_PATHS = [test_data_dir]

    # Destroy process groups as torch may be initialized with NCCL, which
    # is incompatible with test_cpu_regnet_moco.yaml
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()

    # run training and make sure no exception is raised
    train_kwargs = {
        "dist_run_id": get_dist_run_id(config, config.DISTRIBUTED.NUM_NODES),
        "checkpoint_path": None,
        "checkpoint_folder": checkpoint_folder,
        "local_rank": 0,
        "node_id": 0,
        "hook_generator": default_hook_generator,
    }
    train_main(config, **train_kwargs)
def test_run(self, config_file_path: str):
    """
    Load the given config and run training with it; the test passes when
    no exception is raised.

    Arguments:
        config_file_path {str} -- path to the config for the task to be run
    """
    logger.info(f"Loading {config_file_path}")
    hydra_cfg = SSLHydraConfig.from_configs([config_file_path])
    _, config = convert_to_attrdict(hydra_cfg.default_cfg)
    checkpoint_folder = get_checkpoint_folder(config)

    # Complete the data localization at runtime
    test_data_dir = pkg_resources.resource_filename(__name__, "test_data")
    config.DATA.TRAIN.DATA_PATHS = [test_data_dir]

    # run training and make sure no exception is raised
    train_kwargs = {
        "dist_run_id": get_dist_run_id(config, config.DISTRIBUTED.NUM_NODES),
        "checkpoint_path": None,
        "checkpoint_folder": checkpoint_folder,
        "local_rank": 0,
        "node_id": 0,
        "hook_generator": default_hook_generator,
    }
    train_main(config, **train_kwargs)
def main(args: Namespace, cfg: AttrDict):
    """
    Extract low-shot features and train a low-shot SVM for every evaluated layer.

    Args:
        args (Namespace): command line arguments, forwarded to the feature extraction
        cfg (AttrDict): run configuration

    Raises:
        AssertionError: if `cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON` is falsy
    """
    # logging, then dump the cfg
    setup_logging(__name__)
    print_cfg(cfg)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    output_dir = get_checkpoint_folder(cfg)

    eval_mode_on = cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON
    assert eval_mode_on, (
        "Feature eval mode is not ON. Can't run train_svm. "
        "Set config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True "
        "in your config or from command line."
    )
    extract_low_shot_features(args, cfg, output_dir)

    # Names of the layers features were extracted for. With no explicit
    # selection, fall back to the full model output ("heads").
    layers = get_trunk_output_feature_names(cfg.MODEL)
    if len(layers) == 0:
        layers = ["heads"]

    # train low shot svm for each layer, keyed by layer name
    output = {
        layer: train_svm_low_shot(cfg, output_dir, layer) for layer in layers
    }
    logging.info(f"Results: {output}")

    # close the logging streams including the filehandlers
    shutdown_logging()
def cluster_features_and_label(args: Namespace, cfg: AttrDict):
    """
    L2-normalize the extracted features, cluster them with faiss k-means and
    save both the clustering data and the hard cluster label of every sample
    in the checkpoint folder.

    Args:
        args (Namespace): command line arguments (not read by this function)
        cfg (AttrDict): run configuration; the `CLUSTERFIT` section is read
    """
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    clusterfit_cfg = cfg.CLUSTERFIT
    cluster_backend = clusterfit_cfg.CLUSTER_BACKEND
    num_clusters = clusterfit_cfg.NUM_CLUSTERS
    data_split = clusterfit_cfg.FEATURES.DATA_PARTITION
    data_name = clusterfit_cfg.FEATURES.DATASET_NAME
    n_iter = clusterfit_cfg.N_ITER
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    # the image paths returned alongside the features are not needed here
    feature_data, _ = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    # features are of shape num_samples x feature_dim
    features = feature_data["features"]
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Clustering Features: {features.shape}")

    ########### Step 3: L2 normalize features ###################
    # TODO: we could support PCA here if needed in future.
    logging.info("L2 normalizing the features now...")
    # the small constant keeps the division defined for all-zero rows
    row_norms = np.linalg.norm(features, axis=1, keepdims=True) + 1e-5
    features = features / row_norms

    ########### Step 4: Cluster the features ###################
    logging.info("Clustering the features now...")
    assert cluster_backend == "faiss", "Only faiss clustering is supported currently"
    feature_dim = features.shape[1]
    kmeans = faiss.Kmeans(feature_dim, num_clusters, niter=n_iter, verbose=True)
    kmeans.train(features)
    centroids = kmeans.centroids

    ########### Step 5: Get the cluster assignment for the features ############
    logging.info("Getting cluster label assignment now...")
    distances, hard_cluster_labels = kmeans.index.search(features, 1)

    #### Step 6: Save clustering data and hard cluster labels for the images ###
    data_split = data_split.lower()
    cluster_output_filepath = (
        f"{output_dir}/{data_name}_{data_split}_N{num_clusters}_{cluster_backend}.pkl"
    )
    hard_labels_output_filepath = (
        f"{output_dir}/"
        f"{data_name}_{data_split}_N{num_clusters}_{cluster_backend}_lbls.npy"
    )
    clustering_output_dict = {
        "hard_labels": hard_cluster_labels,
        "centroids": centroids,
        "distances": distances,
    }
    # flat int64 vector with one label per sample
    flat_labels = np.array(hard_cluster_labels.tolist(), dtype=np.int64)
    out_hard_labels = flat_labels.reshape(-1)
    save_file(clustering_output_dict, cluster_output_filepath)
    save_file(out_hard_labels, hard_labels_output_filepath)
    logging.info("All Done!")
def main(args: Namespace, cfg: AttrDict):
    """
    Optionally extract the features on this node, then cluster them.

    Args:
        args (Namespace): command line arguments; only `args.node_id` is read here
        cfg (AttrDict): run configuration. When extraction is enabled,
            `cfg.CHECKPOINT.DIR` and `cfg.CHECKPOINT.APPEND_DISTR_RUN_ID`
            are overwritten.

    Raises:
        AssertionError: if extraction is enabled with more than one node
    """
    setup_logging(__name__, output_dir=get_checkpoint_folder(cfg))

    if not cfg.CLUSTERFIT.FEATURES.EXTRACT:
        # No extraction: the path manager still has to be set up
        # (set_env_vars does it on the extraction path).
        setup_path_manager()
    else:
        # We cannot have automatic extraction with more than 1 node or otherwise
        # we would have to run this script on several nodes and thus have several
        # parallel clustering of the features. The automatic extraction is only
        # there as a shortcut when running on a single node
        single_node = cfg.DISTRIBUTED.NUM_NODES == 1
        assert single_node, "Automatic extraction can only work with 1 node"

        # Make sure to dump the features at the desired path
        cfg.CHECKPOINT.DIR = cfg.CLUSTERFIT.FEATURES.PATH
        cfg.CHECKPOINT.APPEND_DISTR_RUN_ID = False

        # Run the extraction of features
        set_env_vars(local_rank=0, node_id=0, cfg=cfg)
        logging.info("Setting seed....")
        set_seeds(cfg, args.node_id)
        launch_distributed(
            cfg,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    cluster_features(cfg)
    shutdown_logging()
def extract_clusters(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Set up one worker process, extract the cluster assignments through the
    trainer and, on distributed rank 0, save them in the checkpoint folder.

    Args:
        cfg (AttrDict): run configuration
        dist_run_id (str): run id handed to the trainer
        checkpoint_folder (str): directory used for the log output
        local_rank (int): rank of this process on the node
        node_id (int): id of the node
    """
    # environment variables must be set before RANK can be read
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # print the environment info for the current node
    if local_rank == 0:
        print_system_env_info(os.environ.copy())

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    use_gpu = cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available()
    if use_gpu:
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Build the SSL trainer to set up distributed training and then
    # extract the cluster assignments for all entries in the dataset
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    cluster_assignments = trainer.extract_clusters()

    # Save the cluster assignments in the output folder (rank 0 only)
    if dist_rank == 0:
        assignments = ClusterAssignment(
            config=cfg, cluster_assignments=cluster_assignments
        )
        ClusterAssignmentLoader.save_cluster_assignment(
            output_dir=get_checkpoint_folder(cfg),
            assignments=assignments,
        )

    # close the logging streams including the file handlers
    logging.info("All Done!")
    shutdown_logging()
def _save_label_cls_idx_map(self, cls_idx_map: Dict[str, int], split: str):
    """
    Write the label -> class index mapping of a split as JSON in the
    checkpoint folder. Only distributed rank 0 writes, and an existing
    file is left untouched.

    Args:
        cls_idx_map (Dict[str, int]): mapping to save
        split (str): split name; lowercased to build the file name
    """
    _, dist_rank = get_machine_local_and_dist_rank()
    if dist_rank != 0:
        return

    checkpoint_folder = get_checkpoint_folder(self.cfg)
    class_idx_file_path = (
        f"{checkpoint_folder}/{split.lower()}_label_to_index_map.json"
    )
    if g_pathmgr.exists(class_idx_file_path):
        return
    save_file(cls_idx_map, class_idx_file_path, append_to_json=False)
def main(args: Namespace, config: AttrDict):
    """
    Validate the config and run the instance retrieval test.

    Args:
        args (Namespace): command line arguments, forwarded to the test
        config (AttrDict): run configuration
    """
    config = validate_and_infer_config(config)

    # environment variables for this process
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # log into the checkpoint folder
    setup_logging(__name__, output_dir=get_checkpoint_folder(config))

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)

    # close the logging streams including the filehandlers
    shutdown_logging()
def get_tensorboard_dir(cfg):
    """
    Build, create and return the `tb_logs` directory inside the checkpoint
    folder.

    Args:
        cfg (AttrDict): user specified config, used to resolve the
                        checkpoint folder

    Returns:
        tensorboard_dir (str): output directory path
    """
    tensorboard_dir = f"{get_checkpoint_folder(cfg)}/tb_logs"
    logging.info(f"Tensorboard dir: {tensorboard_dir}")
    makedir(tensorboard_dir)
    return tensorboard_dir
def get_data_features_and_images(cfg: AttrDict):
    """
    Merge the extracted features of the configured split and return them
    together with the image paths of the matching dataset.

    Args:
        cfg (AttrDict): run configuration; `cfg.RANKING.FEATURES` provides
            the data partition and the layer name

    Returns:
        tuple: (feature_data, image_paths) where `feature_data` is the result
        of `merge_features` and `image_paths` is the first (and only) entry
        of the dataset's nested image path list

    Raises:
        AssertionError: if the dataset does not return a list with exactly
            one entry
    """
    output_dir = get_checkpoint_folder(cfg)
    split = cfg.RANKING.FEATURES.DATA_PARTITION

    logging.info("Merging features...")
    # merge the features across all nodes/gpus into one
    feature_data = merge_features(
        output_dir, split.lower(), cfg.RANKING.FEATURES.LAYER_NAME
    )

    logging.info("Getting the image paths...")
    # get the list of image Ids
    dataset = build_dataset(cfg=cfg, split=split)
    feature_image_paths = dataset.get_image_paths()

    # due to multi-modality, we get image_paths as a nested list, one for each
    # dataset. Check it's a list and extract images.
    # isinstance (rather than an exact type comparison) also accepts list subclasses.
    assert isinstance(feature_image_paths, list), "Image paths must be a list"
    assert len(feature_image_paths) == 1, "Multi-modality not supported yet!"
    return feature_data, feature_image_paths[0]
def main(args: Namespace, config: AttrDict, node_id=0):
    """
    Run the instance retrieval test, extracting the database and query
    features first when the extraction engine is enabled and no feature
    directory is configured.

    Args:
        args (Namespace): command line arguments; `args.node_id` is passed to the
            feature extraction and `args` is forwarded to the retrieval test
        config (AttrDict): run configuration
        node_id (int): node id used to set the environment variables
    """
    config = validate_and_infer_config(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=node_id, cfg=config)

    # setup the logging. Environment values are strings, so RANK is converted
    # to an int before being passed as the rank.
    checkpoint_folder = get_checkpoint_folder(config)
    dist_rank = int(os.environ["RANK"])
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    if (
        config.IMG_RETRIEVAL.USE_FEATURE_EXTRACTION_ENGINE
        and not config.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR
    ):
        # extract the train/database features.
        config = adapt_train_database_extract_config(config, checkpoint_folder)

        logging.info("Beginning extract features for database set.")
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

        # extract the query features.
        config = adapt_query_extract_config(config, checkpoint_folder)

        logging.info("Beginning extract features for query set.")
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)
    logging.info(f"Performance time breakdow:\n{PERF_STATS.report_str()}")

    # close the logging streams including the filehandlers
    shutdown_logging()
def extract_features_and_run_knn(node_id: int, config: AttrDict):
    """
    Run the nearest neighbor evaluation, extracting the features first when
    `config.NEAREST_NEIGHBOR.FEATURES.PATH` is empty.

    Args:
        node_id (int): node id passed to the feature extraction
        config (AttrDict): run configuration
    """
    setup_logging(__name__)
    print_cfg(config)
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Without a path to already extracted features, extract them now and
    # point the config at the checkpoint folder.
    features_missing = not config.NEAREST_NEIGHBOR.FEATURES.PATH
    if features_missing:
        launch_distributed(
            config,
            node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # Run KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the file handlers
    shutdown_logging()
def _fit_queue_to_global_batch(cfg, queue_cfg):
    """
    Trim `queue_cfg.queue_length` down to a multiple of the global batch size
    and store the per-replica share in `queue_cfg.local_queue_length`.
    """
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    global_batch_size = cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA * world_size
    queue_length = queue_cfg.queue_length
    queue_length -= queue_length % global_batch_size
    queue_cfg.queue_length = queue_length
    queue_cfg.local_queue_length = queue_length // world_size


def infer_losses_config(cfg):
    """
    Infer settings for various self-supervised losses. Takes care of setting
    various loss parameters correctly like world size, batch size per gpu,
    effective global batch size, collator etc.
    Each loss has additional set of parameters that can be inferred to ensure
    smooth training in case user forgets to adjust all the parameters.

    Args:
        cfg (AttrDict): run configuration; modified in place

    Returns:
        cfg: the same configuration object

    Raises:
        AssertionError: for the swav losses, if the model head is not a single
            "swav_head"; for "swav_loss" also if the collator is not one of
            the accepted ones
    """
    # some inference for the Info-NCE loss. This is a substring check, so it
    # also matches "multicrop_simclr_info_nce_loss".
    if "simclr_info_nce_loss" in cfg.LOSS.name:
        cfg.LOSS[cfg.LOSS.name]["buffer_params"]["world_size"] = (
            cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
        )

        world_size = cfg.LOSS[cfg.LOSS.name]["buffer_params"]["world_size"]
        batch_size = cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA
        num_positives = 2  # simclr uses 2 copies per image
        cfg.LOSS[cfg.LOSS.name]["buffer_params"]["effective_batch_size"] = (
            num_positives * batch_size * world_size
        )

    # bce_logits_multiple_output_single_target
    if cfg.LOSS.name == "bce_logits_multiple_output_single_target":
        world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
        cfg.LOSS.bce_logits_multiple_output_single_target.world_size = world_size

    # multicrop version of simclr loss. The world size was already written by
    # the Info-NCE branch above; the effective batch size is overridden here.
    if cfg.LOSS.name == "multicrop_simclr_info_nce_loss":
        world_size = cfg.LOSS.multicrop_simclr_info_nce_loss.buffer_params.world_size
        batch_size = cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA
        total_num_crops = cfg.DATA.TRAIN.TRANSFORMS[0]["total_num_crops"]
        cfg.LOSS.multicrop_simclr_info_nce_loss.buffer_params.world_size = world_size
        cfg.LOSS.multicrop_simclr_info_nce_loss.buffer_params.effective_batch_size = (
            batch_size * world_size
        )
        cfg.LOSS.multicrop_simclr_info_nce_loss.num_crops = total_num_crops
        cfg.DATA.TRAIN.COLLATE_FUNCTION = "multicrop_collator"

    # some inference for the DeepCluster-v2 loss.
    if cfg.LOSS.name == "deepclusterv2_loss":
        cfg.LOSS.deepclusterv2_loss.DROP_LAST = cfg.DATA.TRAIN.DROP_LAST
        cfg.LOSS.deepclusterv2_loss.BATCHSIZE_PER_REPLICA = (
            cfg.DATA.TRAIN.BATCHSIZE_PER_REPLICA
        )
        cfg.LOSS.deepclusterv2_loss.num_crops = cfg.DATA.TRAIN.TRANSFORMS[0][
            "total_num_crops"
        ]
        cfg.DATA.TRAIN.COLLATE_FUNCTION = "multicrop_collator"

    # some inference for the SwAV loss.
    if cfg.LOSS.name == "swav_loss":
        assert len(cfg.MODEL.HEAD.PARAMS) == 1
        assert cfg.MODEL.HEAD.PARAMS[0][0] == "swav_head"
        swav_collators = [
            "multicrop_collator",
            "multicrop_mixup_collator",
            "cutmixup_collator",
        ]
        # the message is built from the accepted list so the two cannot drift apart
        assert cfg.DATA.TRAIN.COLLATE_FUNCTION in swav_collators, (
            "for swav loss, use either a collator from "
            f"[{', '.join(swav_collators)}]"
        )
        head_params = cfg.MODEL.HEAD.PARAMS[0][1]
        cfg.LOSS.swav_loss.num_prototypes = head_params["num_clusters"]
        cfg.LOSS.swav_loss.embedding_dim = head_params["dims"][-1]
        cfg.LOSS.swav_loss.num_crops = cfg.DATA.TRAIN.TRANSFORMS[0][
            "total_num_crops"
        ]
        from vissl.utils.checkpoint import get_checkpoint_folder

        cfg.LOSS.swav_loss.output_dir = get_checkpoint_folder(cfg)
        _fit_queue_to_global_batch(cfg, cfg.LOSS.swav_loss.queue)

    # some inference for the SwAV momentum loss.
    if cfg.LOSS.name == "swav_momentum_loss":
        assert len(cfg.MODEL.HEAD.PARAMS) == 1
        assert cfg.MODEL.HEAD.PARAMS[0][0] == "swav_head"
        head_params = cfg.MODEL.HEAD.PARAMS[0][1]
        cfg.LOSS.swav_momentum_loss.num_prototypes = head_params["num_clusters"]
        cfg.LOSS.swav_momentum_loss.embedding_dim = head_params["dims"][-1]
        cfg.LOSS.swav_momentum_loss.num_crops = cfg.DATA.TRAIN.TRANSFORMS[0][
            "total_num_crops"
        ]
        cfg.DATA.TRAIN.COLLATE_FUNCTION = "multicrop_collator"
        _fit_queue_to_global_batch(cfg, cfg.LOSS.swav_momentum_loss.queue)
    return cfg
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    Assemble the list of hooks for a run from the user configuration.

    Some hooks are always added (data sampler epoch, parameter freezing,
    loss/checkpoint logging, loss/lr/eta logging). Optional hooks are added
    only when the matching config option is enabled:
        - performance stats, GPU stats and memory summary hooks
        - loss specific hooks, selected from `cfg.LOSS`
        - model complexity hook, enabled via
          HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True
        - Tensorboard hook (requires tensorboard to be installed)
        - gradient clipping, CUDA synchronize and profiling hooks
        - model output mask and NaN checking hooks

    Returns:
        hooks (List(functions)): list containing the hook functions that will be used
    """
    hooks = []
    hook_cfg = cfg.HOOKS

    # conditionally add hooks based on use-case
    if hook_cfg.PERF_STATS.MONITOR_PERF_STATS:
        # a non-positive frequency means "no frequency"
        perf_stat_freq = None
        if hook_cfg.PERF_STATS.PERF_STAT_FREQUENCY > 0:
            perf_stat_freq = hook_cfg.PERF_STATS.PERF_STAT_FREQUENCY
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))

    # add the loss hooks based on the loss being used
    hooks = add_loss_hooks(hooks, cfg.LOSS, cfg)

    if hook_cfg.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.append(SSLModelComplexityHook())
    if hook_cfg.LOG_GPU_STATS:
        hooks.append(LogGpuStatsHook())
    if hook_cfg.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        hooks.append(LogGpuMemoryHook(hook_cfg.MEMORY_SUMMARY.LOG_ITERATION_NUM))
    if hook_cfg.MEMORY_SUMMARY.DUMP_MEMORY_ON_EXCEPTION:
        hooks.append(DumpMemoryOnException())
    if hook_cfg.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install tensorboard using:"
            "If pip environment: `pip install tensorboard` "
            "If using conda and you prefer conda install of tensorboard: "
            "`conda install -c conda-forge tensorboard`"
        )
        hooks.append(get_tensorboard_hook(cfg))
    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        grad_clip_hook = GradClipHook(
            norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE,
            max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM,
        )
        hooks.append(grad_clip_hook)

    # hooks that are used irrespective of workflow type
    rolling_btime_freq = None
    if hook_cfg.PERF_STATS.ROLLING_BTIME_FREQ > 0:
        rolling_btime_freq = hook_cfg.PERF_STATS.ROLLING_BTIME_FREQ

    if CudaSynchronizeHook.is_enabled(cfg.MODEL):
        hooks.append(CudaSynchronizeHook())

    if ProfilingHook.is_enabled(cfg.PROFILING):
        hooks.append(ProfilingHook(profiling_config=cfg.PROFILING))

    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    checkpoint_folder = get_checkpoint_folder(cfg)

    hooks.append(SetDataSamplerEpochHook())
    hooks.append(FreezeParametersHook())
    hooks.append(LogLossMetricsCheckpointHook(world_size))
    hooks.append(LogLossLrEtaHook(checkpoint_folder, rolling_btime_freq))

    if cfg.METERS.model_output_mask:
        hooks.append(ModelOutputMaskHook())

    if hook_cfg.CHECK_NAN:
        hooks.append(CheckNanLossHook())
        hooks.append(CheckNanModelOutputHook(world_size))

    return hooks
def infer_and_assert_hydra_config(cfg):
    """
    Fill in config values that can be derived from other config values.
    The config is modified in place.

    1. Loss settings and the learning rate are inferred by dedicated helpers.
    2. The seed is copied to cfg["MODEL"]["_MODEL_INIT_SEED"].
    3. For list meters on a feature extractor model, the number of meters
       and their names are taken from the evaluated layers.
    4. A dummy mlp head is attached for multi-process trunk feature
       extraction when the model has no head.
    5. LABEL_TYPE is set to "sample_index" for splits without label sources,
       unless the user asked for "zero".
    6. A params file given as a url is cached locally.
    7. ShardedDDP, ZeRO optimizer and FSDP settings are inferred.
    """
    cfg = infer_losses_config(cfg)
    cfg = infer_learning_rate(cfg)

    # pass the seed to cfg["MODEL"] so that model init on different nodes can
    # use the same seed.
    # TODO (Min): once FSDP supports sync'ing weights from rank 0, we don't need
    #             this anymore.
    cfg["MODEL"]["_MODEL_INIT_SEED"] = cfg.SEED_VALUE

    # in case of linear evaluation, we often evaluate several layers at a time. For each
    # layer, there's a separate accuracy meter. In such case, we want to output the layer
    # name in the meters output to make it easy to interpret the results. This is
    # currently only supported for cases where we have linear evaluation.
    if cfg.METERS is not None:
        from vissl.models import is_feature_extractor_model

        meter_name = cfg.METERS.get("name", "")
        valid_meters = ["accuracy_list_meter", "mean_ap_list_meter"]
        if (
            meter_name
            and meter_name in valid_meters
            and is_feature_extractor_model(cfg.MODEL)
        ):
            pool_ops_map = cfg.MODEL.FEATURE_EVAL_SETTINGS.LINEAR_EVAL_FEAT_POOL_OPS_MAP
            cfg.METERS[meter_name]["num_meters"] = len(pool_ops_map)
            cfg.METERS[meter_name]["meter_names"] = [item[0] for item in pool_ops_map]

    # in case of feature evaluation mode, we freeze the trunk. The Feature evaluation mode
    # is used for the feature extraction of trunk as well. VISSL supports distributed feature
    # extraction to speed up the extraction time. Since the model needs to be DDP for the
    # distributed extraction, we need some dummy parameters in the model otherwise model
    # can't be converted to DDP. So we attach some dummy head to the model.
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    needs_dummy_head = (
        cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON
        and cfg.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_ONLY
        and cfg.MODEL.FEATURE_EVAL_SETTINGS.EXTRACT_TRUNK_FEATURES_ONLY
        and world_size > 1
        and len(cfg.MODEL.HEAD.PARAMS) == 0
    )
    if needs_dummy_head:
        cfg.MODEL.HEAD.PARAMS = [["mlp", {"dims": [2048, 1000]}]]

    # in SSL, during pre-training we don't want to use annotated labels or during feature
    # extraction, we don't have annotated labels for some datasets. In such cases, we set
    # the label type to be just the image index in the dataset, unless the
    # user has specifically provided "zero" as the label type, which is
    # necessary when the CutMixUp collator is being used for self-supervised
    # training.
    for split_name in ("TRAIN", "TEST"):
        split_cfg = cfg.DATA[split_name]
        if len(split_cfg.LABEL_SOURCES) == 0 and split_cfg.LABEL_TYPE != "zero":
            split_cfg.LABEL_TYPE = "sample_index"

    # if the user has specified the model initialization from a params_file, we check if
    # the params_file is a url. If it is, we download the file to a local cache directory
    # and use that instead
    from vissl.utils.checkpoint import get_checkpoint_folder
    from vissl.utils.io import cache_url, is_url

    if is_url(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        checkpoint_dir = get_checkpoint_folder(cfg)
        cache_dir = f"{checkpoint_dir}/params_file_cache/"
        cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cache_url(
            url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE, cache_dir=cache_dir
        )

    # ZeRO2: Infer the settings for ShardedDDP which shards the optimizer state
    # and the model weights. For ShardedDDP, we must use the OSS optimizer,
    # set the right task name, use the PyTorch AMP if AMP is used.
    if cfg.MODEL.SHARDED_DDP_SETUP.USE_SDP:
        cfg.OPTIMIZER.use_zero = True
        cfg.TRAINER.TASK_NAME = "self_supervision_sdp_task"
        if cfg.MODEL.AMP_PARAMS.USE_AMP:
            cfg.MODEL.AMP_PARAMS.AMP_TYPE = "pytorch"

    # if we use a zero optimizer, we nest the optimizer related settings under the
    # base_optimizer.
    if cfg.OPTIMIZER.use_zero:
        cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
        cfg.OPTIMIZER.name = "zero"
        # settings removed from the nested base optimizer copy
        for dropped_key in (
            "param_schedulers",
            "regularize_bn",
            "regularize_bias",
            "num_epochs",
            "use_zero",
            "head_optimizer_params",
        ):
            del cfg.OPTIMIZER.base_optimizer[dropped_key]

    # inference for the FSDP settings. Conditions are:
    # 1) use the FSDP task
    # 2) use the single param group in the optimizer
    # 3) if AMP is used, it must be PyTorch AMP
    # 4) If training SwAV, we automatically set the head to SwAV FSDP head
    # 5) Inference for the FSDP parameters to ensure the good convergence
    if cfg.MODEL.FSDP_CONFIG.AUTO_SETUP_FSDP:
        cfg.TRAINER.TASK_NAME = "self_supervision_fsdp_task"
        cfg.OPTIMIZER.construct_single_param_group_only = True

        # safely set flatten_parameters=True for FSDP trainings.
        cfg["MODEL"]["FSDP_CONFIG"]["flatten_parameters"] = True
        # recommended FSDP settings below for the convergence
        cfg["MODEL"]["FSDP_CONFIG"]["compute_dtype"] = "float32"

        # Inference of optimizer configuration
        if cfg["OPTIMIZER"]["use_larc"]:
            cfg["OPTIMIZER"]["name"] = "sgd_fsdp"

        # AMP based inference
        if cfg["MODEL"]["AMP_PARAMS"]["USE_AMP"]:
            cfg["MODEL"]["AMP_PARAMS"]["AMP_TYPE"] = "pytorch"
            cfg["MODEL"]["FSDP_CONFIG"]["mixed_precision"] = True
            cfg["MODEL"]["FSDP_CONFIG"]["fp32_reduce_scatter"] = True
        else:
            # if not using AMP, we can't use mixed_precision as it requires PyTorch AMP
            cfg["MODEL"]["FSDP_CONFIG"]["mixed_precision"] = False
            # if mixed_precision=False, FSDP mandates setting fp32_reduce_scatter=False
            cfg["MODEL"]["FSDP_CONFIG"]["fp32_reduce_scatter"] = False

        # Inference of the head in case of training with FSDP
        fsdp_head_names = {
            "swav_head": "swav_head_fsdp",
            "eval_mlp": "eval_mlp_fsdp",
            "mlp": "mlp_fsdp",
        }
        for i, head_param in enumerate(cfg.MODEL.HEAD.PARAMS):
            if head_param[0] in fsdp_head_names:
                cfg.MODEL.HEAD.PARAMS[i][0] = fsdp_head_names[head_param[0]]

        # Inference of the trunk in case of training with FSDP
        if cfg.MODEL.TRUNK.NAME == "regnet":
            cfg.MODEL.TRUNK.NAME = "regnet_fsdp"

        # Profiling the communication requires some setup for FSDP
        if cfg.PROFILING.MEMORY_PROFILING.TRACK_BY_LAYER_MEMORY:
            cfg["MODEL"]["FSDP_CONFIG"]["_TRACK_COMMUNICATIONS"] = True

        logging.info(f"Using the FSDP config: {cfg.MODEL.FSDP_CONFIG}")

    # Delete the AUTO_SETUP_FSDP key since we send the FSDP_CONFIG
    # to FSDP from fairscale which doesn't know about AUTO_SETUP_FSDP
    del cfg.MODEL.FSDP_CONFIG["AUTO_SETUP_FSDP"]
def infer_and_assert_hydra_config(cfg, engine_name: str):
    """
    Fill in config values that can be derived from other config values and
    assert a few consistency constraints. The config is modified in place.

    1. Loss settings and the learning rate are inferred by dedicated helpers
       and the transforms are checked.
    2. The seed is copied to cfg["MODEL"]["_MODEL_INIT_SEED"].
    3. `METERS.name` is merged into `METERS.names`; for list meters, the
       number of meters and their names are taken from the evaluated layers
       (feature extractor model or the "extract_label_predictions" engine).
    4. LABEL_TYPE is set to "sample_index" for splits without label sources,
       unless the user asked for "zero".
    5. A params file given as a url is cached locally.
    6. ShardedDDP, ZeRO optimizer and FSDP settings are inferred.
    7. Constraints for the generic_ssl dataset, the model output mask and the
       EMA meters are asserted.

    Args:
        cfg: run configuration
        engine_name (str): name of the engine being run
    """
    cfg = infer_losses_config(cfg)
    cfg = infer_learning_rate(cfg)
    assert_transforms(cfg)

    # pass the seed to cfg["MODEL"] so that model init on different nodes can
    # use the same seed.
    # TODO (Min): once FSDP supports sync'ing weights from rank 0, we don't need
    #             this anymore.
    cfg["MODEL"]["_MODEL_INIT_SEED"] = cfg.SEED_VALUE

    # in case of linear evaluation, we often evaluate several layers at a time. For each
    # layer, there's a separate accuracy meter. In such case, we want to output the layer
    # name in the meters output to make it easy to interpret the results. This is
    # currently only supported for cases where we have linear evaluation.
    if cfg.METERS is not None:
        from vissl.models import is_feature_extractor_model

        # Ensure backwards compatibility of cfg.METERS.name.
        legacy_name = cfg.METERS.get("name", "")
        if legacy_name:
            merged_names = set(cfg.METERS.get("names", []))
            merged_names.add(legacy_name)
            cfg.METERS.names = list(merged_names)

        valid_meters = [
            "accuracy_list_meter",
            "mean_ap_list_meter",
            "precision_at_k_list_meter",
            "recall_at_k_list_meter",
        ]

        for meter_name in cfg.METERS.get("names", []):
            if meter_name not in valid_meters:
                continue

            feat_eval_ops_map = (
                cfg.MODEL.FEATURE_EVAL_SETTINGS.LINEAR_EVAL_FEAT_POOL_OPS_MAP
            )
            all_meter_names = [item[0] for item in feat_eval_ops_map]
            meter_cfg = cfg.METERS[meter_name]

            if is_feature_extractor_model(cfg.MODEL):
                meter_cfg["num_meters"] = len(feat_eval_ops_map)
                meter_cfg["meter_names"] = all_meter_names
            elif engine_name == "extract_label_predictions":
                if len(feat_eval_ops_map) > 0:
                    meter_cfg["num_meters"] = len(feat_eval_ops_map)
                    meter_cfg["meter_names"] = all_meter_names
                else:
                    # if user is not extracting from multiple layers, we assume
                    # the model head is being used.
                    meter_cfg["num_meters"] = 1

    # in SSL, during pre-training we don't want to use annotated labels or during feature
    # extraction, we don't have annotated labels for some datasets. In such cases, we set
    # the label type to be just the image index in the dataset, unless the
    # user has specifically provided "zero" as the label type, which is
    # necessary when the CutMixUp collator is being used for self-supervised
    # training.
    for split_name in ("TRAIN", "TEST"):
        split_cfg = cfg.DATA[split_name]
        if len(split_cfg.LABEL_SOURCES) == 0 and split_cfg.LABEL_TYPE != "zero":
            split_cfg.LABEL_TYPE = "sample_index"

    # if the user has specified the model initialization from a params_file, we check if
    # the params_file is a url. If it is, we download the file to a local cache directory
    # and use that instead
    from vissl.utils.checkpoint import get_checkpoint_folder
    from vissl.utils.io import cache_url, is_url

    if is_url(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        checkpoint_dir = get_checkpoint_folder(cfg)
        cache_dir = f"{checkpoint_dir}/params_file_cache/"
        cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cache_url(
            url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE, cache_dir=cache_dir
        )

    # ZeRO2: Infer the settings for ShardedDDP which shards the optimizer state
    # and the model weights. For ShardedDDP, we must use the OSS optimizer,
    # set the right task name, use the PyTorch AMP if AMP is used.
    if cfg.MODEL.SHARDED_DDP_SETUP.USE_SDP:
        cfg.OPTIMIZER.use_zero = True
        cfg.TRAINER.TASK_NAME = "self_supervision_sdp_task"
        if cfg.MODEL.AMP_PARAMS.USE_AMP:
            cfg.MODEL.AMP_PARAMS.AMP_TYPE = "pytorch"

    # if we use a zero optimizer, we nest the optimizer related settings under the
    # base_optimizer.
    if cfg.OPTIMIZER.use_zero:
        cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
        cfg.OPTIMIZER.name = "zero"
        # settings removed from the nested base optimizer copy
        for dropped_key in (
            "param_schedulers",
            "regularize_bn",
            "regularize_bias",
            "num_epochs",
            "use_zero",
            "head_optimizer_params",
        ):
            del cfg.OPTIMIZER.base_optimizer[dropped_key]

    # Infer fsdp settings
    cfg = infer_fsdp_setup(cfg)

    if cfg.DATA.TRAIN.BASE_DATASET == "generic_ssl":
        phases_per_epoch = cfg.DATA.TRAIN.get("TRAIN_PHASES_PER_EPOCH", 1)
        assert (
            phases_per_epoch == 1
        ), "When using the generic_ssl, we must set TRAIN_PHASES_PER_EPOCH = 1."

    if cfg.METERS.model_output_mask:
        assert (
            len(cfg.DATA.TEST.DATA_SOURCES) > 0
        ), "Model output mask is only applicable when there is a test dataset."

        assert (
            cfg.DATA.TEST.BASE_DATASET == "generic_ssl"
        ), "Model output mask is only supported with ssl dataset."

        # Remove CHECK_NAN hooks, as model output masking casts the logits
        # to -inf, which will throw an error from the CHECK_NAN hooks.
        cfg.HOOKS.CHECK_NAN = False

    if cfg.HOOKS.EMA_MODEL.ENABLE_EMA_METERS:
        assert cfg.METERS.get("name", "") or cfg.METERS.get(
            "names", []
        ), "Please specify METER.name or METER.names if you are enabling the EMA_MODEL hook."
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus, according to the cfg.

    When the world size (NUM_NODES * NUM_PROC_PER_NODE) is larger than 1, one
    worker process is spawned per local process; otherwise the worker is run
    directly in the current process.

    Args:
        cfg -- VISSL yaml configuration
        node_id -- node_id for this node
        engine_name -- what engine to run: train or extract_features
        hook_generator -- Callback to generate all the ClassyVision hooks
            for this engine

    Raises:
        RuntimeError: logged and then re-raised. A KeyboardInterrupt is
            logged and swallowed.
    """
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)

    # given the checkpoint folder, we check that there's not already a final
    # checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint where to load from. The load_checkpoints function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    # Copy the data to local only once we know we are going to run: the
    # matching cleanup lives in the `finally` below, so copying before the
    # early return above would leave the local copy behind.
    copy_to_local(cfg)

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )
    except (KeyboardInterrupt, RuntimeError) as e:
        # The message needs a %s placeholder: logging applies `msg % args`,
        # and an argument without a placeholder is reported as a logging
        # error instead of being printed.
        logging.error("Wrapping up, caught exception: %s", e)
        if isinstance(e, RuntimeError):
            # bare raise keeps the original traceback intact
            raise
    finally:
        cleanup_local_dir(cfg)

    logging.info("All Done!")
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    Build the list of hooks used during training, based on the user config.

    A fixed set of basic hooks is always appended at the end. Before those,
    the following hooks are added only when the config asks for them:
        - perf stats hook (HOOKS.PERF_STATS.MONITOR_PERF_STATS)
        - loss specific hooks (swav, swav momentum, deepcluster v2, moco),
          selected through LOSS.name
        - model complexity hook (HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY)
        - gpu stats / gpu memory summary hooks
        - Tensorboard hook (HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD)
        - gradient clipping hook (MODEL.GRAD_CLIP.USE_GRAD_CLIP)
        - profiling hook (when ProfilingHook.is_enabled(cfg.PROFILING))

    Returns:
        hooks (List(functions)): list containing the hook functions that
            will be used
    """
    selected = []
    perf_cfg = cfg.HOOKS.PERF_STATS
    loss_name = cfg.LOSS.name

    # ---- optional hooks, added based on the use-case ----
    if perf_cfg.MONITOR_PERF_STATS:
        # a non-positive frequency means "no frequency" for the hook
        frequency = None
        if perf_cfg.PERF_STAT_FREQUENCY > 0:
            frequency = perf_cfg.PERF_STAT_FREQUENCY
        selected.append(LogPerfTimeMetricsHook(frequency))

    if loss_name == "swav_loss":
        selected += [SwAVUpdateQueueScoresHook(), NormalizePrototypesHook()]

    if loss_name == "swav_momentum_loss":
        swav_momentum_cfg = cfg.LOSS["swav_momentum_loss"]
        selected += [
            SwAVMomentumHook(
                swav_momentum_cfg["momentum"],
                swav_momentum_cfg["momentum_eval_mode_iter_start"],
                swav_momentum_cfg["crops_for_assign"],
            ),
            SwAVMomentumNormalizePrototypesHook(),
        ]

    if loss_name == "deepclusterv2_loss":
        selected += [InitMemoryHook(), ClusterMemoryHook()]

    if loss_name == "moco_loss":
        # batch shuffling is turned off when BN is converted to SyncBN
        shuffle = not cfg.MODEL.SYNC_BN_CONFIG.CONVERT_BN_TO_SYNC_BN
        selected.append(
            MoCoHook(cfg.LOSS["moco_loss"]["momentum"], shuffle_batch=shuffle)
        )

    if cfg.HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        selected.append(SSLModelComplexityHook())

    if cfg.HOOKS.LOG_GPU_STATS:
        selected.append(LogGpuStatsHook())

    if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        selected.append(LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM))

    if cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install tensorboard using:"
            "If pip environment: `pip install tensorboard` "
            "If using conda and you prefer conda install of tensorboard: "
            "`conda install -c conda-forge tensorboard`"
        )
        selected.append(get_tensorboard_hook(cfg))

    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        grad_clip_cfg = cfg.MODEL.GRAD_CLIP
        selected.append(
            GradClipHook(
                norm_type=grad_clip_cfg.NORM_TYPE,
                max_norm=grad_clip_cfg.MAX_NORM,
            )
        )

    # ---- hooks that are used irrespective of the workflow type ----
    rolling_btime_freq = None
    if perf_cfg.ROLLING_BTIME_FREQ > 0:
        rolling_btime_freq = perf_cfg.ROLLING_BTIME_FREQ

    if ProfilingHook.is_enabled(cfg.PROFILING):
        selected.append(ProfilingHook(profiling_config=cfg.PROFILING))

    num_workers = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    ckpt_dir = get_checkpoint_folder(cfg)
    selected += [
        CheckNanLossHook(),
        SetDataSamplerEpochHook(),
        FreezeParametersHook(),
        UpdateBatchesSeenHook(),
        UpdateTrainBatchTimeHook(),
        UpdateTestBatchTimeHook(),
        UpdateTrainIterationNumHook(),
        LogLossMetricsCheckpointHook(num_workers),
        LogLossLrEtaHook(ckpt_dir, rolling_btime_freq),
    ]
    return selected
def instance_retrieval_test(args, cfg):
    """
    Run the instance retrieval benchmark configured under cfg.IMG_RETRIEVAL.

    The workflow is: build the train/eval datasets and the frozen model,
    optionally train (or load) a PCA on the train features, extract the
    query and database features of the eval dataset, score their dot-product
    similarity and optionally save rankings, similarity scores and metrics
    in the checkpoint folder.

    Args:
        args: not used by this function
        cfg: VISSL configuration; settings are read from cfg.IMG_RETRIEVAL
    """
    # Feature extraction requires 1 gpu, hence CUDA must be available.
    assert torch.cuda.is_available(), "CUDA not available, Exit!"

    retrieval_cfg = cfg.IMG_RETRIEVAL
    train_dataset_name = retrieval_cfg.TRAIN_DATASET_NAME
    eval_dataset_name = retrieval_cfg.EVAL_DATASET_NAME
    spatial_levels = retrieval_cfg.SPATIAL_LEVELS
    resize_img = retrieval_cfg.RESIZE_IMG
    eval_binary_path = retrieval_cfg.EVAL_BINARY_PATH
    root_dataset_path = retrieval_cfg.DATASET_PATH

    # features are only kept on disk when the user asks to save them
    temp_dir = None
    if retrieval_cfg.SAVE_FEATURES:
        temp_dir = os.path.join(get_checkpoint_folder(cfg), "features")
        logging.info(f"Temp directory: {temp_dir}")

    ############################################################################
    # Step 1: Prepare the train/eval datasets, create model and load weights.
    # The train dataset is only created if we need PCA/whitening, otherwise
    # train_dataset is None.
    train_dataset = get_train_dataset(
        cfg, root_dataset_path, train_dataset_name, eval_binary_path
    )

    # The eval dataset. INSTRE data evaluation requires whitening.
    eval_dataset = get_eval_dataset(
        cfg, root_dataset_path, eval_dataset_name, eval_binary_path
    )

    # Basic data transforms applied on the train/eval dataset, and the helper
    # that loads the images.
    transforms = get_transforms(cfg, eval_dataset_name)
    image_helper = InstanceRetrievalImageLoader(S=resize_img, transforms=transforms)

    # Build the model on gpu and set it in eval mode
    model = copy_model_to_gpu(build_retrieval_model(cfg))
    logging.info("Freezing the model.....")
    model.eval()
    model.freeze_head_and_trunk()

    ############################################################################
    # Step 2: Extract the features for the train dataset, calculate PCA or
    # whitening and save
    pca = None
    if retrieval_cfg.TRAIN_PCA_WHITENING:
        logging.info("Extracting training features...")
        # the features are already processed based on type: rmac | GeM | l2 norm
        train_features = get_train_features(
            cfg,
            temp_dir,
            train_dataset_name,
            resize_img,
            spatial_levels,
            image_helper,
            train_dataset,
            model,
        )

        ########################################################################
        # Train PCA on the train features (or reuse a previously saved one)
        pca_out_fname = (
            f"{temp_dir}/{train_dataset_name}_S{resize_img}_PCA.pickle"
            if temp_dir
            else None
        )
        if pca_out_fname and PathManager.exists(pca_out_fname):
            logging.info("Loading PCA...")
            pca = load_pca(pca_out_fname)
        else:
            logging.info("Training and saving PCA...")
            pca = train_and_save_pca(
                train_features, retrieval_cfg.N_PCA, pca_out_fname
            )

    ############################################################################
    # Step 4: Extract db_features and q_features for the eval dataset
    logging.info("Extracting Queries features...")
    features_queries = get_queries_features(
        cfg,
        temp_dir,
        eval_dataset_name,
        resize_img,
        spatial_levels,
        image_helper,
        eval_dataset,
        model,
        pca,
    )

    logging.info("Extracting Dataset features...")
    features_dataset = get_dataset_features(
        cfg,
        temp_dir,
        eval_dataset_name,
        resize_img,
        spatial_levels,
        image_helper,
        eval_dataset,
        model,
        pca,
    )

    ############################################################################
    # Step 5: Compute similarity, score, and save results
    logging.info("Calculating similarity and score...")
    sim = features_queries.dot(features_dataset.T)
    logging.info(f"Similarity tensor: {sim.shape}")
    results = eval_dataset.score(sim, temp_dir)

    ############################################################################
    # Step 6: save results and cleanup the temp directory
    if retrieval_cfg.SAVE_RETRIEVAL_RANKINGS_SCORES:
        out_dir = get_checkpoint_folder(cfg)

        # Rankings: computed on the transposed similarity, sorted by
        # decreasing similarity along the first axis.
        sim_t = sim.T
        ranks = np.argsort(-sim_t, axis=0)
        save_file(ranks.T.tolist(), os.path.join(out_dir, "rankings.json"))

        # Similarity scores (transposed matrix)
        save_file(sim_t.tolist(), os.path.join(out_dir, "similarity_scores.json"))

        # Result metrics
        save_file(results, os.path.join(out_dir, "metrics.json"))

    logging.info("All done!!")
def run_knn_at_layer(cfg: AttrDict, layer_name: str = "heads"):
    """
    Run the Nearest Neighbour benchmark at the layer "layer_name".

    The train and test features previously extracted for the layer are loaded
    from cfg.NEAREST_NEIGHBOR.FEATURES.PATH, the test features are processed
    in chunks, and for each chunk the top-k most similar train features (dot
    product) are used to predict the labels. The predictions are saved in the
    checkpoint folder.

    Args:
        cfg (AttrDict): VISSL configuration; settings are read from
            cfg.NEAREST_NEIGHBOR
        layer_name (str): name of the layer whose features are evaluated

    Returns:
        tuple: (accuracies.top_1, accuracies.top_5, accuracies.total)
    """
    temperature = cfg.NEAREST_NEIGHBOR.SIGMA
    num_neighbors = cfg.NEAREST_NEIGHBOR.TOPK
    feature_dir = cfg.NEAREST_NEIGHBOR.FEATURES.PATH
    use_cuda = cfg.NEAREST_NEIGHBOR.USE_CUDA
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Testing with sigma: {temperature}, topk neighbors: {num_neighbors}")

    ############################################################################
    # Step 1: get train and test features
    train_out = ExtractedFeaturesLoader.load_features(
        feature_dir, "train", layer_name, flatten_features=True
    )
    train_features, train_labels = train_out["features"], train_out["targets"]
    test_out = ExtractedFeaturesLoader.load_features(
        feature_dir, "test", layer_name, flatten_features=True
    )
    test_features, test_labels = test_out["features"], test_out["targets"]
    train_features = torch.from_numpy(train_features).float()
    test_features = torch.from_numpy(test_features).float()
    train_labels = torch.LongTensor(train_labels)
    num_classes = train_labels.max() + 1

    ###########################################################################
    # Step 2: calculate the nearest neighbor and the metrics
    accuracies = Accuracies()
    if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
        train_features = nn.functional.normalize(train_features, dim=1, p=2)
        test_features = nn.functional.normalize(test_features, dim=1, p=2)

    # put train features and labels on gpu and transpose train features
    if use_cuda:
        train_features = train_features.cuda().t()
        test_features = test_features.cuda()
        train_labels = train_labels.cuda()
    else:
        train_features = train_features.t()

    num_test_images, num_chunks = test_labels.shape[0], 100
    # With fewer test images than chunks the integer division yields 0, and
    # range() rejects a zero step: always process at least 1 image per chunk.
    imgs_per_chunk = max(1, num_test_images // num_chunks)
    output_targets, output_predicted_label, output_inds = [], [], []
    with torch.no_grad():
        for idx in range(0, num_test_images, imgs_per_chunk):
            chunk_end = min(idx + imgs_per_chunk, num_test_images)

            # get the features and the targets for the test images of the chunk
            features = test_features[idx:chunk_end, :]
            targets = test_labels[idx:chunk_end, :]
            batch_size = targets.shape[0]
            targets = torch.LongTensor(targets)
            if use_cuda:
                targets = targets.cuda()

            # calculate the dot product and compute top-k neighbors
            similarity = torch.mm(features, train_features)
            distances, indices = similarity.topk(
                num_neighbors, largest=True, sorted=True
            )
            candidates = train_labels.view(1, -1).expand(batch_size, -1)
            retrieved_neighbors = torch.gather(candidates, 1, indices)
            retrieval_one_hot = torch.zeros(batch_size * num_neighbors, num_classes)
            if use_cuda:
                retrieval_one_hot = retrieval_one_hot.cuda()
            retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1)
            predictions = _get_sorted_predictions(
                batch_size, num_classes, distances, retrieval_one_hot, temperature
            )

            # find the predictions that match the target
            accuracies = accuracies + Accuracies.from_batch(predictions, targets)

            # get the predictions, nearest neighbors, inds to save
            output_inds.extend(range(idx, chunk_end))
            output_predicted_label.append(predictions.data.cpu().numpy())
            output_targets.append(targets.data.cpu().numpy())

    _save_knn_results(
        output_dir, layer_name, output_inds, output_predicted_label, output_targets
    )
    accuracies.log(layer_name)
    return accuracies.top_1, accuracies.top_5, accuracies.total
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus of the current node according
    to the cfg.

    If more than 1 node is needed for training, this function should be called
    on each of the different nodes, each time with a unique node_id in the
    range [0..N-1] if N is the total number of nodes to take part in training.

    Alternatively, you can use SLURM or any cluster management system to run
    this function for you.

    Configures the node_id and dist_run_id, sets up the environment variables,
    resolves the checkpoint to resume from and spawns one worker process per
    local process.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision
            hooks for this engine

    Raises:
        AssertionError: if more gpus are requested than available, or if the
            specified PARAMS_FILE does not exist when no checkpoint is found.
        RuntimeError: logged and then re-raised. A KeyboardInterrupt is
            logged and swallowed.
    """
    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)

    # If using gpus, we check that the user has specified <= gpus available on
    # user system.
    if cfg.MACHINE.DEVICE == "gpu":
        assert cfg.DISTRIBUTED.NUM_PROC_PER_NODE <= torch.cuda.device_count(), (
            f"User system doesn't have requested {cfg.DISTRIBUTED.NUM_PROC_PER_NODE} gpus "
            f"available. Number of gpus found on user system={torch.cuda.device_count()}. "
            "Please set the DISTRIBUTED.NUM_PROC_PER_NODE properly."
        )

    # set the environment variables including local rank, node id etc.
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)

    # given the checkpoint folder, we check that there's not already a final
    # checkpoint and that if there already exists a final checkpoint and user
    # is not overriding to ignore the final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint where to resume from. The get_resume_checkpoint
    # function will automatically take care of detecting whether it's a resume
    # or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and g_pathmgr.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    # assert that if the user set the PARAMS_FILE, it must exist and be valid.
    # we only use the PARAMS_FILE init if the checkpoint doesn't exist for the
    # given training. This ensures that if the same training resumes, then it
    # resumes from the checkpoint and not the weight init
    if checkpoint_path is None and cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]:
        params_file = cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]
        error_message = f"Specified PARAMS_FILE does NOT exist: {params_file}"
        assert g_pathmgr.exists(params_file), error_message

    # copy the data to local if user wants. This can speed up dataloading.
    _copy_to_local(cfg)

    try:
        torch.multiprocessing.spawn(
            _distributed_worker,
            nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
            args=(
                cfg,
                node_id,
                dist_run_id,
                engine_name,
                checkpoint_path,
                checkpoint_folder,
                hook_generator,
            ),
            daemon=False,
        )
    except (KeyboardInterrupt, RuntimeError) as e:
        # The message needs a %s placeholder: logging applies `msg % args`,
        # and an argument without a placeholder is reported as a logging
        # error instead of being printed.
        logging.error("Wrapping up, caught exception: %s", e)
        if isinstance(e, RuntimeError):
            # bare raise keeps the original traceback intact
            raise
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()
def instance_retrieval_test(args, cfg):
    """
    Run the instance retrieval benchmark configured under cfg.IMG_RETRIEVAL.

    The features are either loaded from a previous feature extraction run
    (USE_FEATURE_EXTRACTION_ENGINE) or extracted here with a frozen model.
    A PCA is optionally trained (or loaded) on the train features, then the
    query and database features of the eval dataset are compared with the
    configured similarity measure, scored, and the rankings, similarity
    scores and metrics are optionally saved in the checkpoint folder.

    Args:
        args: not used by this function
        cfg: VISSL configuration; settings are read from cfg.IMG_RETRIEVAL

    Raises:
        ValueError: if cfg.IMG_RETRIEVAL.SIMILARITY_MEASURE is neither
            "cosine_similarity" nor "l2".
    """
    retrieval_cfg = cfg.IMG_RETRIEVAL

    if (
        retrieval_cfg.USE_FEATURE_EXTRACTION_ENGINE
        and not retrieval_cfg.FEATURE_EXTRACTION_DIR
    ):
        # We require 1-gpu for feature extraction. Hence check CUDA is
        # available. If FEATURE_EXTRACTION_DIR is provided, the features have
        # already been extracted and no GPU is required.
        assert torch.cuda.is_available(), "CUDA not available, Exit!"

    train_dataset_name = retrieval_cfg.TRAIN_DATASET_NAME
    eval_dataset_name = retrieval_cfg.EVAL_DATASET_NAME
    spatial_levels = retrieval_cfg.SPATIAL_LEVELS
    resize_img = retrieval_cfg.RESIZE_IMG
    eval_binary_path = retrieval_cfg.EVAL_BINARY_PATH
    root_dataset_path = retrieval_cfg.DATASET_PATH
    use_feature_extractor = retrieval_cfg.USE_FEATURE_EXTRACTION_ENGINE

    def _extracted_features_dir(subdir):
        # Folder holding previously extracted features: the user provided
        # directory if any, else the checkpoint folder.
        base_dir = retrieval_cfg.FEATURE_EXTRACTION_DIR or get_checkpoint_folder(cfg)
        return os.path.join(base_dir, subdir)

    # features are only kept on disk when the user asks to save them
    temp_dir = None
    if retrieval_cfg.SAVE_FEATURES:
        temp_dir = os.path.join(get_checkpoint_folder(cfg), "features")
        logging.info(f"Temp directory: {temp_dir}")

    ############################################################################
    # Step 1: Prepare the train/eval datasets, create model and load weights.
    # The train dataset is only created if we need PCA/whitening, otherwise
    # train_dataset is None.
    train_dataset = get_train_dataset(
        cfg, root_dataset_path, train_dataset_name, eval_binary_path
    )

    # The eval dataset. INSTRE data evaluation requires whitening.
    eval_dataset = get_eval_dataset(
        cfg, root_dataset_path, eval_dataset_name, eval_binary_path
    )

    # Basic data transforms applied on the train/eval dataset, and the helper
    # that loads the images.
    transforms = get_transforms(cfg, eval_dataset_name)
    image_helper = InstanceRetrievalImageLoader(
        S=resize_img,
        transforms=transforms,
        center_crop=retrieval_cfg.CENTER_CROP,
    )

    model = None
    if not use_feature_extractor:
        # Build the model on gpu and set it in eval mode
        model = copy_model_to_gpu(build_retrieval_model(cfg))
        logging.info("Freezing the model.....")
        model.eval()
        model.freeze_head_and_trunk()

    ############################################################################
    # Step 2: Extract the features for the train dataset, calculate PCA or
    # whitening and save
    pca = None
    if retrieval_cfg.TRAIN_PCA_WHITENING:
        logging.info("Extracting training features...")
        # the features are already processed based on type: rmac | GeM | l2 norm
        with PerfTimer("get_train_features", PERF_STATS):
            # TODO: encapsulate the approach "WithFeatureExtractor" from the other one.
            if use_feature_extractor:
                train_features = load_and_process_features(
                    cfg, _extracted_features_dir("train_database"), "train"
                )
            else:
                train_features = extract_train_features(
                    cfg,
                    temp_dir,
                    train_dataset_name,
                    resize_img,
                    spatial_levels,
                    image_helper,
                    train_dataset,
                    model,
                )
            # collapse every leading axis so each row is one feature vector
            train_features = np.vstack(
                [feat.reshape(-1, feat.shape[-1]) for feat in train_features]
            )

        ########################################################################
        # Train PCA on the train features (or reuse a previously saved one)
        pca_out_fname = (
            f"{temp_dir}/{train_dataset_name}_S{resize_img}_PCA.pickle"
            if temp_dir
            else None
        )
        if pca_out_fname and g_pathmgr.exists(pca_out_fname):
            logging.info("Loading PCA...")
            pca = load_pca(pca_out_fname)
        else:
            logging.info("Training and saving PCA...")
            pca = train_and_save_pca(
                train_features, retrieval_cfg.N_PCA, pca_out_fname
            )

    ############################################################################
    # Step 4: Extract db_features and q_features for the eval dataset
    with PerfTimer("get_query_features", PERF_STATS):
        logging.info("Extracting Queries features...")
        # TODO: encapsulate the approach "WithFeatureExtractor" from the other one.
        if use_feature_extractor:
            features_queries = load_and_process_features(
                cfg, _extracted_features_dir("query"), "test", pca
            )
        else:
            features_queries = get_queries_features(
                cfg,
                temp_dir,
                eval_dataset_name,
                resize_img,
                spatial_levels,
                image_helper,
                eval_dataset,
                model,
                pca,
            )
        features_queries = np.vstack(features_queries)

    with PerfTimer("get_dataset_features", PERF_STATS):
        logging.info("Extracting Dataset features...")
        # TODO: encapsulate the approach "WithFeatureExtractor" from the other one.
        if use_feature_extractor:
            features_dataset = load_and_process_features(
                cfg, _extracted_features_dir("train_database"), "test", pca
            )
        else:
            features_dataset = get_dataset_features(
                cfg,
                temp_dir,
                eval_dataset_name,
                resize_img,
                spatial_levels,
                image_helper,
                eval_dataset,
                model,
                pca,
            )
        features_dataset = np.vstack(features_dataset)

    ############################################################################
    # Step 5: Compute similarity, score, and save results
    with PerfTimer("scoring_results", PERF_STATS):
        logging.info("Calculating similarity and score...")
        measure = retrieval_cfg.SIMILARITY_MEASURE
        if measure == "cosine_similarity":
            sim = features_queries.dot(features_dataset.T)
        elif measure == "l2":
            # negated so that a larger value still means "more similar"
            sim = -compute_l2_distance_matrix(features_queries, features_dataset)
        else:
            raise ValueError(
                f"{ cfg.IMG_RETRIEVAL.SIMILARITY_MEASURE } not supported.")
        logging.info(f"Similarity tensor: {sim.shape}")
        results = eval_dataset.score(sim, temp_dir)

    ############################################################################
    # Step 6: save results and cleanup the temp directory
    if retrieval_cfg.SAVE_RETRIEVAL_RANKINGS_SCORES:
        out_dir = get_checkpoint_folder(cfg)

        # Rankings: computed on the transposed similarity, sorted by
        # decreasing similarity along the first axis.
        sim_t = sim.T
        ranks = np.argsort(-sim_t, axis=0)
        save_file(ranks.T.tolist(), os.path.join(out_dir, "rankings.json"))

        # Similarity scores (transposed matrix)
        save_file(sim_t.tolist(), os.path.join(out_dir, "similarity_scores.json"))

        # Result metrics
        save_file(
            results,
            os.path.join(out_dir, "metrics.json"),
            append_to_json=False,
        )

    logging.info("All done!!")
def rank_features(args: Namespace, cfg: AttrDict):
    """
    Rank the images of a dataset by the L2 distance to their nearest neighbor.

    The features are extracted on the full dataset, optionally projected with
    PCA and/or L2 normalized, then indexed with faiss. Every image is assigned
    the distance to its closest other image, and the image paths sorted by
    decreasing distance are saved (together with the distances) in the
    checkpoint folder.

    Args:
        args (Namespace): not used by this function
        cfg (AttrDict): VISSL configuration; settings are read from cfg.RANKING
    """
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu")
    import faiss

    def _l2_normalize(feats):
        # row-wise L2 normalization; the epsilon avoids dividing by zero
        logging.info("L2 normalizing the features now...")
        row_norms = np.linalg.norm(feats, axis=1) + 1e-5
        return feats / row_norms[:, np.newaxis]

    ranking_cfg = cfg.RANKING
    ranking_backend = ranking_cfg.RANKING_BACKEND
    data_split = ranking_cfg.FEATURES.DATA_PARTITION
    data_name = ranking_cfg.FEATURES.DATASET_NAME
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    # features are of shape num_samples x feature_dim
    features = feature_data["features"]
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Ranking Features: {features.shape}")

    ########### Step 3: Optionally apply PCA and L2 normalize features #########
    if ranking_cfg.APPLY_PCA:
        # the features are normalized before being projected
        features = _l2_normalize(features)
        logging.info(f"Projecting down to {cfg.RANKING.PCA_DIM} dims ...")
        projector = PCA(n_components=ranking_cfg.PCA_DIM)
        features = projector.fit_transform(features)
        logging.info(f"PCA features: {features.shape}")

    if ranking_cfg.NORMALIZE_FEATS:
        features = _l2_normalize(features)

    ########### Step 4: Build the L2 index on the features ###################
    logging.info(
        "Building the L2 index and searching nearest neighbor with faiss now..."
    )
    assert ranking_backend == "faiss", "Only faiss clustering is supported currently"
    feature_dim = features.shape[1]
    if ranking_cfg.USE_GPU:
        logging.info("Using gpu for faiss indexing...")
        index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), feature_dim)
    else:
        logging.info("Using CPU for faiss indexing...")
        index = faiss.IndexFlatL2(feature_dim)
    index.add(features)

    logging.info("Doing the nearest neighbor search now...")
    # Num. neighbors here is 2, so for a given point we find that same point at
    # distance 0, and its nearest neighbor
    all_distances, _ = index.search(features, 2)
    # Remove distance to self, which is always 0
    nn_distances = [row[1] for row in all_distances]

    ########### Step 5: Sorting the distances now ############
    logging.info("Sorting and ranking based on the L2 distance now...")
    ranked = sorted(
        zip(image_paths, nn_distances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    sorted_paths = [pair[0] for pair in ranked]
    sorted_distances = [pair[1] for pair in ranked]

    #### Step 6: Save image paths and distances... ###
    data_split = data_split.lower()
    ranking_output_filepath = (
        f"{output_dir}/ranking_{data_name}_{data_split}_{ranking_backend}.pkl")
    save_file(
        {"img_paths": sorted_paths, "distances": sorted_distances},
        ranking_output_filepath,
    )
    logging.info("All Done!")
def extract_main(cfg: AttrDict,
                 dist_run_id: str,
                 local_rank: int = 0,
                 node_id: int = 0):
    """
    Sets up and executes the feature extraction workflow per machine.

    For every split and every layer returned by the trainer, three files are
    saved in the checkpoint folder: the features, the targets and the indices.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
                        meters etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
                           specify how the gpus are going to rendezvous. This
                           requires specifying the communication method: file,
                           tcp and the unique rendezvous run_id that is
                           specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                   run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using
                          gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for
                       multi-gpu
    """
    # logging first, then the environment variables
    setup_logging(__name__)
    set_env_vars(local_rank, node_id, cfg)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg)

    # print the training settings and system settings (only once per machine)
    local_rank, _ = get_machine_local_and_dist_rank()
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    output_dir = get_checkpoint_folder(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split, per_layer in features.items():
        logging.info(f"============== Split: {split} =======================")
        for layer, extracted in per_layer.items():
            # one file per (rank, split, layer) and per kind of output
            out_feat_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_features.npy")
            out_target_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_targets.npy")
            out_inds_file = f"{output_dir}/rank{local_rank}_{split}_{layer}_inds.npy"

            layer_features = extracted["features"]
            logging.info("Saving extracted features: {} {} to: {}".format(
                layer, layer_features.shape, out_feat_file))
            save_file(layer_features, out_feat_file)

            layer_targets = extracted["targets"]
            logging.info("Saving extracted targets: {} to: {}".format(
                layer_targets.shape, out_target_file))
            save_file(layer_targets, out_target_file)

            layer_inds = extracted["inds"]
            logging.info("Saving extracted indices: {} to: {}".format(
                layer_inds.shape, out_inds_file))
            save_file(layer_inds, out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def assert_hydra_conf(cfg):
    """
    Infer the values of a few parameters of the config from the values of
    other config parameters. The config is modified in place.

    1. Infer the losses settings.
    2. Auto scale the learning rate if the user has specified auto scaling
       to be True.
    3. Infer the meter names (model layer name being evaluated) since we
       support list meters that have multiple outputs and the same target.
       This is very common in self-supervised learning where we want to
       evaluate the metric for several layers of the model. VISSL supports
       running evaluation for multiple model layers in a single training run.
    4. Support the multi-gpu DDP eval model by attaching a dummy parameter.
       This is particularly helpful for the multi-gpu feature extraction,
       especially when the dataset for which features are being extracted
       is large.
    5. Infer what kind of labels are being used. If the user has specified a
       labels source, the LABEL_TYPE is left untouched, otherwise if no label
       is specified, we set the LABEL_TYPE to "sample_index".
    6. Download the PARAMS_FILE to a local cache if it is a url.
    7. Nest the optimizer settings under "base_optimizer" if a zero optimizer
       is used.
    """
    cfg = infer_losses_config(cfg)
    cfg = infer_learning_rate(cfg)
    eval_settings = cfg.MODEL.FEATURE_EVAL_SETTINGS

    # In case of linear evaluation, we often evaluate several layers at a
    # time. For each layer, there's a separate accuracy meter. In such case,
    # we want to output the layer name in the meters output to make it easy to
    # interpret the results. This is currently only supported for cases where
    # we have linear evaluation.
    if cfg.METERS is not None:
        from vissl.models import is_feature_extractor_model

        meter_name = cfg.METERS.get("name", "")
        valid_meters = ["accuracy_list_meter", "mean_ap_list_meter"]
        if (
            meter_name
            and meter_name in valid_meters
            and is_feature_extractor_model(cfg.MODEL)
        ):
            meter_cfg = cfg.METERS[meter_name]
            meter_cfg["num_meters"] = len(
                eval_settings.LINEAR_EVAL_FEAT_POOL_OPS_MAP)
            meter_cfg["meter_names"] = [
                pool_op[0]
                for pool_op in eval_settings.LINEAR_EVAL_FEAT_POOL_OPS_MAP
            ]

    # In case of feature evaluation mode, we freeze the trunk. The feature
    # evaluation mode is used for the feature extraction of the trunk as well.
    # VISSL supports distributed feature extraction to speed up the extraction
    # time. Since the model needs to be DDP for the distributed extraction, we
    # need some dummy parameters in the model otherwise the model can't be
    # converted to DDP. So we attach some dummy head to the model.
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    needs_dummy_head = (
        eval_settings.EVAL_MODE_ON
        and eval_settings.FREEZE_TRUNK_ONLY
        and eval_settings.EXTRACT_TRUNK_FEATURES_ONLY
        and world_size > 1
        and len(cfg.MODEL.HEAD.PARAMS) == 0
    )
    if needs_dummy_head:
        cfg.MODEL.HEAD.PARAMS = [["mlp", {"dims": [2048, 1000]}]]

    # In SSL, during pre-training we don't want to use annotated labels, and
    # during feature extraction we don't have annotated labels for some
    # datasets. In such cases, we set the label type to be just the image
    # index in the dataset, unless the user has specifically provided "zero"
    # as the label type, which is necessary when the CutMixUp collator is
    # being used for self-supervised training.
    if (cfg.DATA.TRAIN.LABEL_TYPE != "zero"
            or len(cfg.DATA.TRAIN.LABEL_SOURCES) != 0) and len(
                cfg.DATA.TRAIN.LABEL_SOURCES) == 0:
        cfg.DATA.TRAIN.LABEL_TYPE = "sample_index"
    if (cfg.DATA.TEST.LABEL_TYPE != "zero"
            or len(cfg.DATA.TEST.LABEL_SOURCES) != 0) and len(
                cfg.DATA.TEST.LABEL_SOURCES) == 0:
        cfg.DATA.TEST.LABEL_TYPE = "sample_index"

    # If the user has specified the model initialization from a params_file,
    # we check if the params_file is a url. If it is, we download the file to
    # a local cache directory and use that instead.
    from vissl.utils.checkpoint import get_checkpoint_folder
    from vissl.utils.io import cache_url, is_url

    if is_url(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        checkpoint_dir = get_checkpoint_folder(cfg)
        cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cache_url(
            url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE,
            cache_dir=f"{checkpoint_dir}/params_file_cache/",
        )

    # If we use a zero optimizer, we nest the optimizer related settings under
    # the base_optimizer, minus the settings removed from the copy below.
    if cfg.OPTIMIZER.use_zero:
        cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
        cfg.OPTIMIZER.name = "zero"
        for removed_key in (
            "param_schedulers",
            "regularize_bn",
            "regularize_bias",
            "num_epochs",
            "use_zero",
            "head_optimizer_params",
        ):
            del cfg.OPTIMIZER.base_optimizer[removed_key]
def run_knn_at_layer_low_memory(cfg: AttrDict, layer_name: str = "heads"):
    """
    Alternate implementation of kNN which scales to bigger features
    and bigger "train" splits.

    The test features are loaded in full, while the train features are
    processed one shard at a time. For every test sample only the
    `cfg.NEAREST_NEIGHBOR.TOPK` best similarities (and the labels of the
    matching train samples) are retained across shards.

    Args:
        cfg: configuration. The function reads NEAREST_NEIGHBOR.USE_CUDA,
            NEAREST_NEIGHBOR.SIGMA, NEAREST_NEIGHBOR.TOPK,
            NEAREST_NEIGHBOR.FEATURES.PATH and NEAREST_NEIGHBOR.L2_NORM_FEATS.
        layer_name: name of the layer whose extracted features are evaluated.

    Returns:
        The tuple (accuracies.top_1, accuracies.top_5, accuracies.total)
        accumulated over all the test samples.
    """
    if cfg.NEAREST_NEIGHBOR.USE_CUDA:
        logging.warning(
            "config.NEAREST_NEIGHBOR.USE_CUDA is not available when "
            "config.NEAREST_NEIGHBOR.OPTIMIZE_MEMORY is set to True, "
            "using CPU instead"
        )

    temperature = cfg.NEAREST_NEIGHBOR.SIGMA
    num_neighbors = cfg.NEAREST_NEIGHBOR.TOPK
    feature_dir = cfg.NEAREST_NEIGHBOR.FEATURES.PATH
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Testing with sigma: {temperature}, topk neighbors: {num_neighbors}")

    # Step 1: get the test features (the train features might not fit in memory)
    test_out = ExtractedFeaturesLoader.load_features(
        feature_dir, "test", layer_name, flatten_features=True
    )
    test_features, test_labels = test_out["features"], test_out["targets"]
    test_features = torch.from_numpy(test_features).float()
    test_feature_num = test_features.shape[0]

    # Step 2: normalize the features if needed
    if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
        test_features = nn.functional.normalize(test_features, dim=1, p=2)

    # Step 3: collect the similarity score of each test feature
    # to all the train features, making sure:
    # - never to load the all train features at once to avoid OOM
    # - to keep just the 'num_neighbors' best similarity scores
    shard_paths = ExtractedFeaturesLoader.get_shard_file_names(
        input_dir=feature_dir, split="train", layer=layer_name
    )
    similarity_queue = MaxSimilarityPriorityQueue(max_size=num_neighbors)
    num_classes = 0
    for shard_path in shard_paths:
        shard_content = ExtractedFeaturesLoader.load_feature_shard(shard_path)
        train_features = torch.from_numpy(shard_content.features)
        # flatten every train feature to a vector
        train_features = train_features.float().reshape((train_features.shape[0], -1))
        if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
            train_features = nn.functional.normalize(train_features, dim=1, p=2)
        # transpose so that a single matrix product yields the similarities
        train_features = train_features.t()

        train_labels = torch.LongTensor(shard_content.targets).squeeze(-1)
        num_classes = max(num_classes, train_labels.max().item() + 1)

        # similarities has one row per test sample, one column per train sample
        similarities = torch.mm(test_features, train_features)

        # topk selects along the last dimension (the train samples of the
        # shard), so it is the number of columns that must exceed
        # 'num_neighbors'. A shard smaller than that is simply fully sorted.
        if similarities.shape[1] > num_neighbors:
            distances, indices = similarities.topk(
                num_neighbors, largest=True, sorted=True
            )
        else:
            distances, indices = torch.sort(similarities, descending=True)
        closest_labels = train_labels[indices]
        similarity_queue.push_all(distances, closest_labels)

    # Step 4: collect the samples with the closest similarities
    # for each test sample, and assemble it in a matrix with
    # shape (num_test_samples, num_neighbors)
    topk_distances, topk_labels = similarity_queue.pop_all()

    # Step 5: go through each of the test samples, batch by batch,
    # to compute the label of each test sample based on the top k
    # nearest neighbors and their corresponding labels
    accuracies = Accuracies()
    output_targets, output_predicted_label, output_inds = [], [], []

    batch_size = 100
    num_test_images = test_feature_num
    for idx in range(0, num_test_images, batch_size):
        min_idx = idx
        max_idx = min(idx + batch_size, num_test_images)
        # the last batch can hold fewer than 'batch_size' samples: every
        # shape below must follow the actual number of samples in the batch
        current_batch_size = max_idx - min_idx

        distances = topk_distances[min_idx:max_idx, ...]
        retrieved_neighbors = topk_labels[min_idx:max_idx, ...]
        targets = torch.LongTensor(test_labels[min_idx:max_idx])

        # one row per (test sample, neighbor) pair of the current batch
        retrieval_one_hot = torch.zeros(retrieved_neighbors.numel(), num_classes)
        retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1)
        predictions = _get_sorted_predictions(
            current_batch_size, num_classes, distances, retrieval_one_hot, temperature
        )

        # find the predictions that match the target
        accuracies = accuracies + Accuracies.from_batch(predictions, targets)

        # get the predictions, nearest neighbors, inds to save
        output_inds.extend(range(min_idx, max_idx))
        output_predicted_label.append(predictions.data.cpu().numpy())
        output_targets.append(targets.data.cpu().numpy())

    _save_knn_results(
        output_dir, layer_name, output_inds, output_predicted_label, output_targets
    )
    accuracies.log(layer_name)
    return accuracies.top_1, accuracies.top_5, accuracies.total
def nearest_neighbor_test(cfg: AttrDict, layer_name: str = "heads"):
    """
    Run a weighted kNN classification of the test features against the train
    features of the given layer, on GPU, and report the top-1 / top-5 accuracy.

    Args:
        cfg: configuration. The function reads NEAREST_NEIGHBOR.SIGMA,
            NEAREST_NEIGHBOR.TOPK and NEAREST_NEIGHBOR.L2_NORM_FEATS.
        layer_name: name of the layer whose extracted features are evaluated.

    Returns:
        The tuple (top1, top5) of accuracies in percentage. (0.0, 0.0) is
        returned when there is no test image to evaluate.
    """
    temperature = cfg.NEAREST_NEIGHBOR.SIGMA
    num_neighbors = cfg.NEAREST_NEIGHBOR.TOPK
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Testing with sigma: {temperature}, topk neighbors: {num_neighbors}")

    ############################################################################
    # Step 1: get train and test features
    train_out = merge_features(output_dir, "train", layer_name, cfg)
    train_features, train_labels = train_out["features"], train_out["targets"]
    # put train features and labels on gpu and transpose train features
    train_features = torch.from_numpy(train_features).float().cuda().t()
    train_labels = torch.LongTensor(train_labels).cuda()
    # plain python int so that it can safely be used as a tensor dimension
    num_classes = int(train_labels.max().item()) + 1
    if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
        # train features are transposed: each column is one train sample
        train_features = nn.functional.normalize(train_features, dim=0, p=2)

    test_out = merge_features(output_dir, "test", layer_name, cfg)
    test_features, test_labels = test_out["features"], test_out["targets"]

    ###########################################################################
    # Step 2: calculate the nearest neighbor and the metrics
    top1, top5, total = 0.0, 0.0, 0
    num_test_images, num_chunks = test_labels.shape[0], 100
    # at least one image per chunk: with fewer test images than chunks the
    # integer division gives 0, which is not a valid step for range()
    imgs_per_chunk = max(1, num_test_images // num_chunks)
    # topk can't retrieve more neighbors than there are train samples
    effective_neighbors = min(num_neighbors, train_features.shape[1])
    # "top-5" can't look at more columns than there are classes
    top5_width = min(5, num_classes)
    with torch.no_grad():
        retrieval_one_hot = torch.zeros(effective_neighbors, num_classes).cuda()
        for idx in range(0, num_test_images, imgs_per_chunk):
            chunk_end = min(idx + imgs_per_chunk, num_test_images)

            # get the features for test images and normalize the features if needed
            features = test_features[idx:chunk_end, :]
            targets = test_labels[idx:chunk_end, :]
            batch_size = targets.shape[0]
            features = torch.from_numpy(features).float().cuda()
            targets = torch.LongTensor(targets).cuda()
            if cfg.NEAREST_NEIGHBOR.L2_NORM_FEATS:
                features = nn.functional.normalize(features, dim=1, p=2)

            # calculate the dot product and compute top-k neighbors
            similarity = torch.mm(features, train_features)
            distances, indices = similarity.topk(
                effective_neighbors, largest=True, sorted=True
            )
            candidates = train_labels.view(1, -1).expand(batch_size, -1)
            retrieved_neighbors = torch.gather(candidates, 1, indices)

            # one-hot encode the label of every retrieved neighbor
            retrieval_one_hot.resize_(
                batch_size * effective_neighbors, num_classes
            ).zero_()
            retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1)

            # weight the vote of each neighbor by exp(similarity / temperature)
            distances_transform = distances.clone().div_(temperature).exp_()
            probs = torch.sum(
                torch.mul(
                    retrieval_one_hot.view(batch_size, -1, num_classes),
                    distances_transform.view(batch_size, -1, 1),
                ),
                1,
            )
            _, predictions = probs.sort(1, True)

            # find the predictions that match the target
            correct = predictions.eq(targets.data.view(-1, 1))
            top1 = top1 + correct.narrow(1, 0, 1).sum().item()
            top5 = top5 + correct.narrow(1, 0, top5_width).sum().item()
            total += targets.size(0)

    if total == 0:
        logging.warning("No test image found: can't compute the kNN accuracy")
        return 0.0, 0.0

    top1 = top1 * 100.0 / total
    top5 = top5 * 100.0 / total
    logging.info(f"Total images: {total}, Top1: {top1}, Top5: {top5}")
    return top1, top5
def geolocalization_test(cfg: AttrDict, layer_name: str = "heads", topk: int = 1):
    """
    Convert the saved class predictions of a layer to latitude/longitude and
    compute the accuracy at the distance thresholds (in km) listed in
    cfg.GEO_LOCALIZATION.ACC_KM_THRESHOLDS.

    The per-image metadata is saved to `output_metadata_predictions.json` and
    the accuracies (in percentage) to `metrics.json`, both in the checkpoint
    folder.

    Args:
        cfg: configuration. The function reads
            GEO_LOCALIZATION.TRAIN_LABEL_MAPPING,
            GEO_LOCALIZATION.ACC_KM_THRESHOLDS and DATA.
        layer_name: name of the layer whose predictions are evaluated.
        topk: number of leading predictions sliced for each image. The slice
            must hold exactly one class since a single class is converted to
            a latitude/longitude.

    Returns:
        The tuple (output_metadata, acc_dict): the per-image targets and
        predictions keyed by image path, and the accuracies as returned by
        `gcd_threshold_eval`.

    Raises:
        TypeError: if the first `topk` predictions of an image don't hold
            exactly one class.
    """
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Output dir: {output_dir} ...")

    ############################################################################
    # Step 1: Load the mapping file and partition it
    # Also load the test images and targets (latitude/longitude)
    # lastly, load the model predictions
    logging.info(
        f"Loading the label partitioning file: {cfg.GEO_LOCALIZATION.TRAIN_LABEL_MAPPING}"
    )
    partitioning = Partitioning(cfg.GEO_LOCALIZATION.TRAIN_LABEL_MAPPING)

    data_files, label_files = get_data_files("TEST", cfg.DATA)
    test_image_paths = load_file(data_files[0])
    target_lat_long = load_file(label_files[0])
    logging.info(
        f"Loaded val image paths: {test_image_paths.shape}, "
        f"ground truth latitude/longitude: {target_lat_long.shape}"
    )

    prediction_image_indices_filepath = f"{output_dir}/rank0_test_{layer_name}_inds.npy"
    predictions_filepath = f"{output_dir}/rank0_test_{layer_name}_predictions.npy"
    predictions = load_file(predictions_filepath)
    predictions_inds = load_file(prediction_image_indices_filepath)
    logging.info(
        f"Loaded predictions: {predictions.shape}, inds: {predictions_inds.shape}"
    )

    ############################################################################
    # Step 2: Convert the predicted classes to latitude/longitude and compute
    # accuracy at different km thresholds.
    gt_latitudes, gt_longitudes, predicted_lats, predicted_longs = [], [], [], []
    output_metadata = {}
    num_images = min(len(test_image_paths), len(predictions))
    for idx in range(num_images):
        # the predictions are stored along with the index of their image
        img_index = predictions_inds[idx]
        inp_img_path = test_image_paths[img_index]
        gt_latitude = float(target_lat_long[img_index][0])
        gt_longitude = float(target_lat_long[img_index][1])

        # Extract the class explicitly instead of relying on the implicit
        # conversion of an array to a scalar (deprecated by NumPy). TypeError
        # is what that implicit conversion raises for several elements.
        top_predictions = predictions[idx][:topk].reshape(-1)
        if len(top_predictions) != 1:
            raise TypeError(
                f"Expected exactly one predicted class per image, "
                f"got {len(top_predictions)} (topk={topk})"
            )
        pred_cls = int(top_predictions[0])

        pred_lat, pred_long = partitioning.get_lat_lng(pred_cls)
        output_metadata[inp_img_path] = {
            "target_lat": gt_latitude,
            "target_long": gt_longitude,
            "pred_lat": pred_lat,
            "pred_long": pred_long,
            "pred_cls": pred_cls,
        }
        gt_latitudes.append(gt_latitude)
        gt_longitudes.append(gt_longitude)
        predicted_lats.append(pred_lat)
        predicted_longs.append(pred_long)

    predicted_lats = torch.tensor(predicted_lats, dtype=torch.float)
    predicted_longs = torch.tensor(predicted_longs, dtype=torch.float)
    gt_latitudes = torch.tensor(gt_latitudes, dtype=torch.float)
    gt_longitudes = torch.tensor(gt_longitudes, dtype=torch.float)
    distances = vectorized_gc_distance(
        predicted_lats,
        predicted_longs,
        gt_latitudes,
        gt_longitudes,
    )

    # accuracy for all distances (in km)
    acc_dict = gcd_threshold_eval(
        distances, thresholds=cfg.GEO_LOCALIZATION.ACC_KM_THRESHOLDS
    )
    # report the accuracies in percentage, keyed by the threshold as a string
    gcd_dict = {
        f"{gcd_thres}": round(acc * 100.0, 4) for gcd_thres, acc in acc_dict.items()
    }
    logging.info(f"acc dist in percentage: {gcd_dict}")
    save_file(
        output_metadata,
        f"{output_dir}/output_metadata_predictions.json",
        append_to_json=False,
    )
    save_file(
        gcd_dict,
        f"{output_dir}/metrics.json",
        append_to_json=False,
    )
    return output_metadata, acc_dict