def _get_data_files(self, split):
    """
    Resolve the data (image) and label file paths for a dataset split and
    store them on the instance as ``self.data_paths`` / ``self.label_paths``.

    The resolution is delegated to ``dataset_catalog.get_data_files``:

    1. When the user has explicitly specified the data_sources, those are
       used directly and the dataset catalog registered with VISSL is not
       consulted.
    2. When no path has been specified, the dataset is looked up in the
       dataset catalog registered with VISSL. For the given list of datasets
       and the given partition (train/test), the dataset and the source
       requested by the user are verified first; then the data path of each
       dataset is fetched (it must exist and the sources must match).
       The label file is optional.

    Args:
        split: the dataset partition to resolve (train or test).
    """
    # Only the local rank is needed, to tag the log lines below.
    local_rank, _ = get_machine_local_and_dist_rank()

    resolved_files = dataset_catalog.get_data_files(
        split, dataset_config=self.cfg["DATA"]
    )
    self.data_paths, self.label_paths = resolved_files

    data_files_message = (
        f"Rank: {local_rank} split: {split} Data files:\n{self.data_paths}"
    )
    logging.info(data_files_message)
    label_files_message = (
        f"Rank: {local_rank} split: {split} Label files:\n{self.label_paths}"
    )
    logging.info(label_files_message)
def _copy_to_local(cfg: AttrDict):
    """
    Copy the data and label files of each available split to local disk.

    Splits whose ``COPY_TO_LOCAL_DISK`` setting is falsy are skipped. For the
    remaining splits the data and label files are handed to
    ``copy_data_to_local`` together with a freshly created temporary
    directory, and ``cfg.DATA[split]["COPY_DESTINATION_DIR"]`` is overwritten
    with the output directory reported by that call.

    Args:
        cfg: the configuration; it is modified in place.
    """
    for split in _get_available_splits(cfg):
        if not cfg.DATA[split].COPY_TO_LOCAL_DISK:
            continue

        requested_destination = cfg.DATA[split]["COPY_DESTINATION_DIR"]
        scratch_dir = tempfile.mkdtemp()

        files_to_copy, label_files = get_data_files(split, cfg.DATA)
        # Labels are appended in place so that a single call copies both
        # the data files and the label files.
        files_to_copy.extend(label_files)

        _, local_output_dir = copy_data_to_local(
            files_to_copy,
            requested_destination,
            tmp_destination_dir=scratch_dir,
        )
        cfg.DATA[split]["COPY_DESTINATION_DIR"] = local_output_dir
def extract_low_shot_features(args: Namespace, cfg: AttrDict, output_dir: str):
    """
    Run feature extraction for the low-shot SVM benchmark.

    The dataset name, the k values and the sample indices are read from
    ``cfg["SVM"]["low_shot"]``:

    * dataset name containing "voc": a single ``extract_features`` run is
      launched with the config as given.
    * dataset name containing "places": low-shot samples are generated first,
      then one ``extract_features`` run is launched per (sample index, k)
      pair, each with its own train data/label paths and checkpoint dir.

    Args:
        args: parsed command line arguments; only ``args.node_id`` is read.
        cfg: the configuration. In the "places" case it is modified in place
            (train data/label paths, checkpoint dir, ``TEST_MODEL``).
        output_dir: directory prefix used for the generated sample files and
            the per-sample checkpoint directories.

    Raises:
        RuntimeError: if the dataset name contains neither "voc" nor "places".
    """
    low_shot_cfg = cfg["SVM"]["low_shot"]
    dataset_name = low_shot_cfg["dataset_name"]
    k_values = low_shot_cfg["k_values"]
    sample_inds = low_shot_cfg["sample_inds"]

    if "voc" in dataset_name:
        # For voc07 low-shot the features are extracted on the full train and
        # test sets in one go (both sets have about 5K images).
        launch_distributed(
            cfg,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        return

    if "places" not in dataset_name:
        raise RuntimeError(f"Dataset not recognised: {dataset_name}")

    # For places the features could become large, so they are extracted on
    # smaller subsamples instead of the full set.
    data_paths, label_paths = dataset_catalog.get_data_files(
        split="TRAIN", dataset_config=cfg["DATA"]
    )
    targets = load_file(label_paths[0])
    logging.info("Generating low-shot samples for Places205...")
    generate_places_low_shot_samples(
        targets, k_values, sample_inds, output_dir, data_paths[0]
    )

    # The test features are shared by every low-shot setup, so they are
    # extracted during the first run only.
    test_extraction_pending = True
    for idx in sample_inds:
        for k in k_values:
            sample_images_file = f"{output_dir}/train_images_sample{idx}_k{k}.npy"
            sample_labels_file = f"{output_dir}/train_labels_sample{idx}_k{k}.npy"
            cfg.DATA.TRAIN.DATA_PATHS = [sample_images_file]
            cfg.DATA.TRAIN.LABEL_PATHS = [sample_labels_file]
            cfg.CHECKPOINT.DIR = f"{output_dir}/sample{idx}_k{k}"
            logging.info(
                f"Extracting features for places low shot: sample{idx}_k{k}"
            )
            if not test_extraction_pending:
                cfg.TEST_MODEL = False
            launch_distributed(
                cfg,
                args.node_id,
                engine_name="extract_features",
                hook_generator=default_hook_generator,
            )
            test_extraction_pending = False

    # Switch the test model back on once all extraction runs are done.
    cfg.TEST_MODEL = True
def geolocalization_test(cfg: AttrDict, layer_name: str = "heads", topk: int = 1):
    """
    Convert saved class predictions to latitude/longitude and score them.

    The function loads the label partitioning referenced by
    ``cfg.GEO_LOCALIZATION.TRAIN_LABEL_MAPPING``, the TEST split image paths
    and latitude/longitude targets, and the predictions / image indices that
    were saved in the checkpoint folder for ``layer_name``. Each predicted
    class is mapped to a latitude/longitude through the partitioning, the
    distances to the ground truth are computed with
    ``vectorized_gc_distance`` and the accuracy is evaluated at the
    thresholds in ``cfg.GEO_LOCALIZATION.ACC_KM_THRESHOLDS``.

    Two files are written to the checkpoint folder:
    ``output_metadata_predictions.json`` (per-image targets and predictions)
    and ``metrics.json`` (accuracy per threshold, in percent, rounded to
    4 decimals).

    Args:
        cfg: the configuration.
        layer_name: name used to locate ``rank0_test_{layer_name}_inds.npy``
            and ``rank0_test_{layer_name}_predictions.npy`` in the checkpoint
            folder.
        topk: number of leading entries considered in each prediction row.
            Only the first of them is converted to a location.

    Returns:
        A tuple ``(output_metadata, acc_dict)``: the per-image metadata dict
        keyed by image path, and the accuracy dict returned by
        ``gcd_threshold_eval``.
    """
    output_dir = get_checkpoint_folder(cfg)
    logging.info(f"Output dir: {output_dir} ...")

    ############################################################################
    # Step 1: Load the mapping file and partition it
    # Also load the test images and targets (latitude/longitude)
    # lastly, load the model predictions
    logging.info(
        f"Loading the label partitioning file: {cfg.GEO_LOCALIZATION.TRAIN_LABEL_MAPPING}"
    )
    partitioning = Partitioning(cfg.GEO_LOCALIZATION.TRAIN_LABEL_MAPPING)

    data_files, label_files = get_data_files("TEST", cfg.DATA)
    test_image_paths = load_file(data_files[0])
    target_lat_long = load_file(label_files[0])
    logging.info(
        f"Loaded val image paths: {test_image_paths.shape}, "
        f"ground truth latitude/longitude: {target_lat_long.shape}"
    )

    prediction_image_indices_filepath = f"{output_dir}/rank0_test_{layer_name}_inds.npy"
    predictions_filepath = f"{output_dir}/rank0_test_{layer_name}_predictions.npy"
    predictions = load_file(predictions_filepath)
    predictions_inds = load_file(prediction_image_indices_filepath)
    logging.info(
        f"Loaded predictions: {predictions.shape}, inds: {predictions_inds.shape}"
    )

    ############################################################################
    # Step 2: Convert the predicted classes to latitude/longitude and compute
    # accuracy at different km thresholds.
    gt_latitudes, gt_longitudes, predicted_lats, predicted_longs = [], [], [], []
    output_metadata = {}
    # Never walk past the shorter of the image list and the prediction list.
    num_images = min(len(test_image_paths), len(predictions))
    for idx in range(num_images):
        # predictions are stored in extraction order; predictions_inds maps
        # each of them back to its position in the test set.
        img_index = predictions_inds[idx]
        inp_img_path = test_image_paths[img_index]
        gt_latitude = float(target_lat_long[img_index][0])
        gt_longitude = float(target_lat_long[img_index][1])

        # int() on the whole slice raises TypeError as soon as it holds more
        # than one element (topk > 1), and implicit size-1 array -> scalar
        # conversion is deprecated in recent NumPy. Pick the leading entry
        # explicitly instead.
        # NOTE(review): assumes each prediction row is ordered best-first,
        # so entry 0 is the top-1 class — confirm against the extractor.
        top_classes = predictions[idx][:topk]
        pred_cls = int(top_classes[0])
        pred_lat, pred_long = partitioning.get_lat_lng(pred_cls)

        output_metadata[inp_img_path] = {
            "target_lat": gt_latitude,
            "target_long": gt_longitude,
            "pred_lat": pred_lat,
            "pred_long": pred_long,
            "pred_cls": pred_cls,
        }
        gt_latitudes.append(gt_latitude)
        gt_longitudes.append(gt_longitude)
        predicted_lats.append(pred_lat)
        predicted_longs.append(pred_long)

    predicted_lats = torch.tensor(predicted_lats, dtype=torch.float)
    predicted_longs = torch.tensor(predicted_longs, dtype=torch.float)
    gt_latitudes = torch.tensor(gt_latitudes, dtype=torch.float)
    gt_longitudes = torch.tensor(gt_longitudes, dtype=torch.float)
    distances = vectorized_gc_distance(
        predicted_lats,
        predicted_longs,
        gt_latitudes,
        gt_longitudes,
    )

    # accuracy for all distances (in km)
    acc_dict = gcd_threshold_eval(
        distances, thresholds=cfg.GEO_LOCALIZATION.ACC_KM_THRESHOLDS
    )
    # Report the accuracies as percentages keyed by the threshold as a string.
    gcd_dict = {}
    for gcd_thres, acc in acc_dict.items():
        gcd_dict[f"{gcd_thres}"] = round(acc * 100.0, 4)
    logging.info(f"acc dist in percentage: {gcd_dict}")

    save_file(
        output_metadata,
        f"{output_dir}/output_metadata_predictions.json",
        append_to_json=False,
    )
    save_file(
        gcd_dict,
        f"{output_dir}/metrics.json",
        append_to_json=False,
    )
    return output_metadata, acc_dict