def download_google_drive_url(url: str, output_path: str, output_file_name: str):
    """
    Download a file from Google Drive.

    Downloading a URL from Google Drive requires confirmation when the
    size of the file is too big (Google Drive notifies that anti-virus
    checks cannot be performed on such files).
    """
    import requests

    with requests.Session() as session:

        # First get the confirmation token and append it to the URL
        with session.get(url, stream=True, allow_redirects=True) as response:
            for k, v in response.cookies.items():
                if k.startswith("download_warning"):
                    url = url + "&confirm=" + v

        # Then download the content of the file
        with session.get(url, stream=True, verify=True) as response:
            makedir(output_path)
            path = os.path.join(output_path, output_file_name)
            total_size = int(response.headers.get("Content-length", 0))
            with open(path, "wb") as file:
                from tqdm import tqdm

                with tqdm(total=total_size) as progress_bar:
                    for block in response.iter_content(
                        chunk_size=io.DEFAULT_BUFFER_SIZE
                    ):
                        file.write(block)
                        progress_bar.update(len(block))
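# A minimal usage sketch for download_google_drive_url. The file id in the
# URL is a placeholder, not a real file; "makedir" is assumed to be the
# repo's directory-creation helper available at module level.
download_google_drive_url(
    url="https://drive.google.com/uc?export=download&id=<FILE_ID>",
    output_path="/tmp/downloads",
    output_file_name="checkpoint.torch",
)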
def save_evaluation_benchmarks(self):
    """
    Create the /evaluations directory inside the training checkpoints dir.
    Upload the json file to the parent evaluation directory, as well as
    to each child evaluation directory.
    """
    # Upload all checkpoints evaluations to parent checkpoint directory.
    evaluation_dir = self.evaluation_dir()
    parent_metrics_file = os.path.join(evaluation_dir, "evaluation_metrics.json")

    makedir(evaluation_dir)
    self._write_json_file(self.evaluation_results, parent_metrics_file)

    # Upload each checkpoint's evaluations to child directories.
    for checkpoint_str, benchmarks in self.evaluation_results.items():
        child_metrics_dir = os.path.join(evaluation_dir, checkpoint_str)
        child_metrics_file = os.path.join(
            child_metrics_dir, "evaluation_metrics.json"
        )

        makedir(child_metrics_dir)
        self._write_json_file(benchmarks, child_metrics_file)

    logging.info("Saved benchmarks json file.")
def create_submitit_executor(cfg: AttrDict):
    """
    Utility function to create a SLURM submitit executor, which is able
    to schedule arbitrary functions on a SLURM cluster.

    The configuration of the executor is derived from the SLURM part of
    the VISSL configuration provided as a parameter.
    """
    import submitit

    log_folder = cfg.SLURM.LOG_FOLDER
    makedir(log_folder)
    assert g_pathmgr.exists(
        log_folder
    ), f"Specified config.SLURM.LOG_FOLDER={log_folder} doesn't exist"
    assert cfg.SLURM.PARTITION, "SLURM.PARTITION must be set when using SLURM"

    executor = submitit.AutoExecutor(folder=log_folder)
    timeout_min = cfg.SLURM.TIME_HOURS * 60 + cfg.SLURM.TIME_MINUTES
    executor.update_parameters(
        name=cfg.SLURM.NAME,
        slurm_comment=cfg.SLURM.COMMENT,
        slurm_partition=cfg.SLURM.PARTITION,
        slurm_constraint=cfg.SLURM.CONSTRAINT,
        timeout_min=timeout_min,
        nodes=cfg.DISTRIBUTED.NUM_NODES,
        cpus_per_task=cfg.SLURM.NUM_CPU_PER_PROC * cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        tasks_per_node=1,
        gpus_per_node=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        mem_gb=cfg.SLURM.MEM_GB,
        slurm_additional_parameters=cfg.SLURM.ADDITIONAL_PARAMETERS,
    )
    return executor
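# A minimal usage sketch for create_submitit_executor, assuming `cfg` is a
# fully populated VISSL AttrDict. `add` is a hypothetical stand-in function;
# submitit can schedule any picklable callable.
def add(a: int, b: int) -> int:
    return a + b

executor = create_submitit_executor(cfg)
job = executor.submit(add, 1, 2)  # schedules the call as a SLURM job
print(job.job_id)    # SLURM job id assigned by the scheduler
print(job.result())  # blocks until the job finishes, then returns 3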
def get_train_features(
    cfg,
    temp_dir,
    train_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    train_dataset,
    model,
):
    train_features = []

    def process_train_image(i, out_dir):
        if i % LOG_FREQUENCY == 0:
            logging.info(f"Train Image: {i}")

        fname_out = f"{out_dir}/{i}.npy"
        if PathManager.exists(fname_out):
            feat = load_file(fname_out)
            train_features.append(feat)
        else:
            fname_in = train_dataset.get_filename(i)
            if is_revisited_dataset(train_dataset_name):
                img = image_helper.load_and_prepare_revisited_image(fname_in)
            elif is_whiten_dataset(train_dataset_name):
                img = image_helper.load_and_prepare_whitening_image(fname_in)
            else:
                img = image_helper.load_and_prepare_image(fname_in, roi=None)

            v = torch.autograd.Variable(img.unsqueeze(0))
            vc = v.cuda()
            # the model output is always a list
            activation_map = model(vc)[0].cpu()

            # once we have the features,
            # we can perform: rmac | gem pooling | l2 norm
            if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "rmac":
                descriptors = get_rmac_descriptors(activation_map, spatial_levels)
            else:
                descriptors = activation_map

            save_file(descriptors.data.numpy(), fname_out)
            train_features.append(descriptors.data.numpy())

    num_images = train_dataset.get_num_images()
    out_dir = f"{temp_dir}/{train_dataset_name}_S{resize_img}_features_train"
    makedir(out_dir)
    for i in range(num_images):
        process_train_image(i, out_dir)

    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
        gem_out_fname = f"{out_dir}/{train_dataset_name}_GeM.npy"
        train_features = torch.tensor(np.concatenate(train_features))
        train_features = gem_pool_and_save_features(
            train_features,
            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
            add_bias=True,
            gem_out_fname=gem_out_fname,
        )
    train_features = np.vstack([x.reshape(-1, x.shape[-1]) for x in train_features])
    logging.info(f"Train features size: {train_features.shape}")
    return train_features
def map_features_to_img_filepath(
    cls, image_paths: List[str], input_dir: str, split: str, layer: str
):
    """
    Map the features across all GPUs to the respective filenames.

    Args:
        image_paths (List[str]): list of image paths. Obtained by
            dataset.get_image_paths()
        input_dir (str): input path where the features are dumped
        split (str): whether the features are train or test data features
        layer (str): which layer of the model the features correspond to
    """
    logging.info(f"Merging features: {split} {layer}")

    output_dir = f"{input_dir}/features_to_image/{split}/{layer}"
    makedir(output_dir)
    logging.info(f"Saving the mapped features to dir: {output_dir} ...")

    shard_paths = cls.get_shard_file_names(input_dir, split=split, layer=layer)
    if not shard_paths:
        raise ValueError(f"No features found for {split} {layer}")

    for shard_path in shard_paths:
        shard_content = cls.load_feature_shard(shard_path)
        for idx in range(shard_content.num_samples):
            img_index = shard_content.indices[idx]
            img_feat = shard_content.features[idx]
            img_filename = os.path.splitext(
                os.path.basename(image_paths[img_index])
            )[0]
            out_feat_filename = os.path.join(output_dir, img_filename + ".npy")
            with g_pathmgr.open(out_feat_filename, "wb") as fopen:
                np.save(fopen, np.expand_dims(img_feat, axis=0))
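# A minimal sketch of consuming one of the per-image feature files written
# above. The directory and image name are hypothetical; each .npy holds a
# single feature vector with a leading axis of size 1 (from np.expand_dims).
import numpy as np

with open("features_to_image/train/res5/img_0001.npy", "rb") as f:
    feat = np.load(f)
print(feat.shape)  # e.g. (1, feature_dim)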
def get_queries_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_queries = []
    num_queries = eval_dataset.get_num_query_images()
    if cfg.IMG_RETRIEVAL.DEBUG_MODE:
        num_queries = 50
    logging.info(f"Getting features for queries: {num_queries}")

    q_fname_out_dir = "{}/{}_S{}_q".format(temp_dir, eval_dataset_name, resize_img)
    makedir(q_fname_out_dir)

    for idx in range(num_queries):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Query: {idx}")
        q_fname_in = eval_dataset.get_query_filename(idx)
        roi = eval_dataset.get_query_roi(idx)

        q_fname_out = f"{q_fname_out_dir}/{idx}.npy"
        if PathManager.exists(q_fname_out):
            query_feature = load_file(q_fname_out)
        else:
            query_feature = process_eval_image(
                cfg,
                q_fname_in,
                roi,
                q_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
            )
        features_queries.append(query_feature)

    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
        # GeM pool the features and apply the PCA
        gem_out_fname = f"{q_fname_out_dir}/{eval_dataset_name}_GeM.npy"
        features_queries = torch.tensor(np.concatenate(features_queries))
        features_queries = gem_pool_and_save_features(
            features_queries,
            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
            add_bias=True,
            gem_out_fname=gem_out_fname,
        )
        features_queries = pca.apply(features_queries)
    features_queries = np.vstack(features_queries)
    logging.info(f"features queries: {features_queries.shape}")
    return features_queries
def get_queries_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_queries = []
    num_queries = eval_dataset.get_num_query_images()
    num_queries = (
        num_queries
        if cfg.IMG_RETRIEVAL.NUM_QUERY_SAMPLES == -1
        else cfg.IMG_RETRIEVAL.NUM_QUERY_SAMPLES
    )
    logging.info(f"Getting features for queries: {num_queries}")

    q_fname_out_dir = None
    if temp_dir:
        q_fname_out_dir = f"{temp_dir}/{eval_dataset_name}_S{resize_img}_q"
        makedir(q_fname_out_dir)

    for idx in range(num_queries):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Query: {idx}")
        q_fname_in = eval_dataset.get_query_filename(idx)

        # Optionally crop the query by the region-of-interest (ROI).
        roi = (
            eval_dataset.get_query_roi(idx)
            if cfg.IMG_RETRIEVAL.CROP_QUERY_ROI
            else None
        )

        q_fname_out = None
        if q_fname_out_dir:
            q_fname_out = f"{q_fname_out_dir}/{idx}.npy"

        if q_fname_out and PathManager.exists(q_fname_out):
            query_feature = load_file(q_fname_out)
        else:
            query_feature = process_eval_image(
                cfg,
                q_fname_in,
                roi,
                q_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
                verbose=(idx == 0),
            )
        features_queries.append(query_feature)

    features_queries = np.vstack(features_queries)
    logging.info(f"Queries Features Size: {features_queries.shape}")
    return features_queries
def get_dataset_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_dataset = []
    num_images = eval_dataset.get_num_images()
    logging.info(f"Getting features for dataset images: {num_images}")

    db_fname_out_dir = "{}/{}_S{}_db".format(temp_dir, eval_dataset_name, resize_img)
    makedir(db_fname_out_dir)

    for idx in range(num_images):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Dataset Image: {idx}")
        db_fname_in = eval_dataset.get_filename(idx)

        db_fname_out = f"{db_fname_out_dir}/{idx}.npy"
        if PathManager.exists(db_fname_out):
            db_feature = load_file(db_fname_out)
        else:
            db_feature = process_eval_image(
                cfg,
                db_fname_in,
                None,
                db_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
            )
        features_dataset.append(db_feature)

    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
        # GeM pool the features and apply the PCA
        gem_out_fname = f"{db_fname_out_dir}/{eval_dataset_name}_GeM.npy"
        features_dataset = torch.tensor(np.concatenate(features_dataset))
        features_dataset = gem_pool_and_save_features(
            features_dataset,
            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
            add_bias=True,
            gem_out_fname=gem_out_fname,
        )
        features_dataset = pca.apply(features_dataset)
    features_dataset = np.vstack(features_dataset)
    logging.info(f"features dataset: {features_dataset.shape}")
    return features_dataset
def download_url(
    url: str,
    root: str,
    filename: Optional[str] = None,
    md5: Optional[str] = None,
) -> None:
    """Download a file from a url and place it in root.

    Args:
        url (str): URL to download file from
        root (str): Directory to place downloaded file in
        filename (str, optional): Name to save the file under.
            If None, use the basename of the URL.
        md5 (str, optional): MD5 checksum of the download.
            If None, do not check
    """
    root = os.path.expanduser(root)
    if not filename:
        filename = os.path.basename(url)
    fpath = os.path.join(root, filename)

    makedir(root)

    # check if file is already present locally
    if check_integrity(fpath, md5):
        print("Using downloaded and verified file: " + fpath)
        return

    # expand redirect chain if needed
    url = get_redirected_url(url)

    # check if file is located on Google Drive
    file_id = _get_google_drive_file_id(url)
    if file_id is not None:
        return download_file_from_google_drive(file_id, root, filename, md5)

    # download the file
    try:
        print("Downloading " + url + " to " + fpath)
        _urlretrieve(url, fpath)
    except (urllib.error.URLError, IOError) as e:  # type: ignore[attr-defined]
        if url[:5] == "https":
            url = url.replace("https:", "http:")
            print(
                "Failed download. Trying https -> http instead."
                " Downloading " + url + " to " + fpath
            )
            _urlretrieve(url, fpath)
        else:
            raise e

    # check integrity of downloaded file
    if not check_integrity(fpath, md5):
        raise RuntimeError("File not found or corrupted.")
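# A minimal usage sketch for download_url. The URL and directory are
# placeholders, not a real artifact; passing md5=None skips checksum
# verification.
download_url(
    url="https://example.com/files/model.torch",
    root="~/datasets/cache",
    filename="model.torch",
    md5=None,
)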
def get_dataset_features(
    cfg,
    temp_dir,
    eval_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    eval_dataset,
    model,
    pca,
):
    features_dataset = []
    num_images = eval_dataset.get_num_images()
    logging.info(f"Getting features for dataset images: {num_images}")

    db_fname_out_dir = None
    if temp_dir:
        db_fname_out_dir = f"{temp_dir}/{eval_dataset_name}_S{resize_img}_db"
        makedir(db_fname_out_dir)

    for idx in range(num_images):
        if idx % LOG_FREQUENCY == 0:
            logging.info(f"Eval Dataset Image: {idx}")
        db_fname_in = eval_dataset.get_filename(idx)

        db_fname_out = None
        if db_fname_out_dir:
            db_fname_out = f"{db_fname_out_dir}/{idx}.npy"

        if db_fname_out and PathManager.exists(db_fname_out):
            db_feature = load_file(db_fname_out)
        else:
            db_feature = process_eval_image(
                cfg,
                db_fname_in,
                None,
                db_fname_out,
                spatial_levels,
                image_helper,
                model,
                pca,
                eval_dataset_name,
                verbose=(idx == 0),
            )
        features_dataset.append(db_feature)

    features_dataset = np.vstack(features_dataset)
    logging.info(f"Dataset Features Size: {features_dataset.shape}")
    return features_dataset
def convert_checkpoint(input_path: str, output_path: str, output_type: str):
    assert g_pathmgr.exists(
        input_path
    ), f"Checkpoint input path: {input_path} not found."

    # Make the output directory if it doesn't exist.
    makedir(os.path.split(output_path)[0])

    setup_logging(__name__)
    if output_type == CheckpointType.consolidated.name:
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            input_path, output_path
        )
    elif output_type == CheckpointType.sliced.name:
        CheckpointFormatConverter.to_sliced_checkpoint(input_path, output_path)
    shutdown_logging()
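# A minimal usage sketch for convert_checkpoint. The paths are placeholders;
# output_type must be the name of a CheckpointType member, such as
# "consolidated" or "sliced" as handled above.
convert_checkpoint(
    input_path="/checkpoints/model_final.torch",
    output_path="/checkpoints/model_final_consolidated.torch",
    output_type="consolidated",
)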
def get_tensorboard_dir(cfg):
    """
    Get the output directory where the tensorboard events will be written.

    Args:
        cfg (AttrDict): User specified config file containing the settings
            for the tensorboard as well, like log directory, logging
            frequency etc

    Returns:
        tensorboard_dir (str): output directory path
    """
    checkpoint_folder = get_checkpoint_folder(cfg)
    tensorboard_dir = f"{checkpoint_folder}/tb_logs"
    logging.info(f"Tensorboard dir: {tensorboard_dir}")
    makedir(tensorboard_dir)
    return tensorboard_dir
def get_checkpoint_folder(config: AttrDict):
    """
    Check, create and return the checkpoint folder. Users can specify their
    own checkpoint directory, otherwise the default "." is used.

    Optionally, for trainings that involve more than 1 machine, we allow
    appending the distributed run id, which helps to uniquely identify the
    training. This is completely optional and users can set
    APPEND_DISTR_RUN_ID=true for this.
    """
    odir = config.CHECKPOINT.DIR
    if config.DISTRIBUTED.NUM_NODES > 1 and config.CHECKPOINT.APPEND_DISTR_RUN_ID:
        odir = f"{odir}/{config.DISTRIBUTED.RUN_ID}"
    makedir(odir)
    assert PathManager.exists(
        config.CHECKPOINT.DIR
    ), f"Please specify config.CHECKPOINT.DIR parameter. Invalid: {config.CHECKPOINT.DIR}"
    return odir
def launch_distributed_on_slurm(cfg: AttrDict, engine_name: str):
    """
    Launch a distributed training on SLURM, allocating the nodes and GPUs
    as described in the configuration, and calling the function
    "launch_on_local_node" appropriately on each of the nodes.

    Args:
        cfg (AttrDict): the configuration of the experiment
        engine_name (str): the name of the engine to run
            (train or extract_features)
    """
    import submitit

    # setup the log folder
    log_folder = cfg.SLURM.LOG_FOLDER
    makedir(log_folder)
    assert g_pathmgr.exists(
        log_folder
    ), f"Specified config.SLURM.LOG_FOLDER={log_folder} doesn't exist"
    assert cfg.SLURM.PARTITION, "SLURM.PARTITION must be set when using SLURM"

    executor = submitit.AutoExecutor(folder=log_folder)
    timeout_min = cfg.SLURM.TIME_HOURS * 60 + cfg.SLURM.TIME_MINUTES
    executor.update_parameters(
        name=cfg.SLURM.NAME,
        slurm_comment=cfg.SLURM.COMMENT,
        slurm_partition=cfg.SLURM.PARTITION,
        slurm_constraint=cfg.SLURM.CONSTRAINT,
        timeout_min=timeout_min,
        nodes=cfg.DISTRIBUTED.NUM_NODES,
        cpus_per_task=cfg.SLURM.NUM_CPU_PER_PROC * cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        tasks_per_node=1,
        gpus_per_node=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        mem_gb=cfg.SLURM.MEM_GB,
        slurm_additional_parameters=cfg.SLURM.ADDITIONAL_PARAMETERS,
    )
    trainer = _ResumableSlurmJob(engine_name=engine_name, config=cfg)
    job = executor.submit(trainer)
    print(f"SUBMITTED: {job.job_id}")
    return job
def save_slice(cls, checkpoint_path: str, param_path: str, param) -> str:
    """
    Save a slice of the model: a parameter and its associated weights
    - create a folder in which the slice will live
    - save the slice in this folder, with a unique name
    - return the created file name
    """
    checkpoint_sub_folder = os.path.splitext(checkpoint_path)[0] + "_layers"
    makedir(checkpoint_sub_folder)
    hash_name = hashlib.sha1(param_path.encode()).hexdigest()
    file_path = os.path.join(checkpoint_sub_folder, f"{hash_name}.torch")
    file_path = abspath(file_path)
    checkpoint_slice = {"type": CheckpointItemType.slice.name, "weight": param}
    with g_pathmgr.open(file_path, "wb") as f:
        torch.save(checkpoint_slice, f)
    return file_path
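# A minimal sketch of reading a slice back, mirroring the dict layout written
# by save_slice above. `slice_path` is a placeholder for a path previously
# returned by save_slice.
with g_pathmgr.open(slice_path, "rb") as f:
    checkpoint_slice = torch.load(f)
assert checkpoint_slice["type"] == CheckpointItemType.slice.name
weights = checkpoint_slice["weight"]  # the tensor saved for this parameter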
def setup_logging(name, output_dir=None, rank=0):
    """
    Setup various logging streams: stdout and file handlers.
    For file handlers, we only set up on the master gpu.
    """
    # get the filename if we want to log to the file as well
    log_filename = None
    if output_dir:
        makedir(output_dir)
        if rank == 0:
            log_filename = f"{output_dir}/log.txt"

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # create formatter
    FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)4d: %(message)s"
    formatter = logging.Formatter(FORMAT)

    # clean up any pre-existing handlers
    for h in logger.handlers:
        logger.removeHandler(h)
    logger.root.handlers = []

    # setup the console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # we log to file as well if user wants
    if log_filename and rank == 0:
        file_handler = logging.StreamHandler(_cached_log_stream(log_filename))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    logging.root = logger
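# A minimal usage sketch for setup_logging. The output directory is a
# placeholder; on rank 0 this also appends to <output_dir>/log.txt via the
# file handler set up above.
setup_logging(__name__, output_dir="/tmp/experiment_logs", rank=0)
logging.info("This goes to stdout and, on rank 0, to the log file.")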
def extract_train_features(
    cfg,
    temp_dir,
    train_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    train_dataset,
    model,
):
    train_features = []

    def process_train_image(i, out_dir, verbose=False):
        if i % LOG_FREQUENCY == 0:
            logging.info(f"Train Image: {i}")

        fname_out = None
        if out_dir:
            fname_out = f"{out_dir}/{i}.npy"

        if fname_out and g_pathmgr.exists(fname_out):
            feat = load_file(fname_out)
            train_features.append(feat)
        else:
            with PerfTimer("read_sample", PERF_STATS):
                fname_in = train_dataset.get_filename(i)
                if is_revisited_dataset(train_dataset_name):
                    img = image_helper.load_and_prepare_revisited_image(
                        fname_in, roi=None
                    )
                elif is_whiten_dataset(train_dataset_name):
                    img = image_helper.load_and_prepare_whitening_image(fname_in)
                else:
                    img = image_helper.load_and_prepare_image(fname_in, roi=None)

            with PerfTimer("extract_features", PERF_STATS):
                img_scalings = cfg.IMG_RETRIEVAL.IMG_SCALINGS or [1]
                activation_maps = extract_activation_maps(img, model, img_scalings)

            if verbose:
                print(
                    f"Example train Image raw activation map shape: {activation_maps[0].shape}"  # NOQA
                )

            with PerfTimer("post_process_features", PERF_STATS):
                # once we have the features,
                # we can perform: rmac | gem pooling | l2 norm
                if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "rmac":
                    descriptors = get_rmac_descriptors(
                        activation_maps[0],
                        spatial_levels,
                        normalize=cfg.IMG_RETRIEVAL.NORMALIZE_FEATURES,
                    )
                elif cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
                    descriptors = get_average_gem(
                        activation_maps,
                        p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
                        add_bias=True,
                    )
                else:
                    descriptors = torch.mean(torch.stack(activation_maps), dim=0)
                    descriptors = descriptors.reshape(descriptors.shape[0], -1)

                # Optionally l2 normalize the features.
                if (
                    cfg.IMG_RETRIEVAL.NORMALIZE_FEATURES
                    and cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE != "rmac"
                ):
                    # RMAC performs normalization within the algorithm,
                    # hence we skip it here.
                    descriptors = l2n(descriptors, dim=1)

                if fname_out:
                    save_file(descriptors.data.numpy(), fname_out, verbose=False)
                train_features.append(descriptors.data.numpy())

    num_images = train_dataset.get_num_images()
    out_dir = None
    if temp_dir:
        out_dir = f"{temp_dir}/{train_dataset_name}_S{resize_img}_features_train"
        makedir(out_dir)

    logging.info(f"Getting features for train images: {num_images}")
    for i in range(num_images):
        process_train_image(i, out_dir, verbose=(i == 0))

    train_features = np.vstack([x.reshape(-1, x.shape[-1]) for x in train_features])
    logging.info(f"Train features size: {train_features.shape}")
    return train_features
def get_output_dir():
    curr_folder = os.path.abspath(".")
    datasets_dir = f"{curr_folder}/datasets"
    logging.info(f"Datasets dir: {datasets_dir}")
    makedir(datasets_dir)
    return datasets_dir
def get_train_features(
    cfg,
    temp_dir,
    train_dataset_name,
    resize_img,
    spatial_levels,
    image_helper,
    train_dataset,
    model,
):
    train_features = []

    def process_train_image(i, out_dir, verbose=False):
        if i % LOG_FREQUENCY == 0:
            logging.info(f"Train Image: {i}")

        fname_out = None
        if out_dir:
            fname_out = f"{out_dir}/{i}.npy"

        if fname_out and PathManager.exists(fname_out):
            feat = load_file(fname_out)
            train_features.append(feat)
        else:
            fname_in = train_dataset.get_filename(i)
            if is_revisited_dataset(train_dataset_name):
                img = image_helper.load_and_prepare_revisited_image(
                    fname_in, roi=None
                )
            elif is_whiten_dataset(train_dataset_name):
                img = image_helper.load_and_prepare_whitening_image(fname_in)
            else:
                img = image_helper.load_and_prepare_image(fname_in, roi=None)

            v = torch.autograd.Variable(img.unsqueeze(0))
            vc = v.cuda()
            # the model output is always a list
            activation_map = model(vc)[0].cpu()

            if verbose:
                print(f"Train Image raw activation map shape: {activation_map.shape}")

            # once we have the features,
            # we can perform: rmac | gem pooling | l2 norm
            if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "rmac":
                descriptors = get_rmac_descriptors(
                    activation_map,
                    spatial_levels,
                    normalize=cfg.IMG_RETRIEVAL.NORMALIZE_FEATURES,
                )
            elif cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
                descriptors = gem(
                    activation_map,
                    p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
                    add_bias=True,
                )
            else:
                descriptors = activation_map

            # Optionally l2 normalize the features.
            if (
                cfg.IMG_RETRIEVAL.NORMALIZE_FEATURES
                and cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE != "rmac"
            ):
                # RMAC performs normalization within the algorithm,
                # hence we skip it here.
                descriptors = l2n(descriptors, dim=1)

            if fname_out:
                save_file(descriptors.data.numpy(), fname_out, verbose=False)
            train_features.append(descriptors.data.numpy())

    num_images = train_dataset.get_num_images()
    out_dir = None
    if temp_dir:
        out_dir = f"{temp_dir}/{train_dataset_name}_S{resize_img}_features_train"
        makedir(out_dir)

    logging.info(f"Getting features for train images: {num_images}")
    for i in range(num_images):
        process_train_image(i, out_dir, verbose=(i == 0))

    train_features = np.vstack([x.reshape(-1, x.shape[-1]) for x in train_features])
    logging.info(f"Train features size: {train_features.shape}")
    return train_features