def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info):
    files = []
    # scan through the directory
    cities = PathManager.ls(image_dir)
    logger.info(f"{len(cities)} cities found in '{image_dir}'.")
    image_dict = {}
    for city in cities:
        city_img_dir = os.path.join(image_dir, city)
        for basename in PathManager.ls(city_img_dir):
            image_file = os.path.join(city_img_dir, basename)

            suffix = "_leftImg8bit.png"
            assert basename.endswith(suffix), basename
            basename = os.path.basename(basename)[: -len(suffix)]

            image_dict[basename] = image_file

    for ann in json_info["annotations"]:
        image_file = image_dict.get(ann["image_id"], None)
        assert image_file is not None, "No image {} found for annotation {}".format(
            ann["image_id"], ann["file_name"]
        )
        label_file = os.path.join(gt_dir, ann["file_name"])
        segments_info = ann["segments_info"]
        files.append((image_file, label_file, segments_info))

    assert len(files), "No images found in {}".format(image_dir)
    assert PathManager.isfile(files[0][0]), files[0][0]
    assert PathManager.isfile(files[0][1]), files[0][1]
    return files
def get_cityscapes_files(image_dir, gt_dir):
    files = []
    # scan through the directory
    cities = PathManager.ls(image_dir)
    logger.info(f"{len(cities)} cities found in '{image_dir}'.")
    for city in cities:
        city_img_dir = os.path.join(image_dir, city)
        city_gt_dir = os.path.join(gt_dir, city)
        for basename in PathManager.ls(city_img_dir):
            image_file = os.path.join(city_img_dir, basename)

            suffix = "leftImg8bit.png"
            assert basename.endswith(suffix)
            basename = basename[: -len(suffix)]

            instance_file = os.path.join(city_gt_dir, basename + "gtFine_instanceIds.png")
            label_file = os.path.join(city_gt_dir, basename + "gtFine_labelIds.png")
            json_file = os.path.join(city_gt_dir, basename + "gtFine_polygons.json")

            files.append((image_file, instance_file, label_file, json_file))
    assert len(files), "No images found in {}".format(image_dir)
    for f in files[0]:
        assert PathManager.isfile(f), f
    return files
def merge(self):
    """Merge all clip features of a video into one (or several) fixed-size matrices."""
    if not PathManager.exists(self.merge_dir):
        PathManager.mkdirs(self.merge_dir)
    for video_name in PathManager.ls(self.save_dir):
        video_dir = os.path.join(self.save_dir, video_name)
        num_feats = len(PathManager.ls(video_dir))
        if self.min_length <= num_feats <= self.max_length:
            merged_feat = torch.zeros((num_feats, self.dim), dtype=torch.float32)
            for clip_idx in range(num_feats):
                feat = torch.load(os.path.join(video_dir, f'{clip_idx}.pth'))
                merged_feat[clip_idx, :] = torch.from_numpy(feat)
            torch.save(merged_feat, os.path.join(self.merge_dir, f'{video_name}.pth'))
        else:
            # TODO: handle videos whose clip count falls outside [min_length, max_length]
            print(video_name)
def test_bad_args(self) -> None:
    with self.assertRaises(NotImplementedError):
        PathManager.copy(
            self._remote_uri, self._remote_uri, foo="foo"  # type: ignore
        )
    with self.assertRaises(NotImplementedError):
        PathManager.exists(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.get_local_path(
            self._remote_uri, foo="foo"  # type: ignore
        )
    with self.assertRaises(NotImplementedError):
        PathManager.isdir(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.isfile(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.ls(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.mkdirs(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.open(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.rm(self._remote_uri, foo="foo")  # type: ignore

    PathManager.set_strict_kwargs_checking(False)

    PathManager.get_local_path(self._remote_uri, foo="foo")  # type: ignore
    f = PathManager.open(self._remote_uri, foo="foo")  # type: ignore
    f.close()
    PathManager.set_strict_kwargs_checking(True)
def load_pan_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
    # We match input images with ground truth based on their relative filepaths (without file
    # extensions) starting from 'image_root' and 'gt_root' respectively.
    def file2id(folder_path, file_path):
        # extract relative path starting from `folder_path`
        image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
        # remove file extension
        image_id = os.path.splitext(image_id)[0]
        return image_id

    input_files = sorted(
        (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
        key=lambda file_path: file2id(image_root, file_path),
    )
    gt_files = sorted(
        (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
        key=lambda file_path: file2id(gt_root, file_path),
    )

    assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)

    # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
    if len(input_files) != len(gt_files):
        logger.warn(
            "Directory {} and {} has {} and {} files, respectively.".format(
                image_root, gt_root, len(input_files), len(gt_files)
            )
        )
        input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
        gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
        intersect = list(set(input_basenames) & set(gt_basenames))
        # sort, otherwise each worker may obtain a list[dict] in different order
        intersect = sorted(intersect)
        logger.warn("Will use their intersection of {} files.".format(len(intersect)))
        input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
        gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]

    logger.info(
        "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root)
    )

    dataset_dicts = []
    for (img_path, gt_path) in zip(input_files, gt_files):
        record = {}
        record["file_name"] = img_path
        record["pan_seg_file_name"] = gt_path
        dataset_dicts.append(record)

    return dataset_dicts
def get_shard_file_names(
    input_dir: str, split: str, layer: str
) -> List[ExtractedFeaturesShardPaths]:
    """
    Get the list of files needed to load the extracted features.
    """
    # List all the files that contain the features for a given
    # dataset split and a given layer
    feature_regex = re.compile(rf"(.*)_{split}_{layer}_features.npy")
    prefixes = []
    for file_path in PathManager.ls(input_dir):
        match = feature_regex.match(file_path)
        if match is not None:
            prefixes.append(match.group(1))

    # Yield all the files needed to merge the features dumped on
    # the different GPUs
    shard_paths = []
    for prefix in prefixes:
        feat_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_features.npy")
        targets_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_targets.npy")
        indices_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_inds.npy")
        shard_paths.append(
            ExtractedFeaturesShardPaths(
                feature_file=feat_file,
                targets_file=targets_file,
                indices_file=indices_file,
            )
        )
    return shard_paths
def get_filelist_labels_images_paths(input_path):
    dataset_summary, metadata = {}, {}
    img_paths, gender_labels, race_labels, age_labels = [], [], [], []
    inp_image_names = PathManager.ls(input_path)
    print(f"{len(inp_image_names)} images found.")
    total_examples = 0
    # Populate img_paths and the labels based on the folder file structure.
    for img_name in inp_image_names:
        if not img_name.endswith(".jpg"):
            continue
        img_path = os.path.join(input_path, img_name)
        img_paths.append(img_path)
        img_age = int(str(img_name).split("_")[0])
        img_gender = GENDER_MAPPING[int(str(img_name).split("_")[1])]
        img_race = RACE_MAPPING[int(str(img_name).split("_")[2])]
        age_labels.append(img_age)
        gender_labels.append(img_gender)
        race_labels.append(img_race)
        metadata[img_name] = {
            "age": img_age,
            "gender": img_gender,
            "race": img_race,
        }
        total_examples += 1

    # print the dataset summary
    print(f"Dataset has {total_examples} images")
    dataset_summary["num_images"] = total_examples
    return dataset_summary, metadata, img_paths, age_labels, gender_labels, race_labels
def test_bad_args(self) -> None:
    # TODO (T58240718): Replace with dynamic checks
    with self.assertRaises(ValueError):
        PathManager.copy(
            self._tmpfile, self._tmpfile, foo="foo"  # type: ignore
        )
    with self.assertRaises(ValueError):
        PathManager.exists(self._tmpfile, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.get_local_path(self._tmpfile, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.isdir(self._tmpfile, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.isfile(self._tmpfile, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.ls(self._tmpfile, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.mkdirs(self._tmpfile, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.open(self._tmpfile, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.rm(self._tmpfile, foo="foo")  # type: ignore

    PathManager.set_strict_kwargs_checking(False)

    PathManager.copy(
        self._tmpfile, self._tmpfile, foo="foo"  # type: ignore
    )
    PathManager.exists(self._tmpfile, foo="foo")  # type: ignore
    PathManager.get_local_path(self._tmpfile, foo="foo")  # type: ignore
    PathManager.isdir(self._tmpfile, foo="foo")  # type: ignore
    PathManager.isfile(self._tmpfile, foo="foo")  # type: ignore
    PathManager.ls(self._tmpdir, foo="foo")  # type: ignore
    PathManager.mkdirs(self._tmpdir, foo="foo")  # type: ignore
    f = PathManager.open(self._tmpfile, foo="foo")  # type: ignore
    f.close()
    # pyre-ignore
    with open(os.path.join(self._tmpdir, "test_rm.txt"), "w") as f:
        rm_file = f.name
        f.write(self._tmpfile_contents)
        f.flush()
    PathManager.rm(rm_file, foo="foo")  # type: ignore
def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
    def file2id(folder_path, file_path):
        image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
        image_id = os.path.splitext(image_id)[0]
        return image_id

    input_files = sorted(
        (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
        key=lambda file_path: file2id(image_root, file_path),
    )
    gt_files = sorted(
        (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
        key=lambda file_path: file2id(gt_root, file_path),
    )
    assert len(gt_files) > 0, f"No annotations found in {gt_root}."

    if len(input_files) != len(gt_files):
        logger.warn(
            "Directory {} and {} has {} and {} files, respectively.".format(
                image_root, gt_root, len(input_files), len(gt_files)
            )
        )
        input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
        gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
        intersect = list(set(input_basenames) & set(gt_basenames))
        intersect = sorted(intersect)
        logger.warn(f"Will use their intersection of {len(intersect)} files.")
        input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
        gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]

    logger.info(
        f"Loaded {len(input_files)} images with semantic segmentation from {image_root}"
    )

    dataset_dicts = []
    for (img_path, gt_path) in zip(input_files, gt_files):
        record = {}
        record["file_name"] = img_path
        record["sem_seg_file_name"] = gt_path
        dataset_dicts.append(record)
    return dataset_dicts
def bias_pascal_voc(dirname: str, noise_ratio: float, bias_rule: Dict[str, str]):
    """
    Add noise to Pascal VOC detection annotations.

    Args:
        dirname: Contains "Annotations", "ImageSets", "JPEGImages"
        noise_ratio: Noise ratio of biased annotations
        bias_rule: Asymmetric mislabel rules between classes
    """
    annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
    annotation_files = np.array(PathManager.ls(annotation_dirname))
    num_biased_files = round(len(annotation_files) * noise_ratio)
    np.random.shuffle(annotation_files)
    biased_files = set(annotation_files[:num_biased_files])
    bias_stats = dict.fromkeys(["total", "mislabeled", "skipped"], 0)

    for filename in tqdm(annotation_files):
        anno_file = os.path.join(annotation_dirname, filename)
        with PathManager.open(anno_file) as f:
            tree = ET.parse(f)
        instances = tree.findall("object")
        num_instances = len(instances)
        bias_stats["total"] += num_instances

        if filename in biased_files:
            num_mislabeled = round(num_instances * 0.7)
            np.random.shuffle(instances)
            biased_instances = instances[:num_mislabeled]
            for instance in biased_instances:
                cls_name = instance.find("name")
                if cls_name.text in bias_rule.keys():
                    biased_cls_name = bias_rule[cls_name.text]
                    cls_name.text = biased_cls_name
                    mislabel_attr = ET.SubElement(instance, "mislabeled")
                    mislabel_attr.text = "1"
                    bias_stats["mislabeled"] += 1
                else:
                    tree.getroot().remove(instance)
                    bias_stats["skipped"] += 1
            tree.write(anno_file)

    return bias_stats
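# A minimal usage sketch for bias_pascal_voc above. The dataset path and the bias rule
# are illustrative assumptions, not values taken from the original code: the rule simply
# maps a source class name to the class it should be mislabeled as.
example_bias_rule = {"cat": "dog", "bicycle": "motorbike"}  # hypothetical asymmetric mapping
stats = bias_pascal_voc(
    dirname="datasets/VOC2007",  # assumed layout: contains "Annotations", "ImageSets", "JPEGImages"
    noise_ratio=0.2,             # corrupt roughly 20% of the annotation files
    bias_rule=example_bias_rule,
)
print(stats)  # dict with "total", "mislabeled", "skipped" counts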
def has_checkpoint(path_to_job):
    """
    Determines if the given directory contains a checkpoint.
    Args:
        path_to_job (string): the path to the folder of the current job.
    """
    d = get_checkpoint_dir(path_to_job)
    files = PathManager.ls(d) if PathManager.exists(d) else []
    return any("checkpoint" in f for f in files)
def has_checkpoint(path_to_checkpoint):
    """
    Check whether a checkpoint exists.
    :param path_to_checkpoint: path to the checkpoint folder.
    :return: True if any checkpoint file is found.
    """
    d = get_checkpoint_dir(path_to_checkpoint)
    files = PathManager.ls(d) if PathManager.exists(d) else []
    return any("checkpoint" in f for f in files)
def _get_filenames(self, data_path: str):
    fnames = []
    for fname in sorted(PathManager.ls(data_path)):
        # Only put images in fnames.
        if not fname.endswith(".jpg"):
            continue
        full_fname = os.path.join(data_path, fname)
        fnames.append(full_fname)
    return np.array(fnames)
def get_checkpoint_resume_files(
    checkpoint_folder: str,
    config: AttrDict,
    skip_final: bool = False,
    latest_checkpoint_resume_num: int = 1,
):
    """
    Get the checkpoint file from which the model should be resumed. We look at all
    the checkpoints in the checkpoint_folder; if the final model checkpoint exists
    (starts with `model_final_`) and we are not skipping it, we return the final
    checkpoint. Otherwise we find the latest checkpoint.

    Args:
        checkpoint_folder (str): path to the checkpoint folder.
        config (AttrDict): root config
        skip_final (bool): whether the final model checkpoint should be skipped or not
        latest_checkpoint_resume_num (int): which Nth latest checkpoint to resume from.
            Sometimes the latest checkpoints could be corrupt, so this option helps to
            resume from a checkpoint a few steps before the last one.
    """
    all_files = PathManager.ls(checkpoint_folder)
    all_iters = []
    replace_prefix = "model_phase"
    # if we checkpoint at iterations too, we start from an iteration checkpoint
    # since that is later than the phase-end checkpoint. Sometimes, it's also
    # possible that there is no phase.
    if config.CHECKPOINT.CHECKPOINT_ITER_FREQUENCY > 0:
        replace_prefix = "model_iteration"

    for f in all_files:
        # if we have finished training, we pick the finished-training file:
        # the checkpoint is saved as "model_final_checkpoint". Otherwise, we pick
        # the latest phase checkpoint.
        if "model_final" in f and not skip_final:
            return f
        if replace_prefix in f:
            iter_num = f.replace(".torch", "").replace(replace_prefix, "")
            if iter_num.isdigit():
                all_iters.append(int(iter_num))

    # make sure the checkpoint resume number is in bounds
    checkpoint_resume_num = max(0, latest_checkpoint_resume_num - 1)
    # len(all_iters) - 1 is the last index; checkpoint_resume_num can't be beyond that.
    checkpoint_resume_num = min(len(all_iters) - 1, checkpoint_resume_num)
    logging.info(f"checkpoint_resume_num: {checkpoint_resume_num}")
    if len(all_iters) > 0:
        all_iters.sort(reverse=True)
        last_iter = int(all_iters[checkpoint_resume_num])
        filename = f"{replace_prefix}{last_iter}.torch"
        return filename
    else:
        return None
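# Hedged usage sketch for get_checkpoint_resume_files above: resuming from the second-latest
# checkpoint to skip a possibly corrupt last file. `cfg` is assumed to be the loaded root
# AttrDict config and "./checkpoints" is a hypothetical checkpoint folder.
import os

filename = get_checkpoint_resume_files(
    "./checkpoints",
    cfg,
    skip_final=False,
    latest_checkpoint_resume_num=2,
)
if filename is not None:
    checkpoint_path = os.path.join("./checkpoints", filename)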
def get_all_checkpoint_files(self) -> List[str]:
    """
    Returns:
        list: All available checkpoint files (.pth files) in target directory.
    """
    all_model_checkpoints = [
        os.path.join(self.save_dir, file)
        for file in PathManager.ls(self.save_dir)
        if PathManager.isfile(os.path.join(self.save_dir, file)) and file.endswith(".pth")
    ]
    return all_model_checkpoints
def test_ls(self):
    # Create some files in the tempdir to ls out.
    root_dir = os.path.join(self._tmpdir, "ls")
    os.makedirs(root_dir, exist_ok=True)
    files = sorted(["foo.txt", "bar.txt", "baz.txt"])
    for f in files:
        open(os.path.join(root_dir, f), "a").close()

    children = sorted(PathManager.ls(root_dir))
    self.assertListEqual(children, files)

    # Cleanup the tempdir
    shutil.rmtree(root_dir)
def get_last_checkpoint(path_to_job):
    """
    Get the last checkpoint from the checkpointing folder.
    Args:
        path_to_job (string): the path to the folder of the current job.
    """
    d = get_checkpoint_dir(path_to_job)
    names = PathManager.ls(d) if PathManager.exists(d) else []
    names = [f for f in names if "checkpoint" in f]
    assert len(names), "No checkpoints found in '{}'.".format(d)
    # Sort the checkpoints by epoch.
    name = sorted(names)[-1]
    return os.path.join(d, name)
def get_special_checkpoint(path_to_checkpoint, special_epoch):
    """
    Get the checkpoint saved at a specific epoch.
    :param path_to_checkpoint: path to the checkpoint folder.
    :param special_epoch: the epoch whose checkpoint should be loaded.
    :return: full path to the matching checkpoint file.
    """
    d = get_checkpoint_dir(path_to_checkpoint)
    names = PathManager.ls(d) if PathManager.exists(d) else []
    special_name = "checkpoint_epoch_{:05d}.pyth".format(special_epoch)
    names = [f for f in names if special_name in f]
    name = names[0]
    logger.info("load model in special epoch : {}".format(os.path.join(d, name)))
    return os.path.join(d, name)
def has_final_checkpoint(
    checkpoint_folder: str, final_checkpoint_pattern: str = "model_final"
):
    """
    Check whether the final checkpoint exists in the checkpoint folder. The
    final checkpoint is recognized by the prefix "model_final_" in VISSL.

    Args:
        checkpoint_folder (str): path to the checkpoint folder.
        final_checkpoint_pattern (str): what prefix is used to save the final checkpoint.

    Returns:
        has_final_checkpoint: whether the final checkpoint exists or not
    """
    checkpointed_files = PathManager.ls(checkpoint_folder)
    torch_files = filter(lambda x: x.endswith(".torch"), checkpointed_files)
    final_files = filter(lambda x: final_checkpoint_pattern in x, torch_files)
    return len(list(final_files)) > 0
def has_checkpoint(checkpoint_folder: str, skip_final: bool = False):
    """
    Check whether there are any checkpoints at all in the checkpoint folder.

    Args:
        checkpoint_folder (str): path to the checkpoint folder
        skip_final (bool): if a checkpoint with the `model_final_` prefix exists,
            whether to skip it and train.

    Returns:
        checkpoint_exists (bool): whether checkpoint exists or not
    """
    checkpointed_files = PathManager.ls(checkpoint_folder)
    checkpoint_exists = False
    for f in checkpointed_files:
        if f.endswith(".torch") and ("model_final" not in f or not skip_final):
            checkpoint_exists = True
            break
    return checkpoint_exists
def _construct_loader(self):
    """
    Construct the video loader.
    """
    # TODO: merge into the annotation file, and modify the action duration.
    with open(os.path.join(self.cfg.DATA.PATH_TO_DATA_DIR, "duration.json")) as f:
        duration_dict = json.load(f)
    self.raw_video_path = os.path.join(self.cfg.DATA.PATH_TO_DATA_DIR, 'raw')
    _video_names = PathManager.ls(self.raw_video_path)
    _video_durations = [duration_dict[k] for k in _video_names]

    self._video_names = list()
    self._clip_idx = list()
    self._video_durations = list()
    for i, duration in enumerate(_video_durations):
        if duration >= self._clip_size:
            _num_clips = int(duration - self._clip_size) + 1
            for j in range(_num_clips):
                self._video_names.append(_video_names[i])
                self._clip_idx.append(j)
                self._video_durations.append(duration)
    return parser


if __name__ == "__main__":
    """
    Example usage:

    buck-out/gen/deeplearning/projects/ssl_framework/extra_scripts/fb/convert_folder_to_filelist.par \  # NOQA
        -i "manifold://ssl_framework/tree/datasets/food_101/" \
        -o "manifold://ssl_framework/tree/datasets/food_101/"
    """
    args = get_argument_parser().parse_args()
    setup_path_manager()

    splits = PathManager.ls(args.input)
    print(f"The following splits are found: { ','.join(splits) }")
    dataset_summary = {}
    for split in ["train", "trainval", "val", "test"]:
        if not PathManager.exists(os.path.join(args.input, split)):
            continue
        dataset_summary[split] = {}
        img_paths = []
        img_labels = []
        split_path = os.path.join(args.input, split)
        label_paths = PathManager.ls(split_path)
        dataset_summary[split]["labels"] = label_paths
def load(self, num_samples=None):
    """
    Load the data ground truth and parse the data so it's ready to be used.
    """
    # Load the dataset GT
    self.lab_root = f"{self.path}/lab/"
    self.img_root = f"{self.path}/jpg/"
    logging.info(f"Loading data: {self.path}")
    lab_filenames = np.sort(PathManager.ls(self.lab_root))
    # Get the filenames without the extension
    self.img_filenames = [
        e[:-4]
        for e in np.sort(PathManager.ls(self.img_root))
        if e[:-4] not in self.blacklisted
    ]

    # Parse the label files. Some challenges as filenames do not correspond
    # exactly to query names. Go through all the labels to:
    # i) map names to filenames and vice versa
    # ii) get the relevant regions of interest of the queries,
    # iii) get the indexes of the dataset images that are queries
    # iv) get the relevants / non-relevants list
    self.relevants = {}
    self.junk = {}
    self.non_relevants = {}

    self.filename_to_name = {}
    self.name_to_filename = OrderedDict()
    self.q_roi = {}
    for e in lab_filenames:
        if e.endswith("_query.txt"):
            q_name = e[: -len("_query.txt")]
            with PathManager.open(f"{self.lab_root}/{e}") as fopen:
                q_data = fopen.readline().split(" ")
            if q_data[0].startswith("oxc1_"):
                q_filename = q_data[0][5:]
            else:
                q_filename = q_data[0]
            self.filename_to_name[q_filename] = q_name
            self.name_to_filename[q_name] = q_filename
            with PathManager.open(f"{self.lab_root}/{q_name}_ok.txt") as fopen:
                good = {e.strip() for e in fopen}
            with PathManager.open(f"{self.lab_root}/{q_name}_good.txt") as fopen:
                good = good.union({e.strip() for e in fopen})
            with PathManager.open(f"{self.lab_root}/{q_name}_junk.txt") as fopen:
                junk = {e.strip() for e in fopen}
            good_plus_junk = good.union(junk)
            self.relevants[q_name] = [
                i for i in range(len(self.img_filenames)) if self.img_filenames[i] in good
            ]
            self.junk[q_name] = [
                i for i in range(len(self.img_filenames)) if self.img_filenames[i] in junk
            ]
            self.non_relevants[q_name] = [
                i
                for i in range(len(self.img_filenames))
                if self.img_filenames[i] not in good_plus_junk
            ]
            self.q_roi[q_name] = np.array(
                [float(q) for q in q_data[1:]], dtype=np.float32
            )

    self.q_names = list(self.name_to_filename.keys())
    self.q_index = np.array(
        [self.img_filenames.index(self.name_to_filename[qn]) for qn in self.q_names]
    )

    self.N_images = len(self.img_filenames)
    self.N_queries = len(self.q_index)

    if num_samples is not None:
        self.N_queries = min(self.N_queries, num_samples)
        self.N_images = min(self.N_images, num_samples)
def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
    """
    Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are
    treated as ground truth annotations and all files under "image_root" with "image_ext" extension
    as input images. Ground truth and input images are matched using file paths relative to
    "gt_root" and "image_root" respectively without taking into account file extensions.
    This works for COCO as well as some other datasets.

    Args:
        gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation
            annotations are stored as images with integer values in pixels that represent
            corresponding semantic labels.
        image_root (str): the directory where the input images are.
        gt_ext (str): file extension for ground truth annotations.
        image_ext (str): file extension for input images.

    Returns:
        list[dict]: a list of dicts in detectron2 standard format without instance-level
            annotation.

    Notes:
        1. This function does not read the image and ground truth files.
           The results do not have the "image" and "sem_seg" fields.
    """
    # We match input images with ground truth based on their relative filepaths (without file
    # extensions) starting from 'image_root' and 'gt_root' respectively.
    def file2id(folder_path, file_path):
        # extract relative path starting from `folder_path`
        image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
        # remove file extension
        image_id = os.path.splitext(image_id)[0]
        return image_id

    input_files = sorted(
        (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
        key=lambda file_path: file2id(image_root, file_path),
    )
    gt_files = sorted(
        (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
        key=lambda file_path: file2id(gt_root, file_path),
    )

    assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)

    # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
    if len(input_files) != len(gt_files):
        logger.warn(
            "Directory {} and {} has {} and {} files, respectively.".format(
                image_root, gt_root, len(input_files), len(gt_files)
            )
        )
        input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
        gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
        intersect = list(set(input_basenames) & set(gt_basenames))
        # sort, otherwise each worker may obtain a list[dict] in different order
        intersect = sorted(intersect)
        logger.warn("Will use their intersection of {} files.".format(len(intersect)))
        input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
        gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]

    logger.info(
        "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root)
    )

    dataset_dicts = []
    for (img_path, gt_path) in zip(input_files, gt_files):
        record = {}
        record["file_name"] = img_path
        record["sem_seg_file_name"] = gt_path
        dataset_dicts.append(record)

    return dataset_dicts
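# A small usage sketch for load_sem_seg above. The directory paths are hypothetical
# COCO-style stuff-segmentation folders; the function only pairs image and label paths,
# it does not read any pixels.
dicts = load_sem_seg(
    gt_root="datasets/coco/stuffthingmaps/val2017",  # assumed path to PNG label maps
    image_root="datasets/coco/val2017",              # assumed path to JPG images
    gt_ext="png",
    image_ext="jpg",
)
# Each record only carries file paths: {"file_name": ..., "sem_seg_file_name": ...}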
def _get_lightning_checkpoints(path: str):
    return [
        os.path.join(path, x)
        for x in PathManager.ls(path)
        if x.endswith(ModelCheckpoint.FILE_EXTENSION)
        and not x.startswith(ModelCheckpoint.CHECKPOINT_NAME_LAST)
    ]
def test_PathManager(self):
    x = LazyPath(lambda: "./")
    output = PathManager.ls(x)
    output_gt = PathManager.ls("./")
    self.assertEqual(sorted(output), sorted(output_gt))
def ls(path: str) -> List[str]:
    if FVCorePathManager:
        return FVCorePathManager.ls(path)
    return os.listdir(path)
def merge_features(input_dir: str, split: str, layer: str):
    """
    For multi-gpu feature extraction, each gpu saves features corresponding to its
    share of the data. We can merge the features across all gpus to get the features
    for the full data.

    The features are saved along with the data indexes and labels. The data indexes
    can be used to sort the data and ensure uniqueness.

    We organize the features and targets corresponding to the data index of each
    feature, ensure uniqueness, and return.

    Args:
        input_dir (str): input path where the features are dumped
        split (str): whether the features are train or test data features
        layer (str): the features correspond to what layer of the model

    Returns:
        output (Dict): contains features, targets, inds as the keys
    """
    logging.info(f"Merging features: {split} {layer}")
    feature_regex = re.compile(rf"(.*)_{split}_{layer}_features.npy")

    # List all the files that contain the features for a given
    # dataset split and a given layer
    prefixes = []
    for file_path in PathManager.ls(input_dir):
        match = feature_regex.match(file_path)
        if match is not None:
            prefixes.append(match.group(1))

    # Reassemble each feature shard (dumped by a given rank)
    output_feats, output_targets = {}, {}
    for prefix in prefixes:
        feat_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_features.npy")
        targets_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_targets.npy")
        inds_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_inds.npy")
        logging.info(f"Loading:\n{feat_file}\n{targets_file}\n{inds_file}")
        feats = load_file(feat_file)
        targets = load_file(targets_file)
        indices = load_file(inds_file)
        num_samples = feats.shape[0]
        for idx in range(num_samples):
            index = indices[idx]
            if index not in output_feats:
                output_feats[index] = feats[idx]
                output_targets[index] = targets[idx]

    # Sort the entries by sample index
    indices = sorted(output_targets.keys())
    features = [output_feats[i] for i in indices]
    targets = [output_targets[i] for i in indices]

    # Cast the entries as numpy arrays
    N = len(indices)
    output = {
        "features": np.array(features).reshape(N, -1),
        "targets": np.array(targets),
        "inds": np.array(indices),
    }
    logging.info(f"Features: {output['features'].shape}")
    logging.info(f"Targets: {output['targets'].shape}")
    logging.info(f"Indices: {output['inds'].shape}")
    return output
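# Hedged usage sketch for merge_features above: merge the per-rank shards for one split and
# layer and save the merged arrays. The extraction directory, layer name, and output filenames
# below are illustrative assumptions.
import numpy as np

merged = merge_features("/tmp/feature_extraction", split="train", layer="res5")
np.save("/tmp/feature_extraction/train_res5_features_merged.npy", merged["features"])
np.save("/tmp/feature_extraction/train_res5_targets_merged.npy", merged["targets"])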
def test_PathManager(self) -> None:
    x = LazyPath(lambda: "./")
    output = PathManager.ls(x)  # pyre-ignore
    output_gt = PathManager.ls("./")
    self.assertEqual(sorted(output), sorted(output_gt))