Example #1
    def from_directory(
        cls,
        path: str,
        fps: float = 30.0,
        multithreaded_io: bool = False,
        path_order_cache: Optional[Dict[str, List[str]]] = None,
    ):
        """
        Args:
            path (str): path to frame video directory.
            fps (float): the target fps for the video. This is needed to link the frames
                to a second timestamp in the video.
            multithreaded_io (bool): controls whether parallelizable I/O operations are
                performed across multiple threads.
            path_order_cache (dict): An optional mapping from directory-path to list
                of frames in the directory in numerical order. Used for speedup by
                caching the frame paths.
        """
        if path_order_cache is not None and path in path_order_cache:
            return cls.from_frame_paths(path_order_cache[path], fps,
                                        multithreaded_io)

        assert g_pathmgr.isdir(path), f"{path} is not a directory"
        rel_frame_paths = g_pathmgr.ls(path)

        def natural_keys(text):
            return [
                int(c) if c.isdigit() else c for c in re.split(r"(\d+)", text)
            ]

        rel_frame_paths.sort(key=natural_keys)
        frame_paths = [os.path.join(path, f) for f in rel_frame_paths]
        if path_order_cache is not None:
            path_order_cache[path] = frame_paths
        return cls.from_frame_paths(frame_paths, fps, multithreaded_io)
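
The `natural_keys` helper above is what keeps frame files in numerical rather than lexicographic order. A minimal, standalone sketch (with made-up frame names, not taken from the library) illustrating the difference:

import re

def natural_keys(text):
    # Split on digit runs and compare numbers as integers, text as strings.
    return [int(c) if c.isdigit() else c for c in re.split(r"(\d+)", text)]

frames = ["frame_10.jpg", "frame_2.jpg", "frame_1.jpg"]
print(sorted(frames))                    # ['frame_1.jpg', 'frame_10.jpg', 'frame_2.jpg']
print(sorted(frames, key=natural_keys))  # ['frame_1.jpg', 'frame_2.jpg', 'frame_10.jpg']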
Example #2
def get_last_checkpoint():
    """Retrieves the most recent checkpoint (highest epoch number)."""
    checkpoint_dir = get_checkpoint_dir()
    checkpoints = [
        f for f in g_pathmgr.ls(checkpoint_dir) if _NAME_PREFIX in f
    ]
    last_checkpoint_name = sorted(checkpoints)[-1]
    return os.path.join(checkpoint_dir, last_checkpoint_name)
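
This relies on lexicographic sorting of file names to find the highest epoch, which only works if the epoch numbers are zero-padded to a fixed width. A short sketch with hypothetical checkpoint names (the exact naming scheme behind _NAME_PREFIX is assumed here):

names = ["model_epoch_0003.pyth", "model_epoch_0010.pyth", "model_epoch_0007.pyth"]
print(sorted(names)[-1])  # model_epoch_0010.pyth
# Without zero padding the same approach would fail:
print(sorted(["epoch_3", "epoch_10", "epoch_7"])[-1])  # epoch_7, not epoch_10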
Example #3
def has_checkpoint(path_to_job):
    """
    Determines if the given directory contains a checkpoint.
    Args:
        path_to_job (string): the path to the folder of the current job.
    """
    d = get_checkpoint_dir(path_to_job)
    files = g_pathmgr.ls(d) if g_pathmgr.exists(d) else []
    return any("checkpoint" in f for f in files)
Example #4
def _construct_imdb(self):
    """Constructs the imdb."""
    # Compile the split data path
    split_path = os.path.join(self.data_path, self.mode)
    logger.info("{} data path: {}".format(self.mode, split_path))
    # Images are stored per class in subdirs (format: n<number>)
    split_files = g_pathmgr.ls(split_path)
    self._class_ids = sorted(f for f in split_files
                             if re.match(r"^n[0-9]+$", f))
    # Map ImageNet class ids to contiguous ids
    self._class_id_cont_id = {v: i for i, v in enumerate(self._class_ids)}
    # Construct the image db
    self._imdb = []
    for class_id in self._class_ids:
        cont_id = self._class_id_cont_id[class_id]
        im_dir = os.path.join(split_path, class_id)
        for im_name in g_pathmgr.ls(im_dir):
            im_path = os.path.join(im_dir, im_name)
            self._imdb.append({"im_path": im_path, "class": cont_id})
    logger.info("Number of images: {}".format(len(self._imdb)))
    logger.info("Number of classes: {}".format(len(self._class_ids)))
Example #5
    def _get_filenames(self, data_path: str):
        fnames = []

        for fname in sorted(g_pathmgr.ls(data_path)):
            # Only put images in fnames.
            if not fname.endswith(".jpg"):
                continue

            full_fname = os.path.join(data_path, fname)
            fnames.append(full_fname)

        return np.array(fnames)
Example #6
def get_checkpoint_resume_files(
    checkpoint_folder: str,
    config: AttrDict,
    skip_final: bool = False,
    latest_checkpoint_resume_num: int = 1,
):
    """
    Get the checkpoint file from which the model should be resumed. We look at all
    the checkpoints in the checkpoint_folder; if the final model checkpoint exists
    (its name starts with `model_final_`) and we are not skipping it, we return the
    final checkpoint. Otherwise, we find the latest checkpoint.

    Args:
        checkpoint_folder (str): path to the checkpoint folder.
        config (AttrDict): root config
        skip_final (bool): whether the final model checkpoint should be skipped or not
        latest_checkpoint_resume_num (int): which Nth-latest checkpoint to resume from.
                   Sometimes the latest checkpoints can be corrupt, so this option
                   makes it possible to resume from a few checkpoints before the last one.
    """
    all_files = g_pathmgr.ls(checkpoint_folder)
    all_iters = []
    replace_prefix = "model_phase"
    # if we also checkpoint at iterations, we start from an iteration checkpoint,
    # since that is later than the phase-end checkpoint. Sometimes it is also
    # possible that there is no phase.
    if config.CHECKPOINT.CHECKPOINT_ITER_FREQUENCY > 0:
        replace_prefix = "model_iteration"

    for f in all_files:
        # if training has finished, we pick the final checkpoint file; the
        # checkpoint is saved as "model_final_checkpoint". Otherwise, we pick
        # the latest phase checkpoint
        if "model_final" in f and not skip_final:
            return f
        if replace_prefix in f:
            iter_num = f.replace(".torch", "").replace(replace_prefix, "")
            if iter_num.isdigit():
                all_iters.append(int(iter_num))

    # make sure the checkpoint resume number is in bounds
    checkpoint_resume_num = max(0, latest_checkpoint_resume_num - 1)
    # len(all_iters) - 1 is the last index, checkpoint_resume_num can't be beyond that.
    checkpoint_resume_num = min(len(all_iters) - 1, checkpoint_resume_num)
    logging.info(f"checkpoint_resume_num: {checkpoint_resume_num}")
    if len(all_iters) > 0:
        all_iters.sort(reverse=True)
        last_iter = int(all_iters[checkpoint_resume_num])
        filename = f"{replace_prefix}{last_iter}.torch"
        return filename
    else:
        return None
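
The index clamping near the end is what makes latest_checkpoint_resume_num safe to set larger than the number of available checkpoints. A standalone sketch of that logic with hypothetical iteration numbers:

def pick_iteration(all_iters, latest_checkpoint_resume_num=1):
    # 1 means the latest checkpoint, 2 the second latest, etc., clamped to bounds.
    resume_idx = max(0, latest_checkpoint_resume_num - 1)
    resume_idx = min(len(all_iters) - 1, resume_idx)
    return sorted(all_iters, reverse=True)[resume_idx]

print(pick_iteration([100, 300, 200], 1))   # 300 (latest)
print(pick_iteration([100, 300, 200], 2))   # 200 (second latest)
print(pick_iteration([100, 300, 200], 10))  # 100 (clamped to the oldest)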
Example #7
def delete_checkpoints(checkpoint_dir=None, keep="all"):
    """Deletes unneeded checkpoints, keep can be "all", "last", or "none"."""
    assert keep in ["all", "last", "none"], "Invalid keep setting: {}".format(keep)
    checkpoint_dir = checkpoint_dir if checkpoint_dir else get_checkpoint_dir()
    if keep == "all" or not g_pathmgr.exists(checkpoint_dir):
        return 0
    checkpoints = [f for f in g_pathmgr.ls(checkpoint_dir) if _NAME_PREFIX in f]
    checkpoints = sorted(checkpoints)[:-1] if keep == "last" else checkpoints
    for checkpoint in checkpoints:
        g_pathmgr.rm(os.path.join(checkpoint_dir, checkpoint))
    return len(checkpoints)
Example #8
def get_last_checkpoint(path_to_job):
    """
    Get the last checkpoint from the checkpointing folder.
    Args:
        path_to_job (string): the path to the folder of the current job.
    """

    d = get_checkpoint_dir(path_to_job)
    names = g_pathmgr.ls(d) if g_pathmgr.exists(d) else []
    names = [f for f in names if "checkpoint" in f]
    assert len(names), "No checkpoints found in '{}'.".format(d)
    # Sort the checkpoints by epoch.
    name = sorted(names)[-1]
    return os.path.join(d, name)
Example #9
def build_encoded_manifest_from_nested_directory(
    data_directory_path: str,
) -> Dict[str, EncodedVideoInfo]:
    """
    Creates a dictionary from video_id to EncodedVideoInfo for
    encoded videos in the given directory.

    Args:
        data_directory_path (str): The folder to list to find encoded
            video files.

    Returns:
        Dict[str, EncodedVideoInfo] mapping video_id to EncodedVideoInfo
        for each file in 'data_directory_path'
    """
    encoded_video_infos = {}
    for participant_id in g_pathmgr.ls(data_directory_path):
        participant_folder_path = f"{data_directory_path}/{participant_id}"
        for video_file_name in g_pathmgr.ls(participant_folder_path):
            video_id = video_file_name[:6]
            video_full_path = f"{participant_folder_path}/{video_file_name}"
            encoded_video_infos[video_id] = EncodedVideoInfo(
                video_id, video_full_path)
    return encoded_video_infos
Example #10
def get_filelist_labels_images_paths(input_path):
    dataset_split_summary = {}
    img_paths, img_labels = [], []
    label_paths = g_pathmgr.ls(input_path)
    dataset_split_summary["labels"] = label_paths
    dataset_split_summary["num_labels"] = len(label_paths)
    print(f"{len(label_paths)} classes found.")

    total_split_examples = 0
    # Populate the img_paths and img_labels based on torchvision image folder file structure.
    for label in label_paths:
        label_path = os.path.join(input_path, label)
        images = g_pathmgr.ls(label_path)
        print(f"{len(images)} examples found for {label}.")
        total_split_examples += len(images)
        for image in images:
            img_path = os.path.join(label_path, image)
            img_paths.append(img_path)
            img_labels.append(label)

    # print the dataset summary
    dataset_split_summary["num_examples"] = total_split_examples
    print(f"{total_split_examples} found")
    return dataset_split_summary, img_paths, img_labels
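
The function assumes the torchvision ImageFolder layout: one subdirectory per label, each holding that label's images. A self-contained sketch of the same traversal using a temporary directory and os.listdir in place of g_pathmgr (the label and file names below are made up):

import os
import tempfile

root = tempfile.mkdtemp()
for label in ["cat", "dog"]:
    os.makedirs(os.path.join(root, label))
    for i in range(2):
        open(os.path.join(root, label, f"img_{i}.jpg"), "w").close()

img_paths, img_labels = [], []
for label in sorted(os.listdir(root)):
    label_path = os.path.join(root, label)
    for image in sorted(os.listdir(label_path)):
        img_paths.append(os.path.join(label_path, image))
        img_labels.append(label)

print(len(img_paths), img_labels)  # 4 ['cat', 'cat', 'dog', 'dog']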
Example #11
def add_participant_video_frames(participant_id: str,
                                 participant_path: str) -> None:
    participant_frames = sorted(g_pathmgr.ls(str(participant_path)))
    for frame_file_name in participant_frames:
        file_extension = frame_file_name.split(".")[-1]
        frame_name = frame_file_name[:-(len(file_extension) + 1)]
        path_participant_id, path_video_id, path_frame_id = frame_name.split("_")
        assert path_participant_id == participant_id
        video_id = f"{path_participant_id}_{path_video_id}"
        if video_id not in video_frames:
            # This is the first frame we have seen from the video with video_id
            video_frames[video_id] = VideoFrameInfo(
                video_id=video_id,
                location=participant_path,
                frame_file_stem=f"{video_id}_",
                frame_string_length=len(frame_name),
                min_frame_number=int(path_frame_id),
                max_frame_number=int(path_frame_id),
                file_extension=file_extension,
            )
        else:
            video_frame_info = video_frames[video_id]
            # Check that this new frame has the same format as the other frames for
            # this video and that it is the next frame in order; if so, update the
            # frame info for this video to reflect that there is an additional frame.
            # We don't need to check video_id or frame_file_stem as they are a
            # function of video_id, which is aligned within the dictionary.
            assert video_frame_info.frame_string_length == len(frame_name)
            assert video_frame_info.location == participant_path, (
                f"Frames for {video_id} found in two paths: "
                f"{video_frame_info.location} and {participant_path}")
            assert video_frame_info.max_frame_number + 1 == int(path_frame_id)
            assert (
                video_frame_info.file_extension == file_extension
            ), f"Frames with two different file extensions found for video {video_id}"
            video_frames[video_id] = VideoFrameInfo(
                video_id=video_frame_info.video_id,
                location=video_frame_info.location,
                frame_file_stem=video_frame_info.frame_file_stem,
                frame_string_length=video_frame_info.frame_string_length,
                min_frame_number=video_frame_info.min_frame_number,
                max_frame_number=int(path_frame_id),  # Update
                file_extension=video_frame_info.file_extension,
            )
Example #12
def has_final_checkpoint(checkpoint_folder: str,
                         final_checkpoint_pattern: str = "model_final"):
    """
    Check whether the final checkpoint exists in the checkpoint folder. The
    final checkpoint is recognized by the prefix "model_final_" in VISSL.

    Args:
        checkpoint_folder (str): path to the checkpoint folder.
        final_checkpoint_pattern (str): what prefix is used to save the final checkpoint.

    Returns:
        has_final_checkpoint: whether the final checkpoint exists or not
    """
    checkpointed_files = g_pathmgr.ls(checkpoint_folder)
    torch_files = filter(lambda x: x.endswith(".torch"), checkpointed_files)
    final_files = filter(lambda x: final_checkpoint_pattern in x, torch_files)
    return len(list(final_files)) > 0
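
The two filter passes narrow the directory listing down to final-model files. A small sketch of the same chain over a hypothetical listing (the file names below are illustrative and are not guaranteed to match VISSL's exact naming):

files = ["model_phase10.torch", "model_final_checkpoint_phase20.torch", "stdout.json"]
torch_files = filter(lambda x: x.endswith(".torch"), files)
final_files = filter(lambda x: "model_final" in x, torch_files)
print(list(final_files))  # ['model_final_checkpoint_phase20.torch']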
Example #13
def has_checkpoint(checkpoint_folder: str, skip_final: bool = False):
    """
    Check whether there are any checkpoints at all in the checkpoint folder.

    Args:
        checkpoint_folder (str): path to the checkpoint folder
        skip_final (bool): if a checkpoint with the `model_final_` prefix exists,
                           whether to skip it and continue training.

    Returns:
        checkpoint_exists (bool): whether checkpoint exists or not
    """
    checkpointed_files = g_pathmgr.ls(checkpoint_folder)
    checkpoint_exists = False
    for f in checkpointed_files:
        if f.endswith(".torch") and ("model_final" not in f or not skip_final):
            checkpoint_exists = True
            break
    return checkpoint_exists
Example #14
    def get_shard_file_names(
        input_dir: str,
        split: str,
        layer: str,
        sorted: bool = True,
    ) -> List[ExtractedFeaturesShardPaths]:
        """
        Get the list of files needed to load the extracted features
        """

        # List all the files that are containing the features for a given
        # dataset split and a given layer
        feature_regex = re.compile(rf"(.*)_{split}_{layer}_features.npy")
        prefixes = []
        for file_path in g_pathmgr.ls(input_dir):
            match = feature_regex.match(file_path)
            if match is not None:
                prefixes.append(match.group(1))

        # Sort the shards by file name if required: it might be useful
        # if the algorithm that uses the shards is influenced by ordering
        if sorted:
            prefixes.sort()

        # Yield all the files needed to merge the features dumped on
        # the different GPUs
        shard_paths = []
        for prefix in prefixes:
            feat_file = os.path.join(input_dir,
                                     f"{prefix}_{split}_{layer}_features.npy")
            targets_file = os.path.join(
                input_dir, f"{prefix}_{split}_{layer}_targets.npy")
            indices_file = os.path.join(input_dir,
                                        f"{prefix}_{split}_{layer}_inds.npy")
            shard_paths.append(
                ExtractedFeaturesShardPaths(
                    feature_file=feat_file,
                    targets_file=targets_file,
                    indices_file=indices_file,
                ))
        return shard_paths
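
The regex is what groups per-rank feature shards back together: it captures the prefix in front of "_{split}_{layer}_features.npy". A standalone sketch with hypothetical shard names:

import re

split, layer = "train", "res5"
feature_regex = re.compile(rf"(.*)_{split}_{layer}_features.npy")
files = [
    "rank0_train_res5_features.npy",
    "rank1_train_res5_features.npy",
    "rank0_train_res5_targets.npy",  # not a features file, so it is ignored
]
prefixes = [m.group(1) for m in map(feature_regex.match, files) if m is not None]
print(prefixes)  # ['rank0', 'rank1']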
Example #15
    def load(self, num_samples=None):
        """
        Load the data ground truth and parse the data so it's ready to be used.
        """
        # Load the dataset GT
        self.lab_root = f"{self.path}/lab/"
        self.img_root = f"{self.path}/jpg/"
        logging.info(f"Loading data: {self.path}")
        lab_filenames = np.sort(g_pathmgr.ls(self.lab_root))
        # Get the filenames without the extension
        self.img_filenames = [
            e[:-4] for e in np.sort(g_pathmgr.ls(self.img_root))
            if e[:-4] not in self.blacklisted
        ]

        # Parse the label files. There are some challenges, as label filenames do
        # not correspond exactly to query names. Go through all the labels to:
        # i) map names to filenames and vice versa
        # ii) get the relevant regions of interest of the queries,
        # iii) get the indexes of the dataset images that are queries
        # iv) get the relevants / non-relevants list
        self.relevants = {}
        self.junk = {}
        self.non_relevants = {}

        self.filename_to_name = {}
        self.name_to_filename = OrderedDict()
        self.q_roi = {}
        for e in lab_filenames:
            if e.endswith("_query.txt"):
                q_name = e[:-len("_query.txt")]
                with g_pathmgr.open(f"{self.lab_root}/{e}") as fopen:
                    q_data = fopen.readline().split(" ")
                if q_data[0].startswith("oxc1_"):
                    q_filename = q_data[0][5:]
                else:
                    q_filename = q_data[0]
                self.filename_to_name[q_filename] = q_name
                self.name_to_filename[q_name] = q_filename
                with g_pathmgr.open(
                        f"{self.lab_root}/{q_name}_ok.txt") as fopen:
                    good = {e.strip() for e in fopen}
                with g_pathmgr.open(
                        f"{self.lab_root}/{q_name}_good.txt") as fopen:
                    good = good.union({e.strip() for e in fopen})
                with g_pathmgr.open(
                        f"{self.lab_root}/{q_name}_junk.txt") as fopen:
                    junk = {e.strip() for e in fopen}
                good_plus_junk = good.union(junk)
                self.relevants[q_name] = [
                    i for i in range(len(self.img_filenames))
                    if self.img_filenames[i] in good
                ]
                self.junk[q_name] = [
                    i for i in range(len(self.img_filenames))
                    if self.img_filenames[i] in junk
                ]
                self.non_relevants[q_name] = [
                    i for i in range(len(self.img_filenames))
                    if self.img_filenames[i] not in good_plus_junk
                ]
                self.q_roi[q_name] = np.array([float(q) for q in q_data[1:]],
                                              dtype=np.float32)

        self.q_names = list(self.name_to_filename.keys())
        self.q_index = np.array([
            self.img_filenames.index(self.name_to_filename[qn])
            for qn in self.q_names
        ])

        self.N_images = len(self.img_filenames)
        self.N_queries = len(self.q_index)

        if num_samples is not None:
            self.N_queries = min(self.N_queries, num_samples)
            self.N_images = min(self.N_images, num_samples)
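
The query files follow the Oxford/Paris ground-truth convention: the first token names the query image (optionally prefixed with "oxc1_"), followed by four floats giving the region of interest. A small sketch of that parsing step with a made-up query line:

import numpy as np

q_data = "oxc1_all_souls_000013 136.5 34.1 648.5 955.7".split(" ")
q_filename = q_data[0][5:] if q_data[0].startswith("oxc1_") else q_data[0]
q_roi = np.array([float(q) for q in q_data[1:]], dtype=np.float32)
print(q_filename)  # all_souls_000013
print(q_roi)       # [136.5  34.1 648.5 955.7]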
Example #16
def has_checkpoint():
    """Determines if there are checkpoints available."""
    checkpoint_dir = get_checkpoint_dir()
    if not g_pathmgr.exists(checkpoint_dir):
        return False
    return any(_NAME_PREFIX in f for f in g_pathmgr.ls(checkpoint_dir))
Example #17
def build_frame_manifest_from_flat_directory(
        data_directory_path: str,
        multithreaded: bool) -> Dict[str, VideoFrameInfo]:
    """
    Args:
        data_directory_path (str): Path or URI to EpicKitchenDataset data.
                Data at this path must be a folder of structure:
                    {
                        "{video_id}": [
                            "frame_{frame_number}.{file_extension}",
                            "frame_{frame_number}.{file_extension}",
                            "frame_{frame_number}.{file_extension}",
                        ...]
                    ...}
        multithreaded (bool):
            controls whether io operations are performed across multiple threads.

    Returns:
        Dictionary mapping video_id of available videos to the locations of their
        underlying frame files.
    """

    video_frames = {}
    video_ids = g_pathmgr.ls(str(data_directory_path))

    def add_video_frames(video_id: str, video_path: str) -> None:
        video_frame_file_names = sorted(g_pathmgr.ls(video_path))
        for frame in video_frame_file_names:
            file_extension = frame.split(".")[-1]
            frame_name = frame[:-(len(file_extension) + 1)]
            stem, path_frame_id = frame_name.split("_")
            if video_id not in video_frames:
                video_frames[video_id] = VideoFrameInfo(
                    video_id=video_id,
                    location=video_path,
                    frame_file_stem=f"{stem}_",
                    frame_string_length=len(frame_name),
                    min_frame_number=int(path_frame_id),
                    max_frame_number=int(path_frame_id),
                    file_extension=file_extension,
                )
            else:
                video_frame_info = video_frames[video_id]
                # Check that this new frame has the same format as the other frames
                # for this video and that it is the next frame in order; if so, update
                # the frame info for this video to reflect that there is an additional
                # frame. We don't need to check video_id or frame_file_stem as they are
                # a function of video_id, which is aligned within the dictionary.
                assert video_frame_info.frame_string_length == len(frame_name)
                assert video_frame_info.location == video_path, (
                    f"Frames for {video_id} found in two paths: "
                    f"{video_frame_info.location} and {video_path}")
                assert video_frame_info.max_frame_number + 1 == int(
                    path_frame_id)
                assert (
                    video_frame_info.file_extension == file_extension
                ), f"Frames with two different file extensions found for video {video_id}"
                video_frames[video_id] = VideoFrameInfo(
                    video_id=video_frame_info.video_id,
                    location=video_frame_info.location,
                    frame_file_stem=video_frame_info.frame_file_stem,
                    frame_string_length=video_frame_info.frame_string_length,
                    min_frame_number=video_frame_info.min_frame_number,
                    max_frame_number=int(path_frame_id),  # Update
                    file_extension=video_frame_info.file_extension,
                )

    video_paths = [(video_id, f"{data_directory_path}/{video_id}")
                   for video_id in video_ids]
    # Kick off frame indexing for all participants
    optional_threaded_foreach(add_video_frames, video_paths, multithreaded)

    return video_frames
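
Each frame file name is split into a stem, a frame number, and an extension; the manifest records the stem, the total length of the frame name, and the min/max frame numbers. A tiny sketch of that split with a hypothetical file name:

frame = "frame_0000000042.jpg"
file_extension = frame.split(".")[-1]
frame_name = frame[:-(len(file_extension) + 1)]
stem, path_frame_id = frame_name.split("_")
print(stem, int(path_frame_id), file_extension)  # frame 42 jpg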
Example #18
def ls(path: str) -> List[str]:
    if IOPathManager:
        return IOPathManager.ls(path)
    return os.listdir(path)
Example #19

if __name__ == "__main__":
    """
    Example usage:

    python extra_scripts/convert_folder_to_filelist.par \
        -i "manifold://ssl_framework/tree/datasets/food_101/" \
        -o "manifold://ssl_framework/tree/datasets/food_101/"
    """
    args = get_argument_parser().parse_args()

    setup_path_manager()

    ground_truth_splits = ["train", "trainval", "val", "test"]
    available_splits = g_pathmgr.ls(args.input)

    dataset_summary = {}

    if not any(split in available_splits for split in ground_truth_splits):
        # the dataset doesn't have any splits. So we just read it as is
        print("Dataset has no splits...")
        dataset_summary, img_paths, img_labels = get_filelist_labels_images_paths(
            args.input)
        out_image_filepath = os.path.join(args.output, "images.npy")
        out_label_filepath = os.path.join(args.output, "labels.npy")
        save_img_labels_filelist(img_paths, img_labels, out_image_filepath,
                                 out_label_filepath)
    else:
        for split in ["train", "trainval", "val", "test"]:
            if not g_pathmgr.exists(os.path.join(args.input, split)):