Esempio n. 1
0
def gsutil_get_storage_subdirs_containing_file_types(storage_bucket_path: str,
                                                     file_type: GcsfsDirectIngestFileType,
                                                     upper_bound_date: Optional[str],
                                                     lower_bound_date: Optional[str]) -> List[str]:
    """Collects the storage subdirectory paths under |storage_bucket_path| that hold files of type |file_type| and
    whose date falls within the provided bounds.

    The directories directly under the bucket path are listed. A directory whose name equals |file_type.value| is
    searched for date subdirectories, and each one whose date string passes the bounds check is returned. For the
    UNSPECIFIED file type, any other top-level directory that is itself named with a date string passing the bounds
    check is returned as well.
    """
    def _within_bounds(date_str: str) -> bool:
        # Single place for the bounds check shared by both branches below.
        return is_between_date_strs_inclusive(
            upper_bound_date=upper_bound_date,
            lower_bound_date=lower_bound_date,
            date_of_interest=date_str)

    matching_paths = []
    for top_level_path in gsutil_ls(f'gs://{storage_bucket_path}', directories_only=True):
        top_level_name = os.path.basename(os.path.normpath(top_level_path))

        if top_level_name == file_type.value:
            # This is the dedicated directory for the requested file type - look for date subdirs beneath it.
            for date_dir_path in _dfs_get_date_subdirs([top_level_path]):
                if _within_bounds(_date_str_from_date_subdir_path(date_dir_path)):
                    matching_paths.append(date_dir_path)
        elif (file_type == GcsfsDirectIngestFileType.UNSPECIFIED
              and is_date_str(top_level_name)
              and _within_bounds(top_level_name)):
            # TODO(3162): For now we assume that all files not in raw/ or ingest_view/ storage subdirs are 'raw'
            #  files. Once all files have been migrated to raw/ and ingest_view/ subdirs, delete this part.
            matching_paths.append(top_level_path)

    return matching_paths
Esempio n. 2
0
def _dfs_get_date_subdirs(paths_to_search: List[str], depth: int = 0) -> List[str]:
    """Traverses down through year/month/day subdirectories to contain list of all date subdirectories that contain
    files for a given day."""
    if depth == 3:
        return [p for p in paths_to_search if is_date_str(_date_str_from_date_subdir_path(p))]

    date_subdirs = []
    for p in paths_to_search:
        sub_paths = gsutil_ls(p, directories_only=True)
        date_subdirs.extend(_dfs_get_date_subdirs(sub_paths, depth=depth + 1))

    return date_subdirs
    def get_date_subdir_paths(self) -> List[str]:
        """Lists the storage bucket and returns the paths whose final component is a date string lying within
        [self.start_date_bound, self.end_date_bound], as judged by is_between_date_strs_inclusive."""
        date_paths = []
        for candidate in gsutil_ls(f'gs://{self.storage_bucket}'):
            # Strip any trailing slash before taking the last path component.
            dir_name = os.path.basename(os.path.normpath(candidate))

            if is_date_str(dir_name) and is_between_date_strs_inclusive(
                    upper_bound_date=self.end_date_bound,
                    lower_bound_date=self.start_date_bound,
                    date_of_interest=dir_name):
                date_paths.append(candidate)

        return date_paths
    def _get_subdirs_to_copy(self) -> List[str]:
        """Lists the prod storage bucket and returns the names (not full paths) of the date-named subdirectories
        lying within [self.start_date_bound, self.end_date_bound].

        Listed paths that do not end in '/' are logged and skipped.
        """
        names_to_copy = []
        for listed_path in gsutil_ls(f'gs://{self.prod_storage_bucket}'):
            if listed_path.endswith('/'):
                dir_name = os.path.basename(os.path.normpath(listed_path))

                if is_date_str(dir_name) and is_between_date_strs_inclusive(
                        upper_bound_date=self.end_date_bound,
                        lower_bound_date=self.start_date_bound,
                        date_of_interest=dir_name):
                    # Only the bare directory name is collected, not the full gs:// path.
                    names_to_copy.append(dir_name)
            else:
                logging.info("Path [%s] is in unexpected format, skipping",
                             listed_path)

        return names_to_copy