def _get_files_to_move(self) -> List[str]:
    """Gets the files to move to deprecated, based on the file_filter and the start/end date bounds.

    Walks every date subdirectory found under the region storage directory, keeps the ones whose
    YYYY/MM/DD date lies between start_date_bound and end_date_bound (inclusive), and collects the
    CSV files in them whose names match INGESTED_FILE_REGEX and, when file_filter is set, also
    contain a match for file_filter.

    Returns:
        The paths (as returned by gsutil_ls) of all files that should be moved.

    Raises:
        ValueError: If a subdirectory path does not contain a date matched by DATE_SUBDIR_REGEX.
    """
    subdirs = dfs_get_date_subdirs([self.region_storage_dir_path.uri()])
    result = []
    for yr_mth_day_subdir_path in subdirs:
        dir_path_blob = GcsfsFilePath.from_absolute_path(
            yr_mth_day_subdir_path).blob_name
        search_date = DATE_SUBDIR_REGEX.search(dir_path_blob)
        if search_date is None:
            raise ValueError(
                "No match found. File paths should have the format YYYY/MM/DD. "
                f"Instead we found {dir_path_blob}.")

        # The date bounds are compared as ISO strings, so convert the
        # YYYY/MM/DD path fragment to YYYY-MM-DD first.
        date_of_interest = datetime.datetime.strptime(
            search_date.group(), '%Y/%m/%d').date().isoformat()
        if not is_between_date_strs_inclusive(
                upper_bound_date=self.end_date_bound,
                lower_bound_date=self.start_date_bound,
                date_of_interest=date_of_interest):
            continue

        for from_path in gsutil_ls(f'{yr_mth_day_subdir_path}*.csv'):
            _, file_name = os.path.split(from_path)
            if not re.match(INGESTED_FILE_REGEX, file_name):
                continue
            # An unset/empty file_filter means "take every ingested file".
            if not self.file_filter or re.search(self.file_filter, file_name):
                result.append(from_path)
    return result
Beispiel #2
0
    def get_files_to_move_from_path(self, gs_dir_path: str) -> List[str]:
        """Lists the files under |gs_dir_path| that should be moved back into the ingest directory.

        A file qualifies when its base name matches FILE_TO_MOVE_RE and, if a file_filter is set,
        the filter is also found somewhere in the name. The collect progress indicator (if any) is
        advanced once per call, while holding the mutex.
        """

        def _should_move(candidate_path: str) -> bool:
            base_name = os.path.split(candidate_path)[1]
            if not re.match(self.FILE_TO_MOVE_RE, base_name):
                return False
            if not self.file_filter:
                return True
            return bool(re.search(self.file_filter, base_name))

        listed_paths = gsutil_ls(gs_dir_path)
        paths_to_move = [
            candidate for candidate in listed_paths if _should_move(candidate)
        ]

        with self.mutex:
            if self.collect_progress:
                self.collect_progress.next()

        return paths_to_move
Beispiel #3
0
 def _get_files_to_move(self) -> List[str]:
     """Collects the ingested CSV files to move to deprecated.

     Subdirectories are looked up for the configured file type and start/end date bounds; within
     each one, a CSV file is kept when its name matches INGESTED_FILE_REGEX and, if file_filter is
     set, also contains a match for file_filter.
     """

     def _is_selected(path: str) -> bool:
         name = os.path.split(path)[1]
         if not re.match(INGESTED_FILE_REGEX, name):
             return False
         if not self.file_filter:
             return True
         return bool(re.search(self.file_filter, name))

     dated_subdirs = gsutil_get_storage_subdirs_containing_file_types(
         storage_bucket_path=self.region_storage_dir_path.abs_path(),
         file_type=self.file_type,
         lower_bound_date=self.start_date_bound,
         upper_bound_date=self.end_date_bound)

     files_to_move = []
     for dated_subdir in dated_subdirs:
         csv_paths = gsutil_ls(f'{dated_subdir}*.csv')
         files_to_move.extend(
             csv_path for csv_path in csv_paths if _is_selected(csv_path))
     return files_to_move
    def get_date_subdir_paths(self) -> List[str]:
        """Returns the paths listed under the storage bucket whose last path component is a date
        string falling between start_date_bound and end_date_bound, inclusive."""
        matching_paths = []
        for candidate in gsutil_ls(f'gs://{self.storage_bucket}'):
            # normpath drops any trailing slash so basename yields the final component.
            final_component = os.path.basename(os.path.normpath(candidate))

            if is_date_str(final_component) and is_between_date_strs_inclusive(
                    upper_bound_date=self.end_date_bound,
                    lower_bound_date=self.start_date_bound,
                    date_of_interest=final_component):
                matching_paths.append(candidate)

        return matching_paths
    def _move_files_for_date(self, subdir_path_str: str) -> None:
        """Moves every CSV file in one date subdirectory to its new raw-data storage path.

        The last component of |subdir_path_str| is parsed as an ISO date (YYYY-MM-DD) and rewritten
        as a nested YYYY/MM/DD/ path under <bucket>/<region_code>/<raw data file type>/. Each
        (from_path, to_path) pair is recorded in move_list; the actual move is skipped when dry_run
        is set. The move progress indicator (if any) is advanced once per call.

        Args:
            subdir_path_str: Path of the date subdirectory whose CSV files should be moved.

        Raises:
            ValueError: If the last path component is not a valid ISO-format date.
        """
        from_dir_path = GcsfsDirectoryPath.from_absolute_path(
            subdir_path_str.rstrip("/"))

        previous_date_format = from_dir_path.relative_path.rstrip("/").split(
            "/")[-1]
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        for from_path in gsutil_ls(f"{subdir_path_str}*.csv"):
            # The path object is only built to extract the file name from the listed path.
            file_name = GcsfsFilePath(
                bucket_name=self.region_storage_dir_path.bucket_name,
                blob_name=from_path,
            ).file_name

            to_file_path = os.path.join(
                "gs://",
                self.region_storage_dir_path.bucket_name,
                self.region_code,
                GcsfsDirectIngestFileType.RAW_DATA.value,
                new_date_format,
                file_name,
            )

            to_path = to_normalized_processed_file_path_from_normalized_path(
                to_file_path,
                file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

            if not self.dry_run:
                gsutil_mv(from_path=from_path, to_path=to_path)
            with self.mutex:
                self.move_list.append((from_path, to_path))

        # Shared state is guarded by the mutex elsewhere in this method, so update the progress
        # indicator under the same lock rather than racing with other callers.
        with self.mutex:
            if self.move_progress:
                self.move_progress.next()
    def _get_subdirs_to_copy(self) -> List[str]:
        """Returns the names (not full paths) of the date subdirectories listed under the prod
        storage bucket whose date lies between start_date_bound and end_date_bound, inclusive.

        Listed paths that do not end in '/' are logged and skipped.
        """
        dated_subdir_names = []
        for listed_path in gsutil_ls(f'gs://{self.prod_storage_bucket}'):
            if not listed_path.endswith('/'):
                logging.info("Path [%s] is in unexpected format, skipping",
                             listed_path)
                continue

            # normpath removes the trailing slash so basename yields the directory name.
            date_name = os.path.basename(os.path.normpath(listed_path))
            if is_date_str(date_name) and is_between_date_strs_inclusive(
                    upper_bound_date=self.end_date_bound,
                    lower_bound_date=self.start_date_bound,
                    date_of_interest=date_name):
                dated_subdir_names.append(date_name)

        return dated_subdir_names
 def _get_files_to_move(self) -> List[str]:
     """Lists the paths matching gs://<region ingest bucket name>/*.csv."""
     ingest_bucket_name = self.region_ingest_bucket_dir_path.bucket_name
     return gsutil_ls(f'gs://{ingest_bucket_name}/*.csv')