def _get_files_to_move(self) -> List[str]:
    """Function that gets the files to move to deprecated based on the
    file_filter and end/start dates specified.

    Walks the date subdirectories of the region storage dir, keeps only those
    whose YYYY/MM/DD component falls inclusively between start_date_bound and
    end_date_bound, and collects the .csv files within that match
    INGESTED_FILE_REGEX (and file_filter, when one is set).

    Returns:
        Absolute paths of the matching .csv files.

    Raises:
        ValueError: if a subdirectory path has no YYYY/MM/DD date component.
    """
    subdirs = dfs_get_date_subdirs([self.region_storage_dir_path.uri()])
    result = []
    for yr_mth_day_subdir_path in subdirs:
        dir_path_blob = GcsfsFilePath.from_absolute_path(
            yr_mth_day_subdir_path).blob_name
        search_date = DATE_SUBDIR_REGEX.search(dir_path_blob)
        if search_date is None:
            # Bug fix: the original message was missing the space before the
            # offending path ("...we found2020/01/01.").
            raise ValueError(
                "No match found. File paths should have the format YYYY/MM/DD. "
                f"Instead we found {dir_path_blob}.")
        match_date = search_date.group()
        # Convert the YYYY/MM/DD path fragment to an ISO date string for the
        # inclusive bounds comparison.
        date_of_interest = datetime.datetime.strptime(
            match_date, '%Y/%m/%d').date().isoformat()
        if is_between_date_strs_inclusive(
                upper_bound_date=self.end_date_bound,
                lower_bound_date=self.start_date_bound,
                date_of_interest=date_of_interest):
            from_paths = gsutil_ls(f'{yr_mth_day_subdir_path}*.csv')
            for from_path in from_paths:
                _, file_name = os.path.split(from_path)
                if re.match(INGESTED_FILE_REGEX, file_name) and (
                        not self.file_filter
                        or re.search(self.file_filter, file_name)):
                    result.append(from_path)
    return result
def get_files_to_move_from_path(self, gs_dir_path: str) -> List[str]:
    """Returns files directly in the given directory that should be moved
    back into the ingest directory.

    A file qualifies when its name matches FILE_TO_MOVE_RE and, if a
    file_filter is configured, also matches that filter. Ticks the
    collect-progress bar (under the mutex) once per directory scanned.
    """
    matching_paths = []
    for candidate_path in gsutil_ls(gs_dir_path):
        _, candidate_name = os.path.split(candidate_path)
        if not re.match(self.FILE_TO_MOVE_RE, candidate_name):
            continue
        if self.file_filter and not re.search(self.file_filter,
                                              candidate_name):
            continue
        matching_paths.append(candidate_path)
    with self.mutex:
        if self.collect_progress:
            self.collect_progress.next()
    return matching_paths
def _get_files_to_move(self) -> List[str]:
    """Function that gets the files to move to deprecated based on the
    file_filter and end/start dates specified.

    Queries storage for subdirectories containing the configured file type
    within the date bounds, then keeps the .csv files inside them that match
    INGESTED_FILE_REGEX (and file_filter, when one is set).
    """
    date_subdirs = gsutil_get_storage_subdirs_containing_file_types(
        storage_bucket_path=self.region_storage_dir_path.abs_path(),
        file_type=self.file_type,
        lower_bound_date=self.start_date_bound,
        upper_bound_date=self.end_date_bound)
    files_to_move = []
    for subdir in date_subdirs:
        for candidate_path in gsutil_ls(f'{subdir}*.csv'):
            _, candidate_name = os.path.split(candidate_path)
            if not re.match(INGESTED_FILE_REGEX, candidate_name):
                continue
            if self.file_filter and not re.search(self.file_filter,
                                                  candidate_name):
                continue
            files_to_move.append(candidate_path)
    return files_to_move
def get_date_subdir_paths(self) -> List[str]:
    """Returns paths in the storage bucket whose final component is a date
    string falling inclusively between start_date_bound and end_date_bound."""

    def _is_in_bounds(path: str) -> bool:
        # The final path component must itself be a date string before the
        # inclusive bounds check applies.
        date_part = os.path.basename(os.path.normpath(path))
        return is_date_str(date_part) and is_between_date_strs_inclusive(
            upper_bound_date=self.end_date_bound,
            lower_bound_date=self.start_date_bound,
            date_of_interest=date_part)

    return [
        path for path in gsutil_ls(f'gs://{self.storage_bucket}')
        if _is_in_bounds(path)
    ]
def _move_files_for_date(self, subdir_path_str: str) -> None:
    """Function that loops through each subdirectory and moves files in
    each subdirectory using the from path and to path specified.

    Moves (unless dry_run) every .csv file under the given date subdirectory
    to its normalized RAW_DATA storage location, recording each
    (from, to) pair in move_list and ticking the progress bar under the
    mutex.
    """
    source_dir = GcsfsDirectoryPath.from_absolute_path(
        subdir_path_str.rstrip("/"))
    iso_date_str = source_dir.relative_path.rstrip("/").split("/")[-1]
    # Re-express the ISO date (YYYY-MM-DD) as a YYYY/MM/DD/ path fragment.
    slashed_date = date.fromisoformat(iso_date_str).strftime("%Y/%m/%d/")
    for source_path in gsutil_ls(f"{subdir_path_str}*.csv"):
        base_name = GcsfsFilePath(
            bucket_name=self.region_storage_dir_path.bucket_name,
            blob_name=source_path,
        ).file_name
        destination = to_normalized_processed_file_path_from_normalized_path(
            os.path.join(
                "gs://",
                self.region_storage_dir_path.bucket_name,
                self.region_code,
                GcsfsDirectIngestFileType.RAW_DATA.value,
                slashed_date,
                base_name,
            ),
            file_type_override=GcsfsDirectIngestFileType.RAW_DATA)
        if not self.dry_run:
            gsutil_mv(from_path=source_path, to_path=destination)
        with self.mutex:
            self.move_list.append((source_path, destination))
            if self.move_progress:
                self.move_progress.next()
def _get_subdirs_to_copy(self) -> List[str]:
    """Returns the names of date subdirectories in the prod storage bucket
    whose dates fall inclusively between start_date_bound and
    end_date_bound. Non-directory paths are logged and skipped."""
    names_to_copy = []
    for path in gsutil_ls(f'gs://{self.prod_storage_bucket}'):
        # Anything without a trailing slash is not a subdirectory listing.
        if not path.endswith('/'):
            logging.info("Path [%s] is in unexpected format, skipping",
                         path)
            continue
        candidate = os.path.basename(os.path.normpath(path))
        if is_date_str(candidate) and is_between_date_strs_inclusive(
                upper_bound_date=self.end_date_bound,
                lower_bound_date=self.start_date_bound,
                date_of_interest=candidate):
            names_to_copy.append(candidate)
    return names_to_copy
def _get_files_to_move(self) -> List[str]:
    """Lists every .csv file at the top level of the region ingest bucket."""
    bucket_name = self.region_ingest_bucket_dir_path.bucket_name
    return gsutil_ls(f'gs://{bucket_name}/*.csv')