def _move_files(self, from_uri: str):
    """Move one raw file into a date-partitioned raw storage subdirectory.

    The date is read from the file name parts of ``from_uri`` and turned into
    a ``YYYY/MM/DD/`` subdirectory of ``self.region_storage_raw_dir_path``.
    The destination file name comes from the normalized "processed" helper
    when the source is a processed file and from the normalized "unprocessed"
    helper otherwise, both with the RAW_DATA file type.

    In dry-run mode nothing is moved, but the (source, destination) pair is
    still appended to ``self.move_list`` and the progress indicator, if any,
    is advanced.

    Args:
        from_uri: Absolute URI of the file to move.
    """
    source_path = GcsfsFilePath.from_absolute_path(from_uri)
    iso_date = filename_parts_from_path(source_path).date_str
    dated_subdir = date.fromisoformat(iso_date).strftime("%Y/%m/%d/")

    # Start from the unprocessed name; processed sources override it below.
    renamed_uri = to_normalized_unprocessed_file_path_from_normalized_path(
        from_uri, GcsfsDirectIngestFileType.RAW_DATA)
    renamed_path = GcsfsFilePath.from_absolute_path(renamed_uri)
    if DirectIngestGCSFileSystem.is_processed_file(source_path):
        renamed_uri = to_normalized_processed_file_path_from_normalized_path(
            from_uri, GcsfsDirectIngestFileType.RAW_DATA)
        renamed_path = GcsfsFilePath.from_absolute_path(renamed_uri)

    destination_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        self.region_storage_raw_dir_path, dated_subdir)
    destination_path = GcsfsFilePath.from_directory_and_file_name(
        destination_dir, renamed_path.file_name)
    to_uri = destination_path.uri()

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)

    # The move log and progress indicator are shared, so update them under
    # the mutex.
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()
def move_file(self, original_file_path: str):
    """Move a single file to its ingest-directory path and record the move.

    The destination is whatever ``self.build_moved_file_path`` returns for
    ``original_file_path`` (per the original documentation, the new name
    always carries a prefix of 'unprocessed'; that logic lives in the helper,
    not here). Every move is appended to ``self.moves_list`` so it can later
    be written to a log file.

    In dry-run mode the move is only recorded, never executed.

    Args:
        original_file_path: Current path of the file to move.
    """
    destination = self.build_moved_file_path(original_file_path)

    if not self.dry_run:
        gsutil_mv(original_file_path, destination)

    # The move log and progress indicator are shared, so update them under
    # the mutex.
    with self.mutex:
        self.moves_list.append((original_file_path, destination))
        if self.move_progress:
            self.move_progress.next()
def _move_files_for_date(self, from_uri: str):
    """Move one file into the dated 'deprecated' folder for its region.

    The destination has the form::

        gs://<bucket>/<region_code>/deprecated/deprecated_on_<today>/
            <file_type>/<YYYY>/<MM>/<DD>/<original file name>

    where ``<YYYY>/<MM>/<DD>`` is the date taken from the file name parts of
    ``from_uri`` and ``<today>`` is ``date.today()`` at the time of the call.

    In dry-run mode nothing is moved, but the (source, destination) pair is
    still appended to ``self.move_list`` and the progress indicator, if any,
    is advanced.

    Args:
        from_uri: Absolute URI of the file to deprecate.
    """
    source_path = GcsfsFilePath.from_absolute_path(from_uri)
    file_date = date.fromisoformat(
        filename_parts_from_path(source_path).date_str)
    dated_subdir = file_date.strftime("%Y/%m/%d/")

    destination_parts = (
        'gs://',
        self.region_storage_dir_path_for_file_type.bucket_name,
        self.region_code,
        'deprecated',
        f'deprecated_on_{date.today()}',
        str(self.file_type.value),
        dated_subdir,
        source_path.file_name,
    )
    to_uri = os.path.join(*destination_parts)

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)

    # The move log and progress indicator are shared, so update them under
    # the mutex.
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()
def _move_files_for_date(self, subdir_path_str: str) -> None:
    """Move every CSV file in one date-named subdirectory to raw storage.

    The last path component of ``subdir_path_str`` is parsed as an ISO date
    (``YYYY-MM-DD``) and converted to a ``YYYY/MM/DD/`` subdirectory. Each
    ``*.csv`` file listed directly under the subdirectory is moved to::

        gs://<bucket>/<region_code>/<RAW_DATA value>/<YYYY>/<MM>/<DD>/<name>

    with the path passed through
    ``to_normalized_processed_file_path_from_normalized_path`` using the
    RAW_DATA file type override.

    In dry-run mode nothing is moved, but each (source, destination) pair is
    still appended to ``self.move_list`` and the progress indicator, if any,
    is advanced.

    Args:
        subdir_path_str: Absolute path of the date-named subdirectory, with
            or without a trailing slash.

    Raises:
        ValueError: If the last path component is not an ISO-format date
            (raised by ``date.fromisoformat``).
    """
    # Normalize once so the directory path and the listing glob agree
    # whether or not the caller supplied a trailing slash. Without this, a
    # slash-less input would produce "<dir>*.csv", which matches nothing
    # inside the directory.
    subdir_path = subdir_path_str.rstrip("/")
    from_dir_path = GcsfsDirectoryPath.from_absolute_path(subdir_path)

    previous_date_format = from_dir_path.relative_path.rstrip("/").split(
        "/")[-1]
    new_date_format = date.fromisoformat(previous_date_format).strftime(
        "%Y/%m/%d/")

    bucket_name = self.region_storage_dir_path.bucket_name
    for from_path in gsutil_ls(f"{subdir_path}/*.csv"):
        # Only the final path component is needed; GcsfsFilePath is used
        # here purely to extract it from the listed path.
        file_name = GcsfsFilePath(
            bucket_name=bucket_name,
            blob_name=from_path,
        ).file_name

        to_file_path = os.path.join(
            "gs://",
            bucket_name,
            self.region_code,
            GcsfsDirectIngestFileType.RAW_DATA.value,
            new_date_format,
            file_name,
        )
        to_path = to_normalized_processed_file_path_from_normalized_path(
            to_file_path,
            file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

        if not self.dry_run:
            gsutil_mv(from_path=from_path, to_path=to_path)

        # The move log and progress indicator are shared, so update them
        # under the mutex.
        with self.mutex:
            self.move_list.append((from_path, to_path))
            if self.move_progress:
                self.move_progress.next()