def _move_files(self, from_uri: str):
    curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
    previous_date_format = filename_parts_from_path(
        curr_gcsfs_file_path).date_str
    # Raw files are stored under a date-based subdirectory, e.g. "2021/03/15/".
    new_date_format = date.fromisoformat(previous_date_format).strftime(
        "%Y/%m/%d/")

    # Re-normalize the file name as a RAW_DATA file, preserving its
    # processed/unprocessed status.
    path_with_new_file_name = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path_from_normalized_path(
            from_uri, GcsfsDirectIngestFileType.RAW_DATA))
    if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_processed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

    raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
        self.region_storage_raw_dir_path, new_date_format)
    to_uri = GcsfsFilePath.from_directory_and_file_name(
        raw_dir_with_date, path_with_new_file_name.file_name).uri()

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()
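# Illustrative sketch (not part of the module above): the date-based storage
# subdirectory is derived from the file's ISO date string, so a date_str of
# "2021-03-15" becomes "2021/03/15/". Only the standard library is used here.
from datetime import date as _date_example

assert _date_example.fromisoformat("2021-03-15").strftime("%Y/%m/%d/") == "2021/03/15/"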
def run_export(project_id: str, dry_run: bool, state_code: str,
               target_bucket_suffix: str):
    """Performs the export operation, exporting rows for the given state code
    from the tables in the state dataset in the given project to CSV files in
    the given GCS bucket, named after the tables they were exported from."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f'Dataset {dataset_ref.dataset_id} does not exist')

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table, [state_code])
        logging.info(export_query)

        if not export_query:
            continue

        target_bucket_name = f'{project_id}-{target_bucket_suffix}'
        export_dir = gcs_export_directory(target_bucket_name, today, state_code)
        export_file_name = f'{table.table_id}_{today.isoformat()}_export.csv'
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id='export_temporary_tables',
            intermediate_table_name=f'{dataset_ref.dataset_id}_{table.table_id}',
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config)
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(export_configs)
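# Illustrative example (the table id and date are hypothetical) of the export
# file naming used above: a table id of "state_person" exported on 2021-03-15
# yields "state_person_2021-03-15_export.csv".
import datetime as _datetime_example

_table_id = 'state_person'
_today = _datetime_example.date(2021, 3, 15)
assert f'{_table_id}_{_today.isoformat()}_export.csv' == 'state_person_2021-03-15_export.csv'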
def _copy_to_ingest_bucket(self, path: str, normalized_file_name: str) -> None:
    full_file_upload_path_uri = GcsfsFilePath.from_directory_and_file_name(
        self.ingest_bucket, normalized_file_name).uri()

    if not self.dry_run:
        gsutil_cp(path, full_file_upload_path_uri)
    with self.mutex:
        self.copies_list.append((path, full_file_upload_path_uri))
        if self.move_progress:
            self.move_progress.next()
def _generate_output_path(self,
                          ingest_view_export_args: GcsfsIngestViewExportArgs,
                          metadata: DirectIngestIngestFileMetadata) -> GcsfsFilePath:
    ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
    if not metadata.normalized_file_name:
        output_file_name = to_normalized_unprocessed_file_name(
            f'{ingest_view.file_tag}.csv',
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=ingest_view_export_args.upper_bound_datetime_to_export
        )
    else:
        output_file_name = metadata.normalized_file_name

    return GcsfsFilePath.from_directory_and_file_name(self.ingest_directory_path,
                                                      output_file_name)
def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
    if isinstance(dst_path, GcsfsFilePath):
        path = dst_path
    elif isinstance(dst_path, GcsfsDirectoryPath):
        path = GcsfsFilePath.from_directory_and_file_name(
            dst_path, src_path.file_name)
    else:
        raise ValueError(f'Unexpected path type [{type(dst_path)}]')
    self._add_path(path)
def _create_split_file_path(self,
                            original_file_path: GcsfsFilePath,
                            output_dir: GcsfsDirectoryPath,
                            split_num: int) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)

    rank_str = str(split_num + 1).zfill(5)
    existing_suffix = f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    updated_file_name = (
        f'{parts.file_tag}{existing_suffix}_{rank_str}'
        f'_{SPLIT_FILE_SUFFIX}_size{self.file_split_line_limit}'
        f'.{parts.extension}')
    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(updated_file_name,
                                            dt=parts.utc_upload_datetime))
def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
    if isinstance(dst_path, GcsfsFilePath):
        path = dst_path
    elif isinstance(dst_path, GcsfsDirectoryPath):
        path = GcsfsFilePath.from_directory_and_file_name(
            dst_path, src_path.file_name)
    else:
        raise ValueError(f'Unexpected path type [{type(dst_path)}]')

    if src_path.abs_path() in self.uploaded_test_path_to_actual:
        self.uploaded_test_path_to_actual[dst_path.abs_path()] = \
            self.uploaded_test_path_to_actual[src_path.abs_path()]

    self._add_path(path)
def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
    # Look up the source blob and fail loudly if it does not exist.
    src_bucket = self.storage_client.get_bucket(src_path.bucket_name)
    src_blob = src_bucket.get_blob(src_path.blob_name)
    if not src_blob:
        raise ValueError(
            f'Blob at path [{src_path.abs_path()}] does not exist')
    dst_bucket = self.storage_client.get_bucket(dst_path.bucket_name)

    # A file destination is used as-is; a directory destination keeps the
    # source file name.
    if isinstance(dst_path, GcsfsFilePath):
        dst_blob_name = dst_path.blob_name
    elif isinstance(dst_path, GcsfsDirectoryPath):
        dst_blob_name = GcsfsFilePath.from_directory_and_file_name(
            dst_path, src_path.file_name).blob_name
    else:
        raise ValueError(f'Unexpected path type [{type(dst_path)}]')

    src_bucket.copy_blob(src_blob, dst_bucket, dst_blob_name)
def _create_split_file_path(self,
                            original_file_path: GcsfsFilePath,
                            output_dir: GcsfsDirectoryPath,
                            split_num: int) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)

    rank_str = str(split_num + 1).zfill(5)
    updated_file_name = (
        f'{parts.stripped_file_name}_{rank_str}'
        f'_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}'
        f'.{parts.extension}')

    file_type = GcsfsDirectIngestFileType.INGEST_VIEW \
        if self.region.is_raw_vs_ingest_file_name_detection_enabled() \
        else GcsfsDirectIngestFileType.UNSPECIFIED

    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(updated_file_name,
                                            file_type=file_type,
                                            dt=parts.utc_upload_datetime))
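# Illustrative example (the file tag, line limit, and extension are
# hypothetical) of the split-file naming used above; _SPLIT_FILE_SUFFIX is a
# placeholder standing in for whatever constant the module actually defines.
_SPLIT_FILE_SUFFIX = 'file_split'  # placeholder value for illustration only

_split_num = 0
_rank_str = str(_split_num + 1).zfill(5)  # -> '00001'
_updated_file_name = f'elite_offenders_{_rank_str}_{_SPLIT_FILE_SUFFIX}_size2500.csv'
assert _updated_file_name == 'elite_offenders_00001_file_split_size2500.csv'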
def get_output_path(self, chunk_num: int):
    name, _extension = os.path.splitext(self.path.file_name)
    return GcsfsFilePath.from_directory_and_file_name(
        self.temp_output_directory_path, f'temp_{name}_{chunk_num}.csv')