def __init__(
    self,
    project_id: str,
    region: str,
    lower_bound_update_datetime: Optional[datetime.datetime],
    gcs_destination_path: Optional[str] = None,
):
    """Configures an SFTP download run for a single region and project.

    If |gcs_destination_path| is not provided, the region's default
    direct-ingest bucket for |project_id| is used as the destination.
    """
    self.project_id = project_id
    self.region = region.lower()

    # Region-specific SFTP credentials and download behavior.
    self.auth = SftpAuth.for_region(region)
    self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

    # Bookkeeping populated as the run progresses.
    self.unable_to_download_items: List[str] = []
    self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
    self.lower_bound_update_datetime = lower_bound_update_datetime

    if gcs_destination_path is None:
        destination_path = gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    else:
        destination_path = gcs_destination_path
    self.bucket = GcsfsDirectoryPath.from_absolute_path(destination_path)

    self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
    )
def __init__(
    self,
    region_code: str,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    project_id: str,
):
    """Sets up a job that moves UNSPECIFIED-type storage files for
    |region_code| into the raw-data storage location, bounded by the given
    (optional) date strings."""
    self.region_code = region_code
    # The files being moved were never classified, hence UNSPECIFIED.
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.project_id = project_id

    # Root storage directory for the region (all file types).
    self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id
        )
    )
    # Storage directory that holds only RAW_DATA files.
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code,
            SystemLevel.STATE,
            GcsfsDirectIngestFileType.RAW_DATA,
            project_id=self.project_id,
        )
    )

    # Each run writes a uniquely-named move log next to this script.
    log_name = (
        f"move_storage_files_from_unspecified_to_raw_start_bound_{self.region_code}_region_{self.start_date_bound}"
        f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt"
    )
    self.log_output_path = os.path.join(os.path.dirname(__file__), log_name)

    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def __init__(
    self,
    region_code: str,
    file_type: GcsfsDirectIngestFileType,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
):
    """Sets up a job that copies storage files for |region_code| from the
    production project to staging, bounded by the given (optional) dates."""
    self.file_type = file_type

    # Source: production storage for this region.
    self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-123"
        )
    )
    # Destination: staging storage for this region.
    self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-staging"
        )
    )

    self.dry_run = dry_run
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound

    # Each run writes a uniquely-named result log next to this script.
    log_name = (
        f"copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt"
    )
    self.log_output_path = os.path.join(os.path.dirname(__file__), log_name)

    self.mutex = threading.Lock()
    self.copy_list: List[Tuple[str, str]] = []
    self.copy_progress: Optional[Bar] = None
def test_get_configs_for_export_name(
    self, mock_environment: mock.MagicMock
) -> None:
    """Tests get_configs_for_export_name function to ensure that export names
    correctly match"""
    mock_environment.return_value = "production"

    export_configs_for_filter = view_export_manager.get_configs_for_export_name(
        export_name=self.mock_export_name,
        state_code=self.mock_state_code,
        project_id=self.mock_project_id,
    )

    view = self.mock_view_builder.build()
    metric_view = self.mock_metric_view_builder.build()

    # Both expected configs share the same filter clause, intermediate table
    # name and output directory.
    filter_clause = f" WHERE state_code = '{self.mock_state_code}'"
    table_name = f"{view.view_id}_table_{self.mock_state_code}"
    output_directory = GcsfsDirectoryPath.from_absolute_path(
        f"gs://{self.mock_project_id}-dataset-location/subdirectory/{self.mock_state_code}"
    )

    expected_view_config_list = [
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_big_query_view_namespace,
            view=view,
            view_filter_clause=filter_clause,
            intermediate_table_name=table_name,
            output_directory=output_directory,
            export_output_formats=[ExportOutputFormatType.JSON],
        ),
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_big_query_view_namespace,
            view=metric_view,
            view_filter_clause=filter_clause,
            intermediate_table_name=table_name,
            output_directory=output_directory,
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        ),
    ]

    self.assertEqual(expected_view_config_list, export_configs_for_filter)

    # Test for case insensitivity
    export_configs_for_filter = view_export_manager.get_configs_for_export_name(
        export_name=self.mock_export_name.lower(),
        state_code=self.mock_state_code.lower(),
        project_id=self.mock_project_id,
    )
    self.assertEqual(expected_view_config_list, export_configs_for_filter)
def __init__(self, region_name: str, system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    """Configures a direct-ingest controller for |region_name|.

    Args:
        region_name: Code of the region this controller ingests for.
        system_level: The system level (e.g. STATE) this region belongs to.
        ingest_directory_path: Optional override for the ingest bucket;
            defaults to the region's standard direct-ingest path.
        storage_directory_path: Optional override for the storage bucket;
            defaults to the region's standard storage path.
        max_delay_sec_between_files: Optional throttle between file jobs.
    """
    super().__init__(region_name, system_level)
    self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.max_delay_sec_between_files = max_delay_sec_between_files

    # Fall back to the region's default ingest bucket when no override given.
    if not ingest_directory_path:
        ingest_directory_path = \
            gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                          system_level)
    self.ingest_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

    # Fall back to the region's default storage bucket when no override given.
    if not storage_directory_path:
        storage_directory_path = \
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level)
    self.storage_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

    self.temp_output_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())

    # When raw vs. ingest file-name detection is enabled for the region, only
    # INGEST_VIEW files are considered for ingest jobs; otherwise no filter.
    ingest_job_file_type_filter = \
        GcsfsDirectIngestFileType.INGEST_VIEW \
        if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
    self.file_prioritizer = \
        GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter)

    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

    # Tracks per-file ingest metadata in Postgres.
    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code)

    # Imports raw files from the ingest bucket into BigQuery.
    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl())

    # Exports ingest views from BigQuery back into the ingest bucket.
    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region,
            self.get_file_tag_rank_list()))
def setUp(self) -> None:
    """Creates mock BQ collaborators and two export configs used by the tests."""
    self.mock_bq_client = mock.create_autospec(BigQueryClient)
    self.mock_validator = mock.create_autospec(BigQueryViewExportValidator)

    self.mock_project_id = "fake-project"
    self.metadata_patcher = mock.patch("recidiviz.utils.metadata.project_id")
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = self.mock_project_id

    def _view_builder(view_id: str) -> SimpleBigQueryViewBuilder:
        # All test views share the same trivial query template.
        return SimpleBigQueryViewBuilder(
            dataset_id="test_dataset",
            view_id=view_id,
            view_query_template="SELECT NULL LIMIT 0",
        )

    self.view_builder = _view_builder("test_view")
    self.second_view_builder = _view_builder("test_view_2")

    # Both configs export to the same directory.
    output_directory = GcsfsDirectoryPath.from_absolute_path(
        f"gs://{self.mock_project_id}-dataset-location/subdirectory/US_XX"
    )

    self.view_export_configs = [
        ExportBigQueryViewConfig(
            view=self.view_builder.build(),
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{self.view_builder.view_id}_table_US_XX",
            output_directory=output_directory,
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.HEADERLESS_CSV,
            ],
        ),
        ExportBigQueryViewConfig(
            view=self.second_view_builder.build(),
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{self.second_view_builder.view_id}_table_US_XX",
            output_directory=output_directory,
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.CSV,
            ],
        ),
    ]
def test_export_dashboard_data_to_cloud_storage(
    self, mock_view_exporter, mock_view_update_manager_rematerialize
) -> None:
    """Tests the table is created from the view and then extracted."""
    view_export_manager.export_view_data_to_cloud_storage(
        self.mock_state_code, mock_view_exporter
    )

    view = self.mock_view_builder.build()
    metric_view = self.mock_metric_view_builder.build()

    output_directory = GcsfsDirectoryPath.from_absolute_path(
        f"gs://{self.mock_project_id}-dataset-location/subdirectory/US_XX"
    )

    view_export_configs = [
        ExportBigQueryViewConfig(
            view=view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=output_directory,
            export_output_formats=[ExportOutputFormatType.JSON],
        ),
        ExportBigQueryViewConfig(
            view=metric_view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            # NOTE(review): both configs key the table name off the first
            # view's id — presumably both builders share a view_id; verify.
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=output_directory,
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        ),
    ]

    mock_view_update_manager_rematerialize.assert_called()
    mock_view_exporter.export_and_validate.assert_has_calls(
        [
            mock.call([]),  # CSV export
            mock.call(
                [view_export_configs[1].pointed_to_staging_subdirectory()]
            ),  # JSON export
            mock.call(
                [
                    conf.pointed_to_staging_subdirectory()
                    for conf in view_export_configs
                ]
            ),  # METRIC export
        ],
        any_order=True,
    )
def __init__(
    self,
    project_id: str,
    region: str,
    file_type_to_move: GcsfsDirectIngestFileType,
    destination_file_type: GcsfsDirectIngestFileType,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    file_filter: Optional[str],
):
    """Sets up a job that moves storage files of |file_type_to_move| back to
    the region's ingest bucket as |destination_file_type|.

    Raises:
        ValueError: if the two file types differ while the type to move is
            not UNSPECIFIED — only UNSPECIFIED files may be re-typed on move.
    """
    self.project_id = project_id
    self.region = region
    self.file_type_to_move = file_type_to_move
    self.destination_file_type = destination_file_type

    if (
        self.file_type_to_move != self.destination_file_type
        and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
    ):
        # Fixed message: the original said the types "must match if type to
        # move is UNSPECIFIED", which inverts the condition actually checked —
        # a mismatch is only allowed when the type to move IS UNSPECIFIED.
        raise ValueError(
            "Args file_type_to_move and destination_file_type must match "
            "unless the type to move is UNSPECIFIED"
        )

    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    # Source: the region's storage bucket in this project.
    self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )
    # Destination: the region's ingest bucket in this project.
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )

    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []

    # Each run writes a uniquely-named move log next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )
def export_configs_for_views_to_export(self, project_id: str) -> Sequence[ExportMetricBigQueryViewConfig]:
    """Builds a list of ExportMetricBigQueryViewConfigs that define how all metric views in
    metric_view_builders_to_export should be exported to Google Cloud Storage."""
    if self.state_code_filter:
        view_filter_clause = f" WHERE state_code = '{self.state_code_filter}'"
    else:
        view_filter_clause = None

    table_name_template = "{export_view_name}_table"
    output_directory = self.output_directory_uri_template.format(
        project_id=project_id
    )

    if self.state_code_filter:
        # State-specific exports get their own table suffix and subdirectory.
        table_name_template += f"_{self.state_code_filter}"
        output_directory += f"/{self.state_code_filter}"

    configs = []
    for builder in self.metric_view_builders_to_export:
        view = builder.build()
        configs.append(
            ExportMetricBigQueryViewConfig(
                view=view,
                view_filter_clause=view_filter_clause,
                intermediate_table_name=table_name_template.format(
                    export_view_name=view.view_id
                ),
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    output_directory
                ),
            )
        )
    return configs
def test_metric_export_state_agnostic(self):
    """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig
    class when the export is state-agnostic."""
    state_agnostic_dataset_export_config = ExportMetricDatasetConfig(
        dataset_id='dataset_id',
        metric_view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket-without-state-codes",
        state_code_filter=None,
        export_name=None,
    )

    view_configs_to_export = (
        state_agnostic_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id
        )
    )

    expected_view = self.mock_view_builder.build()
    # The expected directory is the template with only the project filled in.
    expected_output_directory = GcsfsDirectoryPath.from_absolute_path(
        state_agnostic_dataset_export_config.output_directory_uri_template.format(
            project_id=self.mock_project_id,
        )
    )
    expected_view_export_configs = [
        ExportMetricBigQueryViewConfig(
            view=expected_view,
            view_filter_clause=None,
            intermediate_table_name=f"{expected_view.view_id}_table",
            output_directory=expected_output_directory,
        )
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)
def test_metric_export_lantern_dashboard(self) -> None:
    """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig
    class when the export is state-agnostic."""
    lantern_dashboard_dataset_export_config = ExportViewCollectionConfig(
        view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket-without-state-codes",
        export_name="TEST_EXPORT",
        bq_view_namespace=self.mock_big_query_view_namespace,
    )

    view_configs_to_export = (
        lantern_dashboard_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id,
        )
    )

    expected_view = self.mock_view_builder.build()
    # The expected directory is the template with only the project filled in.
    expected_output_directory = GcsfsDirectoryPath.from_absolute_path(
        lantern_dashboard_dataset_export_config.output_directory_uri_template.format(
            project_id=self.mock_project_id,
        )
    )
    expected_view_export_configs = [
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_big_query_view_namespace,
            view=expected_view,
            view_filter_clause=None,
            intermediate_table_name=f"{expected_view.view_id}_table",
            output_directory=expected_output_directory,
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        )
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)
def test_export_dashboard_data_to_cloud_storage_validation_error(self, mock_view_exporter,
                                                                 mock_view_update_manager):
    """Tests the table is created from the view and then extracted."""
    mock_view_exporter.export_and_validate.side_effect = ViewExportValidationError

    # Should not throw
    metric_view_export_manager.export_view_data_to_cloud_storage(mock_state_code, mock_view_exporter)

    view = self.mock_view_builder.build()

    expected_configs = [
        ExportMetricBigQueryViewConfig(
            view=view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code='US_XX',
                )
            ),
        )
    ]

    mock_view_update_manager.assert_called()
    mock_view_exporter.export_and_validate.assert_called_with(expected_configs)
def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
    """Returns the appropriate paths to upload and the proper associated timestamp that
    it is to be normalized with. Skips any files that are not properly supported."""
    path_candidates = []
    for path, timestamp in self.paths_with_timestamps:
        if self.gcsfs.is_dir(path):
            # Expand the directory and consider every contained blob.
            directory = GcsfsDirectoryPath.from_absolute_path(path)
            contained_files = self.gcsfs.ls_with_blob_prefix(
                bucket_name=directory.bucket_name,
                blob_prefix=directory.relative_path,
            )
            for contained_file in contained_files:
                abs_path = contained_file.abs_path()
                if self._is_supported_extension(abs_path):
                    path_candidates.append((abs_path, timestamp))
                else:
                    self.skipped_files.append(abs_path)
        elif self.gcsfs.is_file(path):
            abs_path = GcsfsFilePath.from_absolute_path(path).abs_path()
            if self._is_supported_extension(abs_path):
                path_candidates.append((abs_path, timestamp))
            else:
                self.skipped_files.append(abs_path)
        else:
            # Neither a directory nor a file — record it as un-uploadable.
            logging.warning(
                "Could not indicate %s as a directory or a file in %s. Skipping",
                path,
                self.destination_ingest_bucket.uri(),
            )
            self.unable_to_upload_files.append(path)
    return path_candidates
def test_metric_export_state_specific(self):
    """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig
    class when the export is state-specific."""
    specific_state_dataset_export_config = ExportMetricDatasetConfig(
        dataset_id='dataset_id',
        metric_view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket",
        state_code_filter='US_XX',
        export_name=None,
    )

    view_configs_to_export = (
        specific_state_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id
        )
    )

    expected_view = self.mock_view_builder.build()
    expected_view_export_configs = [
        ExportMetricBigQueryViewConfig(
            view=expected_view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
            # State-specific exports land in a state subdirectory.
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                f"gs://{self.mock_project_id}-bucket/US_XX"
            ),
        )
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)
def test_metric_export_lantern_dashboard_with_state(self):
    """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig
    class when the export is state-specific."""
    lantern_dashboard_with_state_dataset_export_config = ExportViewCollectionConfig(
        view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket",
        state_code_filter="US_XX",
        export_name="TEST_EXPORT",
        bq_view_namespace=self.mock_big_query_view_namespace,
    )

    view_configs_to_export = (
        lantern_dashboard_with_state_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id
        )
    )

    expected_view = self.mock_view_builder.build()
    expected_view_export_configs = [
        ExportBigQueryViewConfig(
            view=expected_view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
            # State-specific exports land in a state subdirectory.
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                f"gs://{self.mock_project_id}-bucket/US_XX"
            ),
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        )
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)
def export_configs_for_views_to_export(
        self, project_id: str) -> Sequence[ExportBigQueryViewConfig]:
    """Builds a list of ExportBigQueryViewConfig that define how all views in
    view_builders_to_export should be exported to Google Cloud Storage."""
    if self.state_code_filter:
        view_filter_clause = f" WHERE state_code = '{self.state_code_filter}'"
    else:
        view_filter_clause = None

    table_name_template = "{export_view_name}_table"
    output_directory = self.output_directory_uri_template.format(
        project_id=project_id)

    if self.state_code_filter:
        # State-specific exports get their own table suffix and subdirectory.
        table_name_template += f"_{self.state_code_filter}"
        output_directory += f"/{self.state_code_filter}"

    # export_output_formats is the same for every view, so build the optional
    # kwargs once outside the loop.
    extra_kwargs = {}
    if self.export_output_formats is not None:
        extra_kwargs["export_output_formats"] = self.export_output_formats

    return [
        ExportBigQueryViewConfig(
            view=view,
            view_filter_clause=view_filter_clause,
            intermediate_table_name=table_name_template.format(
                export_view_name=view.view_id),
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                output_directory),
            **extra_kwargs,
        )
        for view in (vb.build() for vb in self.view_builders_to_export)
    ]
def __init__(self, file_type: GcsfsDirectIngestFileType, region_code: str,
             start_date_bound: Optional[str], end_date_bound: Optional[str],
             dry_run: bool, project_id: str, file_filter: Optional[str]):
    """Sets up a job that moves |file_type| storage files for |region_code|
    into a deprecated location, bounded by the given (optional) dates."""
    self.file_type = file_type
    self.region_code = region_code
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.project_id = project_id

    # Storage directory scoped to this region AND this file type.
    self.region_storage_dir_path_for_file_type = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, self.file_type,
            project_id=self.project_id))

    # Each run writes a uniquely-named move log next to this script.
    log_name = (
        f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
        f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
    self.log_output_path = os.path.join(os.path.dirname(__file__), log_name)

    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def create_export_manager(
    self,
    region: Region,
    is_detect_row_deletion_view: bool = False,
    materialize_raw_data_table_views: bool = False,
    controller_file_tags: Optional[List[str]] = None,
) -> DirectIngestIngestViewExportManager:
    """Builds an ingest-view export manager wired to fake/mock collaborators
    for use in tests. Defaults to a single "ingest_view" file tag."""
    metadata_manager = PostgresDirectIngestFileMetadataManager(
        region.region_code
    )
    if controller_file_tags is None:
        controller_file_tags = ["ingest_view"]
    return DirectIngestIngestViewExportManager(
        region=region,
        fs=FakeGCSFileSystem(),
        ingest_directory_path=GcsfsDirectoryPath.from_absolute_path(
            "ingest_bucket"
        ),
        big_query_client=self.mock_client,
        file_metadata_manager=metadata_manager,
        view_collector=_ViewCollector(  # type: ignore[arg-type]
            region,
            controller_file_tags=controller_file_tags,
            is_detect_row_deletion_view=is_detect_row_deletion_view,
            materialize_raw_data_table_views=materialize_raw_data_table_views,
        ),
        launched_file_tags=controller_file_tags,
    )
def setUp(self) -> None:
    """Creates two staged export configs and records their output paths."""
    self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = "project-id"
    self.mock_bq_view_namespace = BigQueryViewNamespace.STATE

    view_one = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view1",
        description="view1 description",
        view_query_template="select * from table",
        dimensions=("a", "b", "c"),
    ).build()
    config_one_staging = ExportBigQueryViewConfig(
        bq_view_namespace=self.mock_bq_view_namespace,
        view=view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/staging/US_XX"
        ),
    )

    view_two = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view2",
        description="view2 description",
        view_query_template="select * from view2",
        dimensions=("d", "e", "f"),
    ).build()
    config_two_staging = ExportBigQueryViewConfig(
        bq_view_namespace=self.mock_bq_view_namespace,
        view=view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/staging/US_XX"
        ),
    )

    self.staging_paths = [
        config_one_staging.output_path("txt"),
        config_two_staging.output_path("txt"),
    ]
def is_dir(self, path: str) -> bool:
    """Returns True if |path| names a GCS "directory" prefix that contains at
    least one blob; False when the path cannot be parsed as a directory or
    the prefix is empty."""
    try:
        directory = GcsfsDirectoryPath.from_absolute_path(path)
        contents = self.ls_with_blob_prefix(
            bucket_name=directory.bucket_name,
            blob_prefix=directory.relative_path,
        )
        return bool(contents)
    except ValueError:
        # Not parseable as a directory path (e.g. it names a file).
        return False
def gcsfs_direct_ingest_temporary_output_directory_path(
    project_id: Optional[str] = None,
) -> GcsfsDirectoryPath:
    """Returns the GCS directory that holds temporary direct-ingest output
    files for |project_id| (defaults to the current metadata project id).

    Raises:
        ValueError: if no project id is supplied and none is set in metadata.
    """
    if project_id is None:
        project_id = metadata.project_id()
    if not project_id:
        raise ValueError("Project id not set")
    return GcsfsDirectoryPath.from_absolute_path(
        f"{project_id}-direct-ingest-temporary-files"
    )
def config_with_path(self, path: str) -> ExportBigQueryViewConfig:
    """Builds a throwaway export config whose output directory is gs://|path|."""
    test_view = SimpleBigQueryViewBuilder(
        dataset_id="test_dataset",
        view_id="test_view",
        view_query_template="you know",
    ).build()
    return ExportBigQueryViewConfig(
        view=test_view,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="tubular",
        output_directory=GcsfsDirectoryPath.from_absolute_path(f"gs://{path}"),
    )
def _copy_files_for_date(self, subdir_path_str: str) -> None:
    """Copies all files under the prod storage subdirectory |subdir_path_str|
    to the same relative path in staging (skipped in dry-run mode), recording
    the (from, to) pair under the mutex."""
    dir_path = GcsfsDirectoryPath.from_absolute_path(subdir_path_str.rstrip("/"))
    relative = dir_path.relative_path
    from_path = f"gs://{self.prod_region_storage_dir_path.bucket_name}/{relative}*"
    to_path = f"gs://{self.staging_region_storage_dir_path.bucket_name}/{relative}"

    if not self.dry_run:
        gsutil_cp(from_path=from_path, to_path=to_path)

    with self.mutex:
        self.copy_list.append((from_path, to_path))
        if self.copy_progress:
            self.copy_progress.next()
def direct_ingest_storage_directory(self) -> GcsfsDirectoryPath:
    """Returns the direct-ingest storage directory for this region: the
    primary-instance storage path when running in GCP, otherwise a
    recidiviz-staging bucket path used for local runs."""
    if in_gcp():
        return gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=self.region_code,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
        )
    # Local override
    local_path = (
        f"recidiviz-staging-direct-ingest-state-storage/{self.region_code.lower()}"
    )
    return GcsfsDirectoryPath.from_absolute_path(local_path)
def test_json_throws(self) -> None:
    """The JSON exporter must reject a config list containing a config with no
    JSON output format."""
    exporter = JsonLinesBigQueryViewExporter(self.mock_bq_client, self.mock_validator)

    output_directory = GcsfsDirectoryPath.from_absolute_path(
        f"gs://{self.mock_project_id}-dataset-location/subdirectory/US_XX"
    )

    view_export_configs = [
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=self.view_builder.build(),
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{self.view_builder.view_id}_table_US_XX",
            output_directory=output_directory,
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.HEADERLESS_CSV,
            ],
        ),
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=self.second_view_builder.build(),
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{self.second_view_builder.view_id}_table_US_XX",
            output_directory=output_directory,
            # No JSON format here — this is what should trigger the error.
            export_output_formats=[ExportOutputFormatType.METRIC],
        ),
    ]

    with self.assertRaises(ValueError):
        exporter.export(view_export_configs)
def setUp(self) -> None:
    """Creates two staged export configs and records their output paths."""
    self.metadata_patcher = patch('recidiviz.utils.metadata.project_id')
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = 'project-id'

    view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()
    config_one_staging = ExportBigQueryViewConfig(
        view=view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'
        ),
    )

    view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()
    config_two_staging = ExportBigQueryViewConfig(
        view=view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'
        ),
    )

    self.staging_paths = [
        config_one_staging.output_path('txt'),
        config_two_staging.output_path('txt'),
    ]