def __init__(
     self,
     region_code: str,
     start_date_bound: Optional[str],
     end_date_bound: Optional[str],
     dry_run: bool,
     project_id: str,
 ):
     self.region_code = region_code
     self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
     self.start_date_bound = start_date_bound
     self.end_date_bound = end_date_bound
     self.dry_run = dry_run
     self.project_id = project_id
     self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code, SystemLevel.STATE, project_id=self.project_id))
     self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code,
             SystemLevel.STATE,
             GcsfsDirectIngestFileType.RAW_DATA,
             project_id=self.project_id,
         ))
     self.log_output_path = os.path.join(
         os.path.dirname(__file__),
         f"move_storage_files_from_unspecified_to_raw_start_bound_{self.region_code}_region_{self.start_date_bound}"
         f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
     )
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
 def __init__(
     self,
     region_code: str,
     dry_run: bool,
 ):
     self.region_code = region_code
     self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
     self.dry_run = dry_run
     self.project_id = 'recidiviz-123'
     self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_directory_path_for_region(
             region_code, SystemLevel.STATE, project_id=self.project_id))
     self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code,
             SystemLevel.STATE,
             GcsfsDirectIngestFileType.RAW_DATA,
             project_id=self.project_id))
     self.log_output_path = os.path.join(
         os.path.dirname(__file__),
         f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
         f'{datetime.datetime.now().isoformat()}.txt')
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
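Both constructors above write a per-run log next to the script, with an ISO timestamp baked into the filename. A minimal sketch of that filename construction, using hypothetical stand-in values (os.getcwd() substitutes for os.path.dirname(__file__)):

import datetime
import os

# Hypothetical stand-ins for the constructor arguments above.
region_code = "us_xx"
dry_run = True

log_output_path = os.path.join(
    os.getcwd(),  # stand-in for os.path.dirname(__file__)
    f"move_prod_ingest_files_to_raw_start_bound_{region_code}_region_dry_run_{dry_run}_"
    f"{datetime.datetime.now().isoformat()}.txt",
)
print(log_output_path)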
Example #3
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[str] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id
                )
            )
            if gcs_destination_path is None
            else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )
    def __init__(
        self,
        region_code: str,
        file_type: GcsfsDirectIngestFileType,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
    ):
        self.file_type = file_type
        self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id="recidiviz-123"
            )
        )
        self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id="recidiviz-staging"
            )
        )
        self.dry_run = dry_run
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound

        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
        self.mutex = threading.Lock()
        self.copy_list: List[Tuple[str, str]] = []
        self.copy_progress: Optional[Bar] = None
    def test_get_configs_for_export_name(
            self, mock_environment: mock.MagicMock) -> None:
        """Tests get_configs_for_export_name function to ensure that export names correctly match"""

        mock_environment.return_value = "production"
        export_configs_for_filter = view_export_manager.get_configs_for_export_name(
            export_name=self.mock_export_name,
            state_code=self.mock_state_code,
            project_id=self.mock_project_id,
        )
        view = self.mock_view_builder.build()
        metric_view = self.mock_metric_view_builder.build()

        expected_view_config_list = [
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_big_query_view_namespace,
                view=view,
                view_filter_clause=
                f" WHERE state_code = '{self.mock_state_code}'",
                intermediate_table_name=
                f"{view.view_id}_table_{self.mock_state_code}",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code=self.mock_state_code,
                    )),
                export_output_formats=[ExportOutputFormatType.JSON],
            ),
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_big_query_view_namespace,
                view=metric_view,
                view_filter_clause=
                f" WHERE state_code = '{self.mock_state_code}'",
                intermediate_table_name=
                f"{view.view_id}_table_{self.mock_state_code}",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code=self.mock_state_code,
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            ),
        ]

        self.assertEqual(expected_view_config_list, export_configs_for_filter)

        # Test for case insensitivity

        export_configs_for_filter = view_export_manager.get_configs_for_export_name(
            export_name=self.mock_export_name.lower(),
            state_code=self.mock_state_code.lower(),
            project_id=self.mock_project_id,
        )
        self.assertEqual(expected_view_config_list, export_configs_for_filter)
Example #6
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        self.temp_output_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())

        ingest_job_file_type_filter = \
            GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self.get_file_tag_rank_list(),
                ingest_job_file_type_filter)

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Example #7
    def setUp(self) -> None:
        self.mock_bq_client = mock.create_autospec(BigQueryClient)
        self.mock_validator = mock.create_autospec(BigQueryViewExportValidator)

        self.mock_project_id = "fake-project"

        self.metadata_patcher = mock.patch(
            "recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = self.mock_project_id

        self.view_builder = SimpleBigQueryViewBuilder(
            dataset_id="test_dataset",
            view_id="test_view",
            view_query_template="SELECT NULL LIMIT 0",
        )
        self.second_view_builder = SimpleBigQueryViewBuilder(
            dataset_id="test_dataset",
            view_id="test_view_2",
            view_query_template="SELECT NULL LIMIT 0",
        )
        self.view_export_configs = [
            ExportBigQueryViewConfig(
                view=self.view_builder.build(),
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=
                f"{self.view_builder.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.HEADERLESS_CSV,
                ],
            ),
            ExportBigQueryViewConfig(
                view=self.second_view_builder.build(),
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=
                f"{self.second_view_builder.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.CSV,
                ],
            ),
        ]
    def test_export_dashboard_data_to_cloud_storage(
            self, mock_view_exporter,
            mock_view_update_manager_rematerialize) -> None:
        """Tests the table is created from the view and then extracted."""
        view_export_manager.export_view_data_to_cloud_storage(
            self.mock_state_code, mock_view_exporter)

        view = self.mock_view_builder.build()
        metric_view = self.mock_metric_view_builder.build()

        view_export_configs = [
            ExportBigQueryViewConfig(
                view=view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[ExportOutputFormatType.JSON],
            ),
            ExportBigQueryViewConfig(
                view=metric_view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            ),
        ]

        mock_view_update_manager_rematerialize.assert_called()
        mock_view_exporter.export_and_validate.assert_has_calls(
            [
                mock.call([]),  # CSV export
                mock.call([
                    view_export_configs[1].pointed_to_staging_subdirectory()
                ]),  # METRIC export
                mock.call([
                    conf.pointed_to_staging_subdirectory()
                    for conf in view_export_configs
                ]),  # JSON export
            ],
            any_order=True,
        )
    def setUp(self) -> None:
        self.project_id = "recidiviz-456"
        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.test_region = fake_region(
            region_code="us_xx", are_raw_data_bq_imports_enabled_in_env=True)

        self.region_module_patcher = patch.object(
            direct_ingest_raw_table_migration_collector,
            "regions",
            new=controller_fixtures,
        )
        self.region_module_patcher.start()

        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name="direct/controllers/fixtures")
        self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
            self.mock_import_raw_file_to_big_query)

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client,
        )
        self.import_manager.csv_reader = _TestSafeGcsCsvReader(
            self.fs.gcs_file_system)

        self.time_patcher = patch(
            "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
Example #10
    def __init__(
        self,
        project_id: str,
        region: str,
        file_type_to_move: GcsfsDirectIngestFileType,
        destination_file_type: GcsfsDirectIngestFileType,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        file_filter: Optional[str],
    ):

        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        if (
            self.file_type_to_move != self.destination_file_type
            and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
        ):
            raise ValueError(
                "Args file_type_to_move and destination_file_type must match if type to move is UNSPECIFIED"
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
    def test_metric_export_state_agnostic(self):
        """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig class when the
        export is state-agnostic."""
        state_agnostic_dataset_export_config = ExportMetricDatasetConfig(
            dataset_id='dataset_id',
            metric_view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template=
            "gs://{project_id}-bucket-without-state-codes",
            state_code_filter=None,
            export_name=None)

        view_configs_to_export = state_agnostic_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id)

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportMetricBigQueryViewConfig(
                view=expected_view,
                view_filter_clause=None,
                intermediate_table_name=f"{expected_view.view_id}_table",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    state_agnostic_dataset_export_config.
                    output_directory_uri_template.format(
                        project_id=self.mock_project_id, )))
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
    def test_metric_export_state_specific(self):
        """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig class when the
        export is state-specific."""
        specific_state_dataset_export_config = ExportMetricDatasetConfig(
            dataset_id='dataset_id',
            metric_view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template="gs://{project_id}-bucket",
            state_code_filter='US_XX',
            export_name=None)

        view_configs_to_export = specific_state_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id)

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportMetricBigQueryViewConfig(
                view=expected_view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    f"gs://{self.mock_project_id}-bucket/US_XX"))
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
Example #13
    def test_metric_export_lantern_dashboard(self) -> None:
        """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig class when the
        export is state-agnostic."""
        lantern_dashboard_dataset_export_config = ExportViewCollectionConfig(
            view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template=
            "gs://{project_id}-bucket-without-state-codes",
            export_name="TEST_EXPORT",
            bq_view_namespace=self.mock_big_query_view_namespace,
        )

        view_configs_to_export = (lantern_dashboard_dataset_export_config.
                                  export_configs_for_views_to_export(
                                      project_id=self.mock_project_id, ))

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_big_query_view_namespace,
                view=expected_view,
                view_filter_clause=None,
                intermediate_table_name=f"{expected_view.view_id}_table",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    lantern_dashboard_dataset_export_config.
                    output_directory_uri_template.format(
                        project_id=self.mock_project_id, )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            )
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
    def test_export_dashboard_data_to_cloud_storage_validation_error(self,
                                                                     mock_view_exporter,
                                                                     mock_view_update_manager):
        """Tests the table is created from the view and then extracted."""

        mock_view_exporter.export_and_validate.side_effect = ViewExportValidationError

        # Should not throw
        metric_view_export_manager.export_view_data_to_cloud_storage(mock_state_code, mock_view_exporter)

        view = self.mock_view_builder.build()

        view_export_configs = [ExportMetricBigQueryViewConfig(
            view=view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code='US_XX',
                )
            )
        )]

        mock_view_update_manager.assert_called()
        mock_view_exporter.export_and_validate.assert_called_with(view_export_configs)
Example #15
 def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
     """Returns the appropriate paths to upload and the proper associated timestamp that
     it is to be normalized with. Skips any files that are not properly supported."""
     path_candidates = []
     for path, timestamp in self.paths_with_timestamps:
         if self.gcsfs.is_dir(path):
             directory = GcsfsDirectoryPath.from_absolute_path(path)
             files_in_directory = self.gcsfs.ls_with_blob_prefix(
                 bucket_name=directory.bucket_name,
                 blob_prefix=directory.relative_path,
             )
             for file in files_in_directory:
                 if self._is_supported_extension(file.abs_path()):
                     path_candidates.append((file.abs_path(), timestamp))
                 else:
                     self.skipped_files.append(file.abs_path())
         elif self.gcsfs.is_file(path):
             file = GcsfsFilePath.from_absolute_path(path)
             if self._is_supported_extension(file.abs_path()):
                 path_candidates.append((file.abs_path(), timestamp))
             else:
                 self.skipped_files.append(file.abs_path())
         else:
             logging.warning(
                 "Could not indicate %s as a directory or a file in %s. Skipping",
                 path,
                 self.destination_ingest_bucket.uri(),
             )
             self.unable_to_upload_files.append(path)
             continue
     return path_candidates
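_is_supported_extension is not shown in this example; a minimal stdlib-only sketch of the kind of check it performs, modeled on the SUPPORTED_EXTENSIONS filter in the later variant of this method (the allow-list contents here are hypothetical):

import os

SUPPORTED_EXTENSIONS = {".csv", ".txt"}  # hypothetical allow-list


def _is_supported_extension(path: str) -> bool:
    # Keep a path only if it has an extension and that extension is allowed.
    _, ext = os.path.splitext(path)
    return bool(ext) and ext in SUPPORTED_EXTENSIONS


print(_is_supported_extension("gs://bucket/file.csv"))  # True
print(_is_supported_extension("gs://bucket/README"))    # False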
Example #16
def gcsfs_direct_ingest_storage_directory_path_for_region(
    *,
    region_code: str,
    system_level: SystemLevel,
    ingest_instance: DirectIngestInstance,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    project_id: Optional[str] = None,
) -> GcsfsDirectoryPath:
    if project_id is None:
        project_id = metadata.project_id()
        if not project_id:
            raise ValueError("Project id not set")

    suffix = bucket_suffix_for_ingest_instance(ingest_instance)
    bucket_name = build_ingest_storage_bucket_name(
        project_id=project_id,
        system_level_str=system_level.value.lower(),
        suffix=suffix,
    )
    storage_bucket = GcsfsBucketPath(bucket_name)

    if file_type is not None:
        subdir = os.path.join(region_code.lower(), file_type.value)
    else:
        subdir = region_code.lower()
    return GcsfsDirectoryPath.from_dir_and_subdir(storage_bucket, subdir)
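The subdirectory inside the storage bucket is just the lowercased region code, optionally followed by the file type's value. A minimal sketch of that composition, with a hypothetical enum standing in for GcsfsDirectIngestFileType:

import os
from enum import Enum
from typing import Optional


class FileType(Enum):
    # Hypothetical stand-in for GcsfsDirectIngestFileType; the real value may differ.
    RAW_DATA = "raw"


def storage_subdir(region_code: str, file_type: Optional[FileType]) -> str:
    # Mirrors the branch at the end of the function above.
    if file_type is not None:
        return os.path.join(region_code.lower(), file_type.value)
    return region_code.lower()


print(storage_subdir("US_XX", FileType.RAW_DATA))  # us_xx/raw (POSIX separator)
print(storage_subdir("US_XX", None))               # us_xx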
    def test_metric_export_lantern_dashboard_with_state(self):
        """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig class when the
        export is state-specific."""
        lantern_dashboard_with_state_dataset_export_config = ExportViewCollectionConfig(
            view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template="gs://{project_id}-bucket",
            state_code_filter="US_XX",
            export_name="TEST_EXPORT",
            bq_view_namespace=self.mock_big_query_view_namespace,
        )

        view_configs_to_export = lantern_dashboard_with_state_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id
        )

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportBigQueryViewConfig(
                view=expected_view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    f"gs://{self.mock_project_id}-bucket/US_XX"
                ),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            )
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
Example #18
    def export_configs_for_views_to_export(
            self, project_id: str) -> Sequence[ExportBigQueryViewConfig]:
        """Builds a list of ExportBigQueryViewConfig that define how all views in
        view_builders_to_export should be exported to Google Cloud Storage."""
        view_filter_clause = (f" WHERE state_code = '{self.state_code_filter}'"
                              if self.state_code_filter else None)

        intermediate_table_name = "{export_view_name}_table"
        output_directory = self.output_directory_uri_template.format(
            project_id=project_id)

        if self.state_code_filter:
            intermediate_table_name += f"_{self.state_code_filter}"
            output_directory += f"/{self.state_code_filter}"

        configs = []
        for vb in self.view_builders_to_export:
            view = vb.build()
            optional_args = {}
            if self.export_output_formats is not None:
                optional_args[
                    "export_output_formats"] = self.export_output_formats
            configs.append(
                ExportBigQueryViewConfig(
                    view=view,
                    view_filter_clause=view_filter_clause,
                    intermediate_table_name=intermediate_table_name.format(
                        export_view_name=view.view_id),
                    output_directory=GcsfsDirectoryPath.from_absolute_path(
                        output_directory),
                    **optional_args,
                ))
        return configs
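The intermediate table name and output directory above come from simple string templates that gain a state-code suffix when a filter is set. A minimal sketch of that templating, with placeholder values:

# Placeholder values standing in for the config's fields.
state_code_filter = "US_XX"
output_directory_uri_template = "gs://{project_id}-bucket"
view_id = "my_view"

intermediate_table_name = "{export_view_name}_table"
output_directory = output_directory_uri_template.format(project_id="my-project")

if state_code_filter:
    intermediate_table_name += f"_{state_code_filter}"
    output_directory += f"/{state_code_filter}"

print(intermediate_table_name.format(export_view_name=view_id))  # my_view_table_US_XX
print(output_directory)                                          # gs://my-project-bucket/US_XX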
Example #19
 def create_export_manager(
     self,
     region: Region,
     is_detect_row_deletion_view: bool = False,
     materialize_raw_data_table_views: bool = False,
     controller_file_tags: Optional[List[str]] = None,
 ) -> DirectIngestIngestViewExportManager:
     metadata_manager = PostgresDirectIngestFileMetadataManager(
         region.region_code)
     controller_file_tags = (["ingest_view"] if controller_file_tags is None
                             else controller_file_tags)
     return DirectIngestIngestViewExportManager(
         region=region,
         fs=FakeGCSFileSystem(),
         ingest_directory_path=GcsfsDirectoryPath.from_absolute_path(
             "ingest_bucket"),
         big_query_client=self.mock_client,
         file_metadata_manager=metadata_manager,
         view_collector=_ViewCollector(  # type: ignore[arg-type]
             region,
             controller_file_tags=controller_file_tags,
             is_detect_row_deletion_view=is_detect_row_deletion_view,
             materialize_raw_data_table_views=
             materialize_raw_data_table_views,
         ),
         launched_file_tags=controller_file_tags,
     )
 def __init__(self, file_type: GcsfsDirectIngestFileType, region_code: str,
              start_date_bound: Optional[str],
              end_date_bound: Optional[str], dry_run: bool, project_id: str,
              file_filter: Optional[str]):
     self.file_type = file_type
     self.region_code = region_code
     self.start_date_bound = start_date_bound
     self.end_date_bound = end_date_bound
     self.dry_run = dry_run
     self.file_filter = file_filter
     self.project_id = project_id
     self.region_storage_dir_path_for_file_type = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code,
             SystemLevel.STATE,
             self.file_type,
             project_id=self.project_id))
     self.log_output_path = os.path.join(
         os.path.dirname(__file__),
         f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
         f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
     )
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
    def export_configs_for_views_to_export(self, project_id: str) -> Sequence[ExportMetricBigQueryViewConfig]:
        """Builds a list of ExportMetricBigQueryViewConfigs that define how all metric views in
        metric_view_builders_to_export should be exported to Google Cloud Storage."""
        view_filter_clause = (f" WHERE state_code = '{self.state_code_filter}'"
                              if self.state_code_filter else None)

        intermediate_table_name = "{export_view_name}_table"
        output_directory = self.output_directory_uri_template.format(
            project_id=project_id
        )

        if self.state_code_filter:
            intermediate_table_name += f"_{self.state_code_filter}"
            output_directory += f"/{self.state_code_filter}"

        return [
            ExportMetricBigQueryViewConfig(
                view=view,
                view_filter_clause=view_filter_clause,
                intermediate_table_name=intermediate_table_name.format(
                    export_view_name=view.view_id
                ),
                output_directory=GcsfsDirectoryPath.from_absolute_path(output_directory),
            )
            for view in [vb.build() for vb in self.metric_view_builders_to_export]
        ]
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
        self.skipped_files: List[str] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (gcsfs_sftp_download_bucket_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id) if
                       gcs_destination_path is None else gcs_destination_path)
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY)

        self.postgres_direct_ingest_file_metadata_manager = (
            PostgresDirectIngestRawFileMetadataManager(
                region,
                DirectIngestInstance.PRIMARY.database_version(
                    SystemLevel.STATE,
                    state_code=StateCode(self.region.upper())).name,
            ))
def build_path(bucket_template: str, state: str,
               pdf_name: str) -> GcsfsFilePath:
    return GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(bucket_template.format(metadata.project_id()),
                           state),
        pdf_name,
    )
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
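The date-partitioned destination directory above is produced by re-formatting the ISO date parsed from the normalized file path. A minimal sketch of that conversion with a made-up date string:

from datetime import date

previous_date_format = "2021-03-15"  # hypothetical date_str from a file path
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
print(new_date_format)  # 2021/03/15/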
Example #25
    def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
        """Returns the appropriate paths to upload and the proper associated timestamp that
        it is to be normalized with. Skips any files that are not properly supported."""
        path_candidates = []
        for path, timestamp in self.paths_with_timestamps:
            if self.gcsfs.is_dir(path):
                directory = GcsfsDirectoryPath.from_absolute_path(path)
                files_in_directory = self.gcsfs.ls_with_blob_prefix(
                    bucket_name=directory.bucket_name,
                    blob_prefix=directory.relative_path,
                )
                for file in files_in_directory:
                    path_candidates.append((file.abs_path(), timestamp))
            elif self.gcsfs.is_file(path):
                file = GcsfsFilePath.from_absolute_path(path)
                path_candidates.append((file.abs_path(), timestamp))
            else:
                logging.warning(
                    "Could not indicate %s as a directory or a file in %s. Skipping",
                    path,
                    self.gcs_destination_path.uri(),
                )
                self.unable_to_upload_files.append(path)
                continue

        result = []
        for path, timestamp in path_candidates:
            _, ext = os.path.splitext(path)
            if not ext or ext not in self.SUPPORTED_EXTENSIONS:
                logging.info("Skipping file [%s] - invalid extension %s", path,
                             ext)
                continue
            result.append((path, timestamp))

        return result
    def setUp(self) -> None:
        self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = "project-id"

        self.mock_bq_view_namespace = BigQueryViewNamespace.STATE

        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view1",
            description="view1 description",
            view_query_template="select * from table",
            dimensions=("a", "b", "c"),
        ).build()

        export_config_one_staging = ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/staging/US_XX"),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view2",
            description="view2 description",
            view_query_template="select * from view2",
            dimensions=("d", "e", "f"),
        ).build()

        export_config_two_staging = ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/staging/US_XX"),
        )

        self.staging_paths = [
            export_config_one_staging.output_path("txt"),
            export_config_two_staging.output_path("txt"),
        ]
 def is_dir(self, path: str) -> bool:
     try:
         directory = GcsfsDirectoryPath.from_absolute_path(path)
         has_dir = self.ls_with_blob_prefix(
             bucket_name=directory.bucket_name,
             blob_prefix=directory.relative_path)
         return len(has_dir) > 0
     except ValueError:
         return False
Example #28
def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError("All of state, bucket, and filename must be provided")
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)

    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )

    # Providing a stream buffer to tabula reader does not work because it
    # tries to load the file into the local filesystem, since appengine is a
    # read only filesystem (except for the tmpdir) we download the file into
    # the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s", handle.local_file_path)

    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    for table, df in result.items():
        dao.write_df(table, df)

    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state), filename
    )
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK
Example #29
 def is_dir(self, path: str) -> bool:
     try:
         directory = GcsfsDirectoryPath.from_absolute_path(path)
         # If the directory is empty, has_dir will have 1 entry, which is the Blob representing the directory
         # Otherwise, if the directory doesn't exist on GCS, has_dir will return an empty list
         has_dir = self.ls_with_blob_prefix(
             bucket_name=directory.bucket_name,
             blob_prefix=directory.relative_path)
         return len(has_dir) > 0
     except ValueError:
         return False
Example #30
    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        gcs_destination_path: Optional[str] = None,
    ):
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.gcs_destination_path = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id))
            if gcs_destination_path is None else
            GcsfsDirectoryPath.from_absolute_path(gcs_destination_path))
        self.uploaded_files: List[str] = []
        self.unable_to_upload_files: List[str] = []