Example 1
    def test_skip_already_processed_or_discovered_files(
        self,
        mock_fs_factory: Mock,
    ) -> None:
        mock_fs = FakeGCSFileSystem()
        mock_fs.test_add_path(
            path=GcsfsFilePath.from_bucket_and_blob_name(
                "recidiviz-456-direct-ingest-state-us-xx",
                "raw_data/test_file.txt"),
            local_path=None,
        )
        mock_fs.test_add_path(
            path=GcsfsFilePath.from_bucket_and_blob_name(
                "recidiviz-456-direct-ingest-state-us-xx",
                "raw_data/test_file.csv"),
            local_path=None,
        )

        mock_fs.test_add_path(
            path=GcsfsFilePath.from_bucket_and_blob_name(
                "recidiviz-456-direct-ingest-state-us-xx",
                "raw_data/skipped.csv",
            ),
            local_path=None,
        )
        mock_fs.test_add_path(
            path=GcsfsFilePath.from_bucket_and_blob_name(
                "recidiviz-456-direct-ingest-state-us-xx",
                "raw_data/discovered.csv",
            ),
            local_path=None,
        )
        mock_fs_factory.return_value = mock_fs
        controller = UploadStateFilesToIngestBucketController(
            paths_with_timestamps=[
                (
                    "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.txt",
                    TODAY,
                ),
                (
                    "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.csv",
                    TODAY,
                ),
                (
                    "recidiviz-456-direct-ingest-state-us-xx/raw_data/skipped.csv",
                    TODAY,
                ),
                (
                    "recidiviz-456-direct-ingest-state-us-xx/raw_data/discovered.csv",
                    TODAY,
                ),
            ],
            project_id="recidiviz-456",
            region="us_xx",
        )
        result: MultiRequestResultWithSkipped[str, str,
                                              str] = controller.do_upload()
        self.assertListEqual(
            result.successes,
            [
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.txt",
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.csv",
            ],
        )
        self.assertListEqual(
            result.skipped,
            [
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/skipped.csv",
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/discovered.csv",
            ],
        )
        self.assertFalse(self.us_xx_manager.is_instance_paused())
Example 2
 def is_locked(self, name: str) -> bool:
     """Checks whether @param name is locked by checking if its lock file exists. Returns True if locked, False if unlocked."""
     path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
     return self.fs.exists(path)
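
The lock here is nothing more than the existence of a file at a known path. As a rough illustration of that idea in isolation, the sketch below uses a purely hypothetical in-memory stand-in (InMemoryLockFiles and its methods are invented for this example and are not part of the GCS pseudo-lock code):

# Hypothetical stand-in: a "lock" is held iff a file exists at its path.
class InMemoryLockFiles:
    def __init__(self) -> None:
        self._paths: set = set()

    def exists(self, path: str) -> bool:
        return path in self._paths

    def create(self, path: str) -> None:
        self._paths.add(path)


locks = InMemoryLockFiles()
assert not locks.exists("locks-bucket/ingest_lock")  # no file -> unlocked
locks.create("locks-bucket/ingest_lock")             # "acquire" by creating the file
assert locks.exists("locks-bucket/ingest_lock")      # file present -> locked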
Example 3
 def set_config_yaml(self, contents: str) -> None:
     path = GcsfsFilePath.from_absolute_path(
         f"gs://{self.mock_project_id}-configs/cloud_sql_to_bq_config.yaml")
     self.fake_gcs.upload_from_string(path=path,
                                      contents=contents,
                                      content_type="text/yaml")
Example 4
def gcs_path(filepath: str) -> GcsfsFilePath:
    return GcsfsFilePath.from_absolute_path(
        os.path.join("gs://justice_counts", filepath))
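
One caveat with os.path.join here: it joins with the host OS separator, so on Windows it could insert backslashes into the object name. A small hedged alternative (gcs_path_posix is illustrative only and returns a plain string rather than a GcsfsFilePath) uses posixpath, which always joins with '/':

import posixpath


def gcs_path_posix(filepath: str) -> str:
    # posixpath.join always uses '/', regardless of the host OS.
    return posixpath.join("gs://justice_counts", filepath)


assert gcs_path_posix("reports/2021.csv") == "gs://justice_counts/reports/2021.csv"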
Example 5
 def on_file_added(self, path: GcsfsFilePath) -> None:
     if path.abs_path().startswith(
             self.controller.ingest_bucket_path.abs_path()):
         self.controller.handle_file(path,
                                     start_ingest=self.can_start_ingest)
Example 6
def _test_get_local_file(file_path: GcsfsFilePath) -> str:
    local_path = os.path.join(
        os.path.realpath(os.path.dirname(os.path.realpath(__file__))), "auth_fixtures"
    )
    return Path(os.path.join(local_path, file_path.abs_path())).read_text()
Example 7
    def test_export_final_existence_validation_failed(self) -> None:
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/US_XX'),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/US_XX'),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        mock_bq_client = create_autospec(BigQueryClient)
        mock_fs = create_autospec(GCSFileSystem)

        # This should make the final existence validation fail, so export_and_validate raises a ViewExportValidationError
        mock_fs.exists.return_value = False

        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path('json'),
            export_config_two_staging.output_path('json')
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path('txt'),
            export_config_two_staging.output_path('txt')
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                                 [delegate_one, delegate_two])

        with pytest.raises(ViewExportValidationError) as e:
            exporter.export_and_validate(
                [export_config_one, export_config_two])

        # We get an error at the very end of the export chain because even though delegate validations passed, the
        # final validation failed
        self.assertIn(
            'Validation on path bucket1/US_XX/view1.json failed the metric file export. '
            'Stopping execution here.', str(e.value))

        # The delegate exporters validations all passed so we still copy from staging to final
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        mock_fs.copy.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt'))
        ])

        mock_fs.delete.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'))
        ])

        # Only one call to the Exists validation made because the first one failed
        mock_fs.exists.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
        ])
Example 8
 def output_path(self, extension: str) -> GcsfsFilePath:
     file_name = f'{self.view.view_id}.{extension}'
     return GcsfsFilePath.from_directory_and_file_name(self.output_directory, file_name)
Example 9
 def default_config_path() -> GcsfsFilePath:
     return GcsfsFilePath.from_absolute_path(
         f"gs://{metadata.project_id()}-configs/cloud_sql_to_bq_config.yaml"
     )
Example 10
 def test_post_process_downloads(self) -> None:
     result = self.delegate.post_process_downloads(
         GcsfsFilePath.from_absolute_path("test_bucket/test.txt"),
         FakeGCSFileSystem(),
     )
     self.assertEqual(result, "test_bucket/test.txt")
Example 11
 def revert_staging_path_to_original(staging_path: GcsfsFilePath) -> GcsfsFilePath:
     # Note: str.lstrip('staging/') would strip a *character set*, not the literal
     # prefix, so slice the 'staging/' prefix off explicitly instead.
     non_staging_relative_path = staging_path.blob_name[len('staging/'):]
     return GcsfsFilePath.from_absolute_path(f'{staging_path.bucket_name}/{non_staging_relative_path}')
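
For a standalone check of why the prefix must be sliced rather than lstrip-ped, the assertions below (pure Python, no GCS types involved) show that str.lstrip removes a character set, not a literal prefix:

# str.lstrip strips any leading characters found in the given set, so it can
# eat into the path that follows the 'staging/' prefix.
assert "staging/US_XX/view1.json".lstrip("staging/") == "US_XX/view1.json"  # happens to work
assert "staging/stats/view1.json".lstrip("staging/") == "view1.json"        # 'stats/' was eaten too
assert "staging/stats/view1.json"[len("staging/"):] == "stats/view1.json"   # slicing drops only the prefix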
Example 12
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no GcsfsIngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, GcsfsIngestArgs):
            raise DirectIngestError(
                msg=
                f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != ingest_args.file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {ingest_args.file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_args.file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                logging.warning(str(e))
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
Example 13
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg=
                "raw_data_import was called with no GcsfsRawDataBQImportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != data_import_args.raw_data_file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {data_import_args.raw_data_file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=data_import_args.raw_data_file_path.
                    bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK
Example 14
 def is_file(self, path: str) -> bool:
     try:
         file = GcsfsFilePath.from_absolute_path(path)
         return self.exists(file)
     except ValueError:
         return False
Example 15
 def test_direct_ingest_file_moves_with_file_types(self) -> None:
     self.fully_process_file(
         datetime.datetime.now(),
         GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv"),
     )
Example 16
    def test_export_happy_path(self) -> None:
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view1",
            view_query_template="select * from table",
            dimensions=("a", "b", "c"),
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/US_XX"),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/staging/US_XX"),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view2",
            view_query_template="select * from view2",
            dimensions=("d", "e", "f"),
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/US_XX"),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/staging/US_XX"),
        )

        mock_fs = create_autospec(GCSFileSystem)

        mock_fs.exists.return_value = True

        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path("json"),
            export_config_two_staging.output_path("json"),
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path("txt"),
            export_config_two_staging.output_path("txt"),
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        export_views_with_exporters(
            mock_fs,
            [export_config_one, export_config_two],
            {
                ExportOutputFormatType.JSON: delegate_one,
                ExportOutputFormatType.METRIC: delegate_two,
            },
        )

        # Assert all mocks called as expected
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        mock_fs.copy.assert_has_calls(
            [
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.json"),
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="US_XX/view1.json"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.json"),
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="US_XX/view2.json"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.txt"),
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="US_XX/view1.txt"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.txt"),
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="US_XX/view2.txt"),
                ),
            ],
            any_order=True,
        )

        mock_fs.delete.assert_has_calls(
            [
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.json")),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.json")),
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.txt")),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.txt")),
            ],
            any_order=True,
        )
Example 17
    def fully_process_file(self, dt: datetime.datetime,
                           path: GcsfsFilePath) -> None:
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            path,
            region_code=TEST_STATE_REGION.region_code,
            has_fixture=False,
        )

        start_num_total_files = len(self.fs.gcs_file_system.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, "", None)
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, "", None)

        start_raw_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            "",
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
        )
        start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            "",
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        # File is renamed to normalized path
        self.fs.mv_path_to_normalized_path(path,
                                           GcsfsDirectIngestFileType.RAW_DATA,
                                           dt)

        raw_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH,
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
        )
        self.assertEqual(len(raw_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

        # ... raw file imported to BQ

        processed_path = self.fs.mv_path_to_processed_path(raw_unprocessed[0])

        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH,
                                                     None)
        self.assertEqual(len(processed), 1)

        self.fs.copy(
            processed_path,
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path_from_normalized_path(
                    processed_path.abs_path(),
                    file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW,
                )),
        )
        self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH,
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(
            ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH,
                                                     None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        end_ingest_paths = self.fs._ls_with_file_prefix(self.INGEST_DIR_PATH,
                                                        "",
                                                        file_type_filter=None)
        end_storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                         "",
                                                         file_type_filter=None)
        end_raw_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            "",
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
        )
        end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            "",
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        # Each file gets re-exported as ingest view
        splitting_factor = 2

        expected_final_total_files = start_num_total_files + splitting_factor - 1
        self.assertEqual(len(self.fs.gcs_file_system.all_paths),
                         expected_final_total_files)
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + 1 * splitting_factor)
        self.assertEqual(
            len(end_raw_storage_paths) + len(end_ingest_view_storage_paths),
            len(end_storage_paths),
        )
        self.assertEqual(len(end_raw_storage_paths),
                         len(start_raw_storage_paths) + 1)
        self.assertEqual(
            len(end_ingest_view_storage_paths),
            len(start_ingest_view_storage_paths) + 1,
        )

        for sp in end_storage_paths:
            parts = filename_parts_from_path(sp)
            if sp.abs_path() not in {
                    p.abs_path()
                    for p in start_storage_paths
            }:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                dir_path, storage_file_name = os.path.split(sp.abs_path())

                self.assertTrue(parts.file_type.value in dir_path)
                name, _ = path.file_name.split(".")
                self.assertTrue(name in storage_file_name)
Example 18
def retrieve_data(state_code: str, report_type: str, batch_id: str) -> List[Recipient]:
    """Retrieves the data for email generation of the given report type for the given state.

    Get the data from Cloud Storage and return it in a list of dictionaries. Saves the data file into an archive
    bucket on completion, so that we have the ability to troubleshoot or re-generate a previous batch of emails
    later on.

    Args:
        state_code: State identifier used to retrieve appropriate data
        report_type: The type of report, used to determine the data file name
        batch_id: The identifier for this batch

    Returns:
        A list of recipient data dictionaries

    Raises:
        Non-recoverable errors that should stop execution. Attempts to catch and handle errors that are recoverable.
        Provides logging for debug purposes whenever possible.
    """
    data_bucket = utils.get_data_storage_bucket_name()
    data_filename = ""
    gcs_file_system = GcsfsFactory.build()
    try:
        data_filename = utils.get_data_filename(state_code, report_type)
        path = GcsfsFilePath.from_absolute_path(f"gs://{data_bucket}/{data_filename}")
        file_contents = gcs_file_system.download_as_string(path)
    except BaseException:
        logging.info("Unable to load data file %s/%s", data_bucket, data_filename)
        raise

    archive_bucket = utils.get_data_archive_bucket_name()
    archive_filename = ""
    try:
        archive_filename = utils.get_data_archive_filename(batch_id)
        archive_path = GcsfsFilePath.from_absolute_path(
            f"gs://{archive_bucket}/{archive_filename}"
        )
        gcs_file_system.upload_from_string(
            path=archive_path, contents=file_contents, content_type="text/json"
        )
    except Exception:
        logging.error(
            "Unable to archive the data file to %s/%s", archive_bucket, archive_filename
        )
        raise

    json_list = file_contents.splitlines()

    recipient_data: List[dict] = []
    for json_str in json_list:
        try:
            item = json.loads(json_str)
        except Exception as err:
            logging.error(
                "Unable to parse JSON found in the file %s. Offending json string is: '%s'. <%s> %s",
                data_filename,
                json_str,
                type(err).__name__,
                err,
            )
        else:
            recipient_data.append(item)

    logging.info(
        "Retrieved %s recipients from data file %s", len(recipient_data), data_filename
    )
    return [
        Recipient.from_report_json(
            {
                **recipient,
                utils.KEY_BATCH_ID: batch_id,
            }
        )
        for recipient in recipient_data
    ]
Example 19
    def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.temp_output_directory_path, f'temp_{name}_{chunk_num}.csv')
Example 20
    def test_filename_parts_from_path_unspecified_file_type(self) -> None:
        with self.assertRaises(DirectIngestError):
            filename_parts_from_path(
                GcsfsFilePath.from_absolute_path(
                    'bucket/us_ca_sf/elite_offenders.csv'))

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/unprocessed_2019-08-07T22:09:18:770655_'
                'elite_offenders.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-08-07T22:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-08-07')
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders_1split.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '1split')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        # Needs the actual file_split suffix to be a file split
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders_002_file_split.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '002_file_split')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'BrazosCounty_2019_09_25.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'BrazosCounty')
        self.assertEqual(parts.filename_suffix, '2019_09_25')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'BrazosCounty_2019_09_25_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'BrazosCounty')
        self.assertEqual(parts.filename_suffix,
                         '2019_09_25_002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
                'tak001_offender_identification.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'tak001_offender_identification')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
                'tak001_offender_identification_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'tak001_offender_identification')
        self.assertEqual(parts.filename_suffix, '002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'storage_bucket/region_subdir/2020-04-29/processed_2020-04-29T18:02:41:789323_test_file-(1).csv'
            ))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.UNSPECIFIED)
        self.assertEqual(parts.file_tag, 'test_file')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2020-04-29T18:02:41:789323'))
        self.assertEqual(parts.date_str, '2020-04-29')

        self.assertEqual(parts.is_file_split, False)
Example 21
    def test_export_happy_path(self) -> None:
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/US_XX'),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/US_XX'),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        mock_bq_client = create_autospec(BigQueryClient)
        mock_fs = create_autospec(GCSFileSystem)

        mock_fs.exists.return_value = True

        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path('json'),
            export_config_two_staging.output_path('json')
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path('txt'),
            export_config_two_staging.output_path('txt')
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                                 [delegate_one, delegate_two])
        exporter.export_and_validate([export_config_one, export_config_two])

        # Assert all mocks called as expected
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        mock_fs.copy.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt'))
        ])

        mock_fs.delete.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'))
        ])

        mock_fs.exists.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt')),
        ])
Example 22
 def _lock_body_for_lock(self, name: str) -> Optional[GCSPseudoLockBody]:
     path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
     return self._lock_body_for_path(path)
Example 23
    def test_filename_parts_from_path_with_file_type(self) -> None:
        with self.assertRaises(DirectIngestError):
            filename_parts_from_path(
                GcsfsFilePath.from_absolute_path("bucket/us_ca_sf/elite_offenders.csv")
            )

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-nd/unprocessed_2019-08-07T22:09:18:770655_"
                "raw_elite_offenders.csv"
            )
        )

        self.assertEqual(parts.processed_state, "unprocessed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, "elite_offenders")
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-08-07T22:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-08-07")
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
                "ingest_view_elite_offenders.csv"
            )
        )

        self.assertEqual(parts.processed_state, "processed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, "elite_offenders")
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
                "raw_elite_offenders_1split.csv"
            )
        )

        self.assertEqual(parts.processed_state, "processed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, "elite_offenders")
        self.assertEqual(parts.filename_suffix, "1split")
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        # Needs the actual file_split suffix to be a file split
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
                "ingest_view_elite_offenders_002_file_split.csv"
            )
        )

        self.assertEqual(parts.processed_state, "processed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, "elite_offenders")
        self.assertEqual(parts.filename_suffix, "002_file_split")
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
                "raw_elite_offenders_002_file_split_size300.csv"
            )
        )

        self.assertEqual(parts.processed_state, "processed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, "elite_offenders")
        self.assertEqual(parts.filename_suffix, "002_file_split_size300")
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
                "ingest_view_BrazosCounty_2019_09_25.csv"
            )
        )

        self.assertEqual(parts.processed_state, "processed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, "BrazosCounty")
        self.assertEqual(parts.filename_suffix, "2019_09_25")
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
                "raw_BrazosCounty_2019_09_25_002_file_split_size300.csv"
            )
        )

        self.assertEqual(parts.processed_state, "processed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, "BrazosCounty")
        self.assertEqual(parts.filename_suffix, "2019_09_25_002_file_split_size300")
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_"
                "ingest_view_tak001_offender_identification.csv"
            )
        )

        self.assertEqual(parts.processed_state, "unprocessed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, "tak001_offender_identification")
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_"
                "raw_tak001_offender_identification_002_file_split_size300.csv"
            )
        )

        self.assertEqual(parts.processed_state, "unprocessed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, "tak001_offender_identification")
        self.assertEqual(parts.filename_suffix, "002_file_split_size300")
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "storage_bucket/raw/2020/04/29/processed_2020-04-29T18:02:41:789323_raw_test_file-(1).csv"
            )
        )

        self.assertEqual(parts.processed_state, "processed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, "test_file")
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2020-04-29T18:02:41:789323"),
        )
        self.assertEqual(parts.date_str, "2020-04-29")

        self.assertEqual(parts.is_file_split, False)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                "bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_"
                "raw_tak001_offender_identification_002_file_split_size300-(5).csv"
            )
        )

        self.assertEqual(parts.processed_state, "unprocessed")
        self.assertEqual(parts.extension, "csv")
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, "tak001_offender_identification")
        self.assertEqual(parts.filename_suffix, "002_file_split_size300")
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
        )
        self.assertEqual(parts.date_str, "2019-09-07")

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)
Example 24
    def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.output_directory_path,
            f"temp_direct_ingest_{name}_{chunk_num}.csv")
Example 25
    def export(self, export_configs: Sequence[ExportBigQueryViewConfig]) -> List[GcsfsFilePath]:
        export_query_configs = [c.as_export_query_config(bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
                                for c in export_configs]
        self.bq_client.export_query_results_to_cloud_storage(export_query_configs)

        return [GcsfsFilePath.from_absolute_path(config.output_uri) for config in export_query_configs]
Example 26
 def post_process_downloads(self, downloaded_path: GcsfsFilePath,
                            _: GCSFileSystem) -> str:
     """The US_ID server doesn't require any post-processing."""
     return downloaded_path.abs_path()
Example 27
def _gcsfs_path_for_batch_metadata(
    batch_id: str, state_code: StateCode
) -> GcsfsFilePath:
    return GcsfsFilePath.from_absolute_path(
        f"gs://{get_email_content_bucket_name()}/{state_code.value}/{batch_id}/metadata.json"
    )
Example 28
 def delete(self, path: GcsfsFilePath) -> None:
     with self.mutex:
         self.files.pop(path.abs_path())
Example 29
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i,
                upload_path.abs_path(),
            )

            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    "Threw error while copying split files from temp bucket - attempting to clean up before rethrowing."
                    " [%s]",
                    e,
                )
                for p in upload_paths:
                    self.fs.delete(p)
                raise e

        # We wait to register files with metadata manager until all files have been successfully copied to avoid leaving
        # the metadata manager in an inconsistent state.
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    "Attempting to split a non-ingest view type file")

            logging.info(
                "Registering [%s] split files with the metadata manager.",
                len(upload_paths),
            )

            for upload_path in upload_paths:
                ingest_file_metadata = (
                    self.file_metadata_manager.register_ingest_file_split(
                        original_metadata, upload_path))
                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(),
            len(split_contents_paths),
        )

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
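
The try/except block above follows a general "clean up partial outputs, then re-raise" pattern so a failed batch never leaves stray split files behind. Below is a small hedged sketch of that pattern in isolation (the mv and delete callables are illustrative placeholders, not the real GCSFileSystem API):

import logging
from typing import Callable, List, Sequence, Tuple


def move_all_or_clean_up(
    moves: Sequence[Tuple[str, str]],
    mv: Callable[[str, str], None],
    delete: Callable[[str], None],
) -> List[str]:
    """Moves every (src, dst) pair; if any move fails, deletes the destinations
    already written and re-raises so callers never see a partial batch."""
    completed: List[str] = []
    try:
        for src, dst in moves:
            mv(src, dst)
            completed.append(dst)
    except Exception:
        logging.exception(
            "Move failed - cleaning up %s already-moved files before re-raising.",
            len(completed),
        )
        for dst in completed:
            delete(dst)
        raise
    return completed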
Example 30
def get_attachment_filepath(batch_id: str, email_address: str) -> GcsfsFilePath:
    bucket = get_email_content_bucket_name()
    folder = get_attachments_folder(batch_id)
    return GcsfsFilePath.from_absolute_path(
        f"gs://{bucket}/{folder}/{email_address}.txt"
    )