Example #1
    def test_get_metadata_for_raw_files_discovered_after_datetime(self):
        with freeze_time('2015-01-02T03:05:05'):
            raw_unprocessed_path_1 = self._make_unprocessed_path(
                'bucket/file_tag.csv',
                GcsfsDirectIngestFileType.RAW_DATA,
                dt=datetime.datetime.utcnow())
            self.metadata_manager.register_new_file(raw_unprocessed_path_1)
            self.metadata_manager_other_region.register_new_file(
                raw_unprocessed_path_1)

        with freeze_time('2015-01-02T03:06:06'):
            raw_unprocessed_path_2 = self._make_unprocessed_path(
                'bucket/other_tag.csv',
                GcsfsDirectIngestFileType.RAW_DATA,
                dt=datetime.datetime.utcnow())
            self.metadata_manager.register_new_file(raw_unprocessed_path_2)

        with freeze_time('2015-01-02T03:07:07'):
            raw_unprocessed_path_3 = self._make_unprocessed_path(
                'bucket/file_tag.csv',
                GcsfsDirectIngestFileType.RAW_DATA,
                dt=datetime.datetime.utcnow())
            self.metadata_manager.register_new_file(raw_unprocessed_path_3)

        expected_list = [
            DirectIngestRawFileMetadata.new_with_defaults(
                region_code=self.metadata_manager.region_code,
                file_tag='file_tag',
                discovery_time=datetime.datetime(2015, 1, 2, 3, 5, 5),
                normalized_file_name=
                'unprocessed_2015-01-02T03:05:05:000000_raw_file_tag.csv',
                datetimes_contained_upper_bound_inclusive=datetime.datetime(
                    2015, 1, 2, 3, 5, 5)),
            DirectIngestRawFileMetadata.new_with_defaults(
                region_code=self.metadata_manager.region_code,
                file_tag='file_tag',
                discovery_time=datetime.datetime(2015, 1, 2, 3, 7, 7),
                normalized_file_name=
                'unprocessed_2015-01-02T03:07:07:000000_raw_file_tag.csv',
                datetimes_contained_upper_bound_inclusive=datetime.datetime(
                    2015, 1, 2, 3, 7, 7))
        ]

        self.assertEqual(
            expected_list,
            self.metadata_manager.
            get_metadata_for_raw_files_discovered_after_datetime(
                'file_tag', discovery_time_lower_bound_exclusive=None))

        expected_list = expected_list[-1:]

        self.assertEqual(
            expected_list,
            self.metadata_manager.
            get_metadata_for_raw_files_discovered_after_datetime(
                'file_tag',
                discovery_time_lower_bound_exclusive=datetime.datetime(
                    2015, 1, 2, 3, 7, 0)))

    def test_get_raw_file_metadata_unique_to_state(self):
        # Arrange
        raw_unprocessed_path = self._make_unprocessed_path(
            'bucket/file_tag.csv', GcsfsDirectIngestFileType.RAW_DATA)

        self.metadata_manager_other_region.mark_file_as_discovered(
            raw_unprocessed_path)

        # Act
        self.metadata_manager.mark_file_as_discovered(raw_unprocessed_path)
        metadata = self.metadata_manager.get_file_metadata(
            raw_unprocessed_path)

        # Assert
        expected_metadata = DirectIngestRawFileMetadata.new_with_defaults(
            region_code=self.metadata_manager.region_code,
            file_tag='file_tag',
            discovery_time=datetime.datetime(2015, 1, 2, 3, 4, 6),
            normalized_file_name=
            'unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv',
            processed_time=None,
            datetimes_contained_upper_bound_inclusive=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))

        self.assertIsInstance(metadata, DirectIngestRawFileMetadata)
        self.assertIsNotNone(metadata.file_id)

        self.assertEqual(expected_metadata, metadata)

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestRawFileMetadata:
        parts = filename_parts_from_path(path)
        return DirectIngestRawFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=parts.file_tag,
            file_id=123,
            processed_time=None,
            normalized_file_name=path.file_name,
            discovery_time=datetime.datetime.now(),
            datetimes_contained_upper_bound_inclusive=parts.utc_upload_datetime,
        )
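
The filter semantics exercised above can be restated as a minimal in-memory sketch (a hypothetical _filter_discovered_after helper, not the manager's actual query): a None lower bound returns every row for the tag, while a concrete bound keeps only rows discovered strictly after it.

import datetime
from typing import List, Optional

def _filter_discovered_after(
        rows: List[DirectIngestRawFileMetadata],
        file_tag: str,
        discovery_time_lower_bound_exclusive: Optional[datetime.datetime],
) -> List[DirectIngestRawFileMetadata]:
    # None means "no lower bound": keep every row for the tag.
    return [
        row for row in rows
        if row.file_tag == file_tag
        and (discovery_time_lower_bound_exclusive is None
             or row.discovery_time > discovery_time_lower_bound_exclusive)
    ]
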
Example #4
    def test_getIngestViewExportTaskArgs_happy(self) -> None:
        # Arrange
        region = self.create_fake_region(ingest_view_exports_enabled=True)
        export_manager = self.create_export_manager(region)
        export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
            return_value=DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag="ingest_view",
                normalized_file_name="normalized_file_name",
                processed_time=_DATE_1,
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_1,
                export_time=_DATE_1,
                datetimes_contained_lower_bound_exclusive=_DATE_1,
                datetimes_contained_upper_bound_inclusive=_DATE_1,
                discovery_time=_DATE_1,
            ))
        export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
            return_value=[
                DirectIngestRawFileMetadata(
                    file_id=2,
                    region_code=region.region_code,
                    file_tag="ingest_view",
                    discovery_time=_DATE_2,
                    normalized_file_name=
                    "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                    processed_time=None,
                    datetimes_contained_upper_bound_inclusive=_DATE_2,
                )
            ])

        # Act
        args = export_manager.get_ingest_view_export_task_args()

        # Assert
        self.assertListEqual(
            args,
            [
                GcsfsIngestViewExportArgs(
                    ingest_view_name="ingest_view",
                    upper_bound_datetime_prev=_DATE_1,
                    upper_bound_datetime_to_export=_DATE_2,
                )
            ],
        )
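
The bound selection this happy path asserts can be sketched as follows (a hypothetical _export_args_for_next_job helper; an assumed shape, not the export manager's actual code):

def _export_args_for_next_job(
        last_job: DirectIngestIngestFileMetadata,
        newer_raw_file: DirectIngestRawFileMetadata,
) -> GcsfsIngestViewExportArgs:
    # Assumed derivation: the previous valid job supplies the lower bound of
    # the export window; the newly discovered raw file supplies the upper bound.
    return GcsfsIngestViewExportArgs(
        ingest_view_name=last_job.file_tag,
        upper_bound_datetime_prev=last_job.datetimes_contained_upper_bound_inclusive,
        upper_bound_datetime_to_export=newer_raw_file.datetimes_contained_upper_bound_inclusive,
    )
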
Example #5
    def test_getIngestViewExportTaskArgs_rawCodeTableOlderThanLastExport(
            self) -> None:
        # Arrange
        CODE_TABLE_TAG = "RECIDIVIZ_REFERENCE_ingest_view"
        region = self.create_fake_region(ingest_view_exports_enabled=True)
        export_manager = self.create_export_manager(
            region, controller_file_tags=[CODE_TABLE_TAG])
        export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
            return_value=DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag=CODE_TABLE_TAG,
                normalized_file_name="normalized_file_name",
                processed_time=_DATE_2,
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_2,
                export_time=_DATE_2,
                datetimes_contained_lower_bound_exclusive=_DATE_2 -
                datetime.timedelta(days=7),
                datetimes_contained_upper_bound_inclusive=_DATE_2,
                discovery_time=_DATE_2,
            ))
        export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
            return_value=[
                DirectIngestRawFileMetadata(
                    file_id=2,
                    region_code=region.region_code,
                    file_tag=CODE_TABLE_TAG,
                    discovery_time=_DATE_1,
                    normalized_file_name=
                    "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                    processed_time=None,
                    datetimes_contained_upper_bound_inclusive=_DATE_1,
                )
            ])

        # Act
        args = export_manager.get_ingest_view_export_task_args()

        # Assert
        # New code tables are backdated but don't need to be re-ingested, so ignore them.
        self.assertListEqual(args, [])
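
A hedged sketch of the skip rule this test relies on (a hypothetical _should_skip_code_table_file helper; the manager's real logic may differ):

import datetime
from typing import Set

def _should_skip_code_table_file(
        metadata: DirectIngestRawFileMetadata,
        last_export_upper_bound: datetime.datetime,
        code_table_tags: Set[str],
) -> bool:
    # Code tables are backdated on upload, so data bounded before the last
    # valid export does not need to be re-ingested.
    return (metadata.file_tag in code_table_tags
            and metadata.datetimes_contained_upper_bound_inclusive <
            last_export_upper_bound)
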
Example #6
    def test_getIngestViewExportTaskArgs_rawFileOlderThanLastExport(
            self) -> None:
        # Arrange
        region = self.create_fake_region(ingest_view_exports_enabled=True)
        export_manager = self.create_export_manager(region)
        export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
            return_value=DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag="ingest_view",
                normalized_file_name="normalized_file_name",
                processed_time=_DATE_2,
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_2,
                export_time=_DATE_2,
                datetimes_contained_lower_bound_exclusive=_DATE_2,
                datetimes_contained_upper_bound_inclusive=_DATE_2,
                discovery_time=_DATE_2,
            ))
        export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
            return_value=[
                DirectIngestRawFileMetadata(
                    file_id=2,
                    region_code=region.region_code,
                    file_tag="ingest_view",
                    discovery_time=_DATE_1,
                    normalized_file_name=
                    "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                    processed_time=None,
                    datetimes_contained_upper_bound_inclusive=_DATE_1,
                )
            ])

        # Act
        with pytest.raises(
                ValueError,
                match=r"upper bound date.*before the last valid export"):
            export_manager.get_ingest_view_export_task_args()
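
A hedged sketch of the check the expected error implies (names are hypothetical and the message only needs to match the regex above; this is not the manager's exact code):

import datetime

def _check_upper_bound_not_stale(
        upper_bound_datetime_to_export: datetime.datetime,
        last_valid_export_upper_bound: datetime.datetime,
) -> None:
    # Assumed validation: a non-code-table raw file whose upper bound predates
    # the last valid export signals out-of-order data and should fail loudly.
    if upper_bound_datetime_to_export < last_valid_export_upper_bound:
        raise ValueError(
            f"Found upper bound date [{upper_bound_datetime_to_export}] "
            f"before the last valid export [{last_valid_export_upper_bound}].")
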
Example #7
def do_upload(
    state_code: StateCode,
    sandbox_dataset_prefix: str,
    source_bucket: GcsfsBucketPath,
    file_tag_filter: Optional[str],
) -> None:
    """Imports a set of raw data files in the given source bucket into a sandbox
    dataset.
    """

    input_str = input(
        f"Have you already uploaded raw files to [{source_bucket.uri()}] using script "
        f"`recidiviz.tools.ingest.operations.upload_raw_state_files_to_ingest_bucket_with_date` "
        f"with arg `--destination-bucket {source_bucket.bucket_name}`?. [y/n] "
    )

    if input_str.upper() != "Y":
        return

    import_manager = SandboxDirectIngestRawFileImportManager(
        state_code=state_code,
        sandbox_dataset_prefix=sandbox_dataset_prefix,
        test_ingest_bucket=source_bucket,
    )

    bq_client = BigQueryClientImpl()

    # Create the dataset up front with table expiration
    bq_client.create_dataset_if_necessary(
        bq_client.dataset_ref_for_id(dataset_id=import_manager.sandbox_dataset),
        default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS,
    )

    raw_files_to_import = import_manager.get_unprocessed_raw_files_to_import()

    failures_by_exception = defaultdict(list)

    for i, file_path in enumerate(raw_files_to_import):
        parts = filename_parts_from_path(file_path)
        if file_tag_filter and not re.search(file_tag_filter, parts.file_tag):
            logging.info("** Skipping file with tag [%s] **", parts.file_tag)
            continue

        logging.info("Running file with tag [%s]", parts.file_tag)

        try:
            import_manager.import_raw_file_to_big_query(
                file_path,
                DirectIngestRawFileMetadata(
                    file_id=i,
                    region_code=state_code.value,
                    file_tag=parts.file_tag,
                    processed_time=None,
                    discovery_time=datetime.datetime.now(),
                    normalized_file_name=file_path.file_name,
                    datetimes_contained_upper_bound_inclusive=parts.utc_upload_datetime,
                ),
            )
        except Exception as e:
            logging.exception(e)
            failures_by_exception[str(e)].append(file_path.abs_path())

    if failures_by_exception:
        logging.error("************************* FAILURES ************************")
        total_files = 0
        all_failed_paths = []
        for error, file_list in failures_by_exception.items():
            total_files += len(file_list)
            all_failed_paths += file_list

            logging.error(
                "Failed [%s] files with error [%s]: %s",
                len(file_list),
                error,
                file_list,
            )
            logging.error("***********************************************************")
        raise ValueError(f"Failed to import [{total_files}] files: {all_failed_paths}")