def register_ingest_file_export_job(
        self, ingest_view_job_args: GcsfsIngestViewExportArgs
    ) -> DirectIngestIngestFileMetadata:
        """Record a new ingest view export job in the operations database.

        Adds a non-invalidated, non-split DirectIngestIngestFileMetadata row
        for this region, using the ingest view name as the file tag and the
        datetime bounds carried by the export args, then commits it.

        Args:
            ingest_view_job_args: Export args providing the ingest view name
                and the previous / to-export upper bound datetimes.

        Returns:
            The committed row converted by
            _ingest_file_schema_metadata_as_entity.

        Raises:
            Exception: Whatever was raised while building, committing or
                converting the row; the session is rolled back first.
        """
        session = SessionFactory.for_schema_base(OperationsBase)

        try:
            export_job_row = schema.DirectIngestIngestFileMetadata(
                region_code=self.region_code,
                file_tag=ingest_view_job_args.ingest_view_name,
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=datetime.datetime.utcnow(),
                datetimes_contained_lower_bound_exclusive=(
                    ingest_view_job_args.upper_bound_datetime_prev
                ),
                datetimes_contained_upper_bound_inclusive=(
                    ingest_view_job_args.upper_bound_datetime_to_export
                ),
            )
            session.add(export_job_row)
            session.commit()
            # The conversion happens before the `finally` clause closes the
            # session, so the row is still attached while it is read.
            return self._ingest_file_schema_metadata_as_entity(export_job_row)
        except Exception as err:
            session.rollback()
            raise err
        finally:
            session.close()
Esempio n. 2
0
    def register_ingest_file_split(
        self,
        original_file_metadata: DirectIngestIngestFileMetadata,
        path: GcsfsFilePath,
    ) -> DirectIngestIngestFileMetadata:
        """Record metadata for a file split off from an existing ingest file.

        The new row copies the file tag, datetime bounds and ingest database
        name from the original file's metadata, is flagged as a file split,
        and takes its normalized file name from `path`.

        Args:
            original_file_metadata: Metadata entity of the file that was
                split.
            path: Path of the split file; it is first checked with
                _check_is_ingest_view_file_path.

        Returns:
            The committed row converted by
            _ingest_file_schema_metadata_as_entity.
        """
        self._check_is_ingest_view_file_path(path)

        with SessionFactory.using_database(self.database_key) as session:
            split_row = schema.DirectIngestIngestFileMetadata(
                region_code=self.region_code,
                file_tag=original_file_metadata.file_tag,
                is_invalidated=False,
                is_file_split=True,
                job_creation_time=datetime.datetime.now(tz=pytz.UTC),
                normalized_file_name=path.file_name,
                datetimes_contained_lower_bound_exclusive=(
                    original_file_metadata.datetimes_contained_lower_bound_exclusive
                ),
                datetimes_contained_upper_bound_inclusive=(
                    original_file_metadata.datetimes_contained_upper_bound_inclusive
                ),
                ingest_database_name=original_file_metadata.ingest_database_name,
            )
            session.add(split_row)
            session.commit()
            # Convert while the session is still open.
            return self._ingest_file_schema_metadata_as_entity(split_row)
    def register_ingest_file_split(
        self,
        original_file_metadata: DirectIngestIngestFileMetadata,
        path: GcsfsFilePath,
    ) -> DirectIngestIngestFileMetadata:
        """Record metadata for a file split off from an existing ingest file.

        The new row copies the file tag and datetime bounds from the original
        file's metadata, is flagged as a file split, and takes its normalized
        file name from `path`.

        Args:
            original_file_metadata: Metadata entity of the file that was
                split.
            path: Path of the split file; only its file_name is used.

        Returns:
            The committed row converted by
            _ingest_file_schema_metadata_as_entity.

        Raises:
            Exception: Whatever was raised while building, committing or
                converting the row; the session is rolled back first.
        """
        session = SessionFactory.for_schema_base(OperationsBase)

        try:
            split_row = schema.DirectIngestIngestFileMetadata(
                region_code=self.region_code,
                file_tag=original_file_metadata.file_tag,
                is_invalidated=False,
                is_file_split=True,
                job_creation_time=datetime.datetime.utcnow(),
                normalized_file_name=path.file_name,
                datetimes_contained_lower_bound_exclusive=(
                    original_file_metadata.datetimes_contained_lower_bound_exclusive
                ),
                datetimes_contained_upper_bound_inclusive=(
                    original_file_metadata.datetimes_contained_upper_bound_inclusive
                ),
            )
            session.add(split_row)
            session.commit()
            # The conversion happens before the `finally` clause closes the
            # session, so the row is still attached while it is read.
            return self._ingest_file_schema_metadata_as_entity(split_row)
        except Exception as err:
            session.rollback()
            raise err
        finally:
            session.close()
Esempio n. 4
0
    def test_exportViewForArgs_noLowerBound(self):
        """Exports an ingest view whose args carry no lower bound datetime.

        Seeds a not-yet-exported metadata row, runs the export with time
        frozen at _DATE_4, then checks the upper-bound table is created,
        exported and deleted, and that the stored row's export_time is now
        _DATE_4.
        """
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(region)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='ingest_view',
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=_DATE_2)

        session = SessionFactory.for_schema_base(OperationsBase)
        try:
            metadata = schema.DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag=export_args.ingest_view_name,
                normalized_file_name='normalized_file_name',
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_1,
                export_time=None,
                datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
                datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
            )
            # The expectation is built before the row is added to the session.
            expected_metadata = attr.evolve(self.to_entity(metadata), export_time=_DATE_4)

            session.add(metadata)
            session.commit()
        finally:
            # Release the session even if seeding fails, so a failure here
            # does not leave an open session behind for later tests.
            session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.create_table_from_query_async.assert_has_calls([
            mock.call(
                dataset_id='us_xx_ingest_views',
                overwrite=True,
                query=mock.ANY,
                query_parameters=[self.generate_query_params_for_date(export_args.upper_bound_datetime_to_export)],
                table_id='ingest_view_2020_07_20_00_00_00_upper_bound'),
        ])
        expected_query = \
            'SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound` ' \
            'ORDER BY colA, colC;'
        self.assert_exported_to_gcs_with_query(expected_query)
        self.mock_client.delete_table.assert_has_calls([
            mock.call(dataset_id='us_xx_ingest_views', table_id='ingest_view_2020_07_20_00_00_00_upper_bound')])
        assert_session = SessionFactory.for_schema_base(OperationsBase)
        try:
            found_metadata = self.to_entity(one(assert_session.query(schema.DirectIngestIngestFileMetadata).all()))
            self.assertEqual(expected_metadata, found_metadata)
        finally:
            # Close even when the lookup or the assertion fails.
            assert_session.close()
Esempio n. 5
0
    def test_exportViewForArgs_detectRowDeletionView_noLowerBound(
            self) -> None:
        """Row-deletion view export with no lower bound issues no queries.

        Seeds a not-yet-exported metadata row, runs the export with time
        frozen at _DATE_4, then checks that no query, GCS export or table
        deletion was requested while the stored row's export_time is still
        set to _DATE_4.
        """
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(
            region, is_detect_row_deletion_view=True)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=_DATE_2,
        )

        session = SessionFactory.for_schema_base(OperationsBase)
        try:
            metadata = schema.DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag=export_args.ingest_view_name,
                normalized_file_name="normalized_file_name",
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_1,
                export_time=None,
                datetimes_contained_lower_bound_exclusive=export_args.
                upper_bound_datetime_prev,
                datetimes_contained_upper_bound_inclusive=export_args.
                upper_bound_datetime_to_export,
            )
            # The expectation is built before the row is added to the session.
            expected_metadata = attr.evolve(self.to_entity(metadata),
                                            export_time=_DATE_4)

            session.add(metadata)
            session.commit()
        finally:
            # Release the session even if seeding fails, so a failure here
            # does not leave an open session behind for later tests.
            session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.run_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called(
        )
        self.mock_client.delete_table.assert_not_called()

        assert_session = SessionFactory.for_schema_base(OperationsBase)
        try:
            found_metadata = self.to_entity(
                one(
                    assert_session.query(
                        schema.DirectIngestIngestFileMetadata).all()))
            self.assertEqual(expected_metadata, found_metadata)
        finally:
            # Close even when the lookup or the assertion fails.
            assert_session.close()
Esempio n. 6
0
 def test_ingest_file_metadata_split_file_no_file_name_raises(self):
     """A split-file row without a normalized_file_name fails on commit.

     The row sets is_file_split=True but no normalized_file_name, and the
     commit is expected to raise IntegrityError.
     """
     session = SessionFactory.for_schema_base(OperationsBase)
     try:
         ingest_file_metadata = schema.DirectIngestIngestFileMetadata(
             region_code='us_xx_yyyy',
             file_tag='file_tag',
             is_invalidated=False,
             is_file_split=True,
             job_creation_time=datetime.datetime.now(),
             datetimes_contained_lower_bound_exclusive=None,
             datetimes_contained_upper_bound_inclusive=datetime.datetime(
                 2020, 5, 11),
         )
         session.add(ingest_file_metadata)
         with self.assertRaises(IntegrityError):
             session.commit()
     finally:
         # The session was previously never closed. Closing it releases
         # its connection and discards the transaction left over from the
         # failed commit.
         session.close()
Esempio n. 7
0
 def test_ingest_file_metadata(self):
     """A minimal, non-split ingest file metadata row can be stored.

     Commits the row, reads back the single stored row, and checks that it
     equals the one added and that its file_id has been populated.
     """
     session = SessionFactory.for_schema_base(OperationsBase)
     try:
         ingest_file_metadata = schema.DirectIngestIngestFileMetadata(
             region_code='us_xx_yyyy',
             file_tag='file_tag',
             is_invalidated=False,
             is_file_split=False,
             job_creation_time=datetime.datetime.now(),
             datetimes_contained_lower_bound_exclusive=None,
             datetimes_contained_upper_bound_inclusive=datetime.datetime(
                 2020, 5, 11))
         session.add(ingest_file_metadata)
         session.commit()
         result_metadata = one(
             session.query(schema.DirectIngestIngestFileMetadata).all())
         self.assertEqual(result_metadata, ingest_file_metadata)
         self.assertIsNotNone(result_metadata.file_id)
     finally:
         # The session was previously never closed; release it whether or
         # not the assertions pass.
         session.close()
Esempio n. 8
0
 def test_ingest_file_metadata_split_file_no_file_name_raises(self) -> None:
     """A split-file row without a normalized_file_name fails on commit.

     The row sets is_file_split=True but no normalized_file_name, and the
     commit is expected to raise IntegrityError.
     """
     upper_bound = datetime.datetime(2020, 5, 11)
     with SessionFactory.using_database(
         self.database_key, autocommit=False
     ) as session:
         nameless_split_row = schema.DirectIngestIngestFileMetadata(
             region_code="us_xx_yyyy",
             file_tag="file_tag",
             is_invalidated=False,
             is_file_split=True,
             job_creation_time=datetime.datetime.now(),
             datetimes_contained_lower_bound_exclusive=None,
             datetimes_contained_upper_bound_inclusive=upper_bound,
             ingest_database_name=DEFAULT_DB_NAME,
         )
         session.add(nameless_split_row)

         # The failure is expected at commit time, not when the row is added.
         with self.assertRaises(IntegrityError):
             session.commit()
Esempio n. 9
0
    def test_ingest_file_processed_time_no_discovery_time_raises(self):
        """A row with processed_time but no discovery_time fails on commit.

        The row sets export_time, normalized_file_name and processed_time
        but leaves discovery_time unset, and the commit is expected to raise
        IntegrityError.
        """
        session = SessionFactory.for_schema_base(OperationsBase)
        try:
            ingest_file_metadata = schema.DirectIngestIngestFileMetadata(
                region_code="us_xx_yyyy",
                file_tag="file_tag",
                is_invalidated=False,
                job_creation_time=datetime.datetime.now(),
                datetimes_contained_lower_bound_exclusive=None,
                datetimes_contained_upper_bound_inclusive=datetime.datetime(
                    2020, 5, 11),
                export_time=datetime.datetime(2020, 5, 12),
                normalized_file_name="foo.txt",
                processed_time=datetime.datetime(2020, 5, 13),
            )
            session.add(ingest_file_metadata)

            with self.assertRaises(IntegrityError):
                session.commit()
        finally:
            # The session was previously never closed. Closing it releases
            # its connection and discards the transaction left over from the
            # failed commit.
            session.close()
Esempio n. 10
0
 def register_ingest_file_export_job(
     self, ingest_view_job_args: GcsfsIngestViewExportArgs
 ) -> DirectIngestIngestFileMetadata:
     """Record a new ingest view export job in the operations database.

     Adds a non-invalidated, non-split DirectIngestIngestFileMetadata row
     for this region and ingest database, using the ingest view name as the
     file tag and the datetime bounds carried by the export args.

     Args:
         ingest_view_job_args: Export args providing the ingest view name
             and the previous / to-export upper bound datetimes.

     Returns:
         The committed row converted by
         _ingest_file_schema_metadata_as_entity.
     """
     with SessionFactory.using_database(self.database_key) as session:
         export_job_row = schema.DirectIngestIngestFileMetadata(
             region_code=self.region_code,
             file_tag=ingest_view_job_args.ingest_view_name,
             is_invalidated=False,
             is_file_split=False,
             job_creation_time=datetime.datetime.now(tz=pytz.UTC),
             datetimes_contained_lower_bound_exclusive=(
                 ingest_view_job_args.upper_bound_datetime_prev
             ),
             datetimes_contained_upper_bound_inclusive=(
                 ingest_view_job_args.upper_bound_datetime_to_export
             ),
             ingest_database_name=self.ingest_database_name,
         )
         session.add(export_job_row)
         session.commit()
         # Convert while the session is still open.
         return self._ingest_file_schema_metadata_as_entity(export_job_row)
Esempio n. 11
0
 def test_ingest_file_metadata(self) -> None:
     """A minimal, non-split ingest file metadata row can be stored.

     Commits the row, reads back the single stored row, and checks that it
     equals the one added and that its file_id has been populated.
     """
     upper_bound = datetime.datetime(2020, 5, 11)
     with SessionFactory.using_database(
         self.database_key, autocommit=False
     ) as session:
         new_row = schema.DirectIngestIngestFileMetadata(
             region_code="us_xx_yyyy",
             file_tag="file_tag",
             is_invalidated=False,
             is_file_split=False,
             job_creation_time=datetime.datetime.now(),
             datetimes_contained_lower_bound_exclusive=None,
             datetimes_contained_upper_bound_inclusive=upper_bound,
             ingest_database_name=DEFAULT_DB_NAME,
         )
         session.add(new_row)
         session.commit()

         # `one` requires exactly one row to be present in the table.
         stored_rows = session.query(
             schema.DirectIngestIngestFileMetadata).all()
         stored_row = one(stored_rows)
         self.assertEqual(stored_row, new_row)
         self.assertIsNotNone(stored_row.file_id)
Esempio n. 12
0
    def test_ingest_file_metadata_file_name_without_export_time_does_not_raise(
            self):
        """A row with a normalized_file_name but no export_time is accepted.

        Commits the row, reads back the single stored row, and checks that
        it equals the one added and that its file_id has been populated.
        """
        session = SessionFactory.for_schema_base(OperationsBase)
        try:
            ingest_file_metadata = schema.DirectIngestIngestFileMetadata(
                region_code="us_xx_yyyy",
                file_tag="file_tag",
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=datetime.datetime.now(),
                datetimes_contained_lower_bound_exclusive=None,
                datetimes_contained_upper_bound_inclusive=datetime.datetime(
                    2020, 5, 11),
                normalized_file_name="foo.txt",
            )

            session.add(ingest_file_metadata)
            session.commit()
            result_metadata = one(
                session.query(schema.DirectIngestIngestFileMetadata).all())
            self.assertEqual(result_metadata, ingest_file_metadata)
            self.assertIsNotNone(result_metadata.file_id)
        finally:
            # The session was previously never closed; release it whether or
            # not the assertions pass.
            session.close()
Esempio n. 13
0
    def test_ingest_file_datetimes_contained_constraint(self) -> None:
        """A lower bound later than the upper bound fails on commit.

        The row's exclusive lower bound (2020-06-11) is after its inclusive
        upper bound (2020-05-11), and the commit is expected to raise
        IntegrityError.
        """
        lower_bound = datetime.datetime(2020, 6, 11)
        upper_bound = datetime.datetime(2020, 5, 11)
        export_and_discovery_time = datetime.datetime(2020, 5, 12)

        with SessionFactory.using_database(
            self.database_key, autocommit=False
        ) as session:
            inverted_bounds_row = schema.DirectIngestIngestFileMetadata(
                region_code="us_xx_yyyy",
                file_tag="file_tag",
                is_invalidated=False,
                job_creation_time=datetime.datetime.now(),
                datetimes_contained_lower_bound_exclusive=lower_bound,
                datetimes_contained_upper_bound_inclusive=upper_bound,
                export_time=export_and_discovery_time,
                discovery_time=export_and_discovery_time,
                normalized_file_name="foo.txt",
                processed_time=datetime.datetime(2020, 5, 13),
                ingest_database_name=DEFAULT_DB_NAME,
            )
            session.add(inverted_bounds_row)

            # The failure is expected at commit time, not when the row is
            # added.
            with self.assertRaises(IntegrityError):
                session.commit()
Esempio n. 14
0
    def test_exportViewForArgs_alreadyExported(self):
        """Exporting args whose metadata already has an export_time is a no-op.

        Seeds a metadata row with export_time already set, runs the export,
        then checks that no table creation, GCS export or table deletion was
        requested and that the stored row is unchanged.
        """
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(region)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='ingest_view',
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2)

        session = SessionFactory.for_schema_base(OperationsBase)
        try:
            metadata = schema.DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag=export_args.ingest_view_name,
                normalized_file_name='normalized_file_name',
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_1,
                export_time=_DATE_2,
                datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
                datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
            )
            # The expectation is built before the row is added to the session.
            expected_metadata = self.to_entity(metadata)
            session.add(metadata)
            session.commit()
        finally:
            # Release the session even if seeding fails, so a failure here
            # does not leave an open session behind for later tests.
            session.close()

        # Act
        export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.create_table_from_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
        self.mock_client.delete_table.assert_not_called()
        assert_session = SessionFactory.for_schema_base(OperationsBase)
        try:
            found_metadata = self.to_entity(one(assert_session.query(schema.DirectIngestIngestFileMetadata).all()))
            self.assertEqual(expected_metadata, found_metadata)
        finally:
            # Close even when the lookup or the assertion fails.
            assert_session.close()
Esempio n. 15
0
    def test_exportViewForArgs_detectRowDeletionView(self) -> None:
        """Row-deletion view export with both a lower and an upper bound.

        Seeds a not-yet-exported metadata row, runs the export with time
        frozen at _DATE_4, then checks that:
          * queries are run for the upper bound (_DATE_2) and the lower
            bound (_DATE_1), in that order,
          * the exported query is lower-bound EXCEPT DISTINCT upper-bound,
          * both tables are deleted, and
          * the stored row's export_time is now _DATE_4.
        """
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(
            region, is_detect_row_deletion_view=True)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2,
        )

        session = SessionFactory.for_schema_base(OperationsBase)
        try:
            metadata = schema.DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag=export_args.ingest_view_name,
                normalized_file_name="normalized_file_name",
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_1,
                export_time=None,
                datetimes_contained_lower_bound_exclusive=export_args.
                upper_bound_datetime_prev,
                datetimes_contained_upper_bound_inclusive=export_args.
                upper_bound_datetime_to_export,
            )
            # The expectation is built before the row is added to the session.
            expected_metadata = attr.evolve(self.to_entity(metadata),
                                            export_time=_DATE_4)
            session.add(metadata)
            session.commit()
        finally:
            # Release the session even if seeding fails, so a failure here
            # does not leave an open session behind for later tests.
            session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        # The lower-bound script is the upper-bound script with only the
        # table name suffix swapped.
        expected_upper_bound_query = _DATE_2_UPPER_BOUND_CREATE_TABLE_SCRIPT
        expected_lower_bound_query = expected_upper_bound_query.replace(
            "2020_07_20_00_00_00_upper_bound",
            "2019_07_20_00_00_00_lower_bound")

        # Assert
        self.mock_client.run_query_async.assert_has_calls([
            mock.call(
                query_str=expected_upper_bound_query,
                query_parameters=[
                    self.generate_query_params_for_date(_DATE_2)
                ],
            ),
            mock.call(
                query_str=expected_lower_bound_query,
                query_parameters=[
                    self.generate_query_params_for_date(_DATE_1)
                ],
            ),
        ])
        # Lower bound is the first part of the subquery, not upper bound.
        # NOTE(review): the concatenated literal has no space between the
        # closing ")" and "ORDER BY"; presumably this mirrors the query the
        # export manager builds - confirm before "fixing" it.
        expected_query = (
            "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2019_07_20_00_00_00_lower_bound`) "
            "EXCEPT DISTINCT "
            "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound`)"
            "ORDER BY colA, colC;")
        self.assert_exported_to_gcs_with_query(expected_query)
        self.mock_client.delete_table.assert_has_calls([
            mock.call(
                dataset_id="us_xx_ingest_views",
                table_id="ingest_view_2020_07_20_00_00_00_upper_bound",
            ),
            mock.call(
                dataset_id="us_xx_ingest_views",
                table_id="ingest_view_2019_07_20_00_00_00_lower_bound",
            ),
        ])

        assert_session = SessionFactory.for_schema_base(OperationsBase)
        try:
            found_metadata = self.to_entity(
                one(
                    assert_session.query(
                        schema.DirectIngestIngestFileMetadata).all()))
            self.assertEqual(expected_metadata, found_metadata)
        finally:
            # Close even when the lookup or the assertion fails.
            assert_session.close()