def register_ingest_file_export_job(
    self, ingest_view_job_args: GcsfsIngestViewExportArgs
) -> DirectIngestIngestFileMetadata:
    """Creates and persists a metadata row recording a new ingest view export job.

    Args:
        ingest_view_job_args: Export args whose view name and datetime bounds
            are copied onto the new metadata row.

    Returns:
        The persisted row converted to its entity form.

    Raises:
        Any exception raised during the DB write, after rolling back the session.
    """
    session = SessionFactory.for_schema_base(OperationsBase)
    try:
        metadata = schema.DirectIngestIngestFileMetadata(
            region_code=self.region_code,
            file_tag=ingest_view_job_args.ingest_view_name,
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=datetime.datetime.utcnow(),
            datetimes_contained_lower_bound_exclusive=ingest_view_job_args.upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=ingest_view_job_args.upper_bound_datetime_to_export,
        )
        session.add(metadata)
        session.commit()
        # Convert while the session is still open so attributes are loaded.
        metadata_entity = self._ingest_file_schema_metadata_as_entity(metadata)
    except Exception:
        session.rollback()
        # Bare `raise` re-raises the active exception in place — preferred
        # over `raise e`, which needlessly rebinds the exception object.
        raise
    finally:
        session.close()
    return metadata_entity
def register_ingest_file_split(
    self,
    original_file_metadata: DirectIngestIngestFileMetadata,
    path: GcsfsFilePath,
) -> DirectIngestIngestFileMetadata:
    """Persists a metadata row for a split file derived from
    |original_file_metadata|, copying its tag, bounds, and database name.

    Returns the persisted row converted to its entity form.
    """
    self._check_is_ingest_view_file_path(path)
    # Pull the copied-over values out up front for readability.
    lower_bound = original_file_metadata.datetimes_contained_lower_bound_exclusive
    upper_bound = original_file_metadata.datetimes_contained_upper_bound_inclusive
    with SessionFactory.using_database(self.database_key) as session:
        split_metadata = schema.DirectIngestIngestFileMetadata(
            region_code=self.region_code,
            file_tag=original_file_metadata.file_tag,
            is_invalidated=False,
            is_file_split=True,
            job_creation_time=datetime.datetime.now(tz=pytz.UTC),
            normalized_file_name=path.file_name,
            datetimes_contained_lower_bound_exclusive=lower_bound,
            datetimes_contained_upper_bound_inclusive=upper_bound,
            ingest_database_name=original_file_metadata.ingest_database_name,
        )
        session.add(split_metadata)
        session.commit()
        return self._ingest_file_schema_metadata_as_entity(split_metadata)
def register_ingest_file_split(
    self,
    original_file_metadata: DirectIngestIngestFileMetadata,
    path: GcsfsFilePath,
) -> DirectIngestIngestFileMetadata:
    """Persists a metadata row for a split file derived from
    |original_file_metadata|, copying its file tag and datetime bounds.

    Returns:
        The persisted row converted to its entity form.

    Raises:
        Any exception raised during the DB write, after rolling back the session.
    """
    session = SessionFactory.for_schema_base(OperationsBase)
    try:
        metadata = schema.DirectIngestIngestFileMetadata(
            region_code=self.region_code,
            file_tag=original_file_metadata.file_tag,
            is_invalidated=False,
            is_file_split=True,
            job_creation_time=datetime.datetime.utcnow(),
            normalized_file_name=path.file_name,
            datetimes_contained_lower_bound_exclusive=original_file_metadata.datetimes_contained_lower_bound_exclusive,
            datetimes_contained_upper_bound_inclusive=original_file_metadata.datetimes_contained_upper_bound_inclusive,
        )
        session.add(metadata)
        session.commit()
        # Convert while the session is still open so attributes are loaded.
        metadata_entity = self._ingest_file_schema_metadata_as_entity(metadata)
    except Exception:
        session.rollback()
        # Bare `raise` re-raises the active exception in place — preferred
        # over `raise e`, which needlessly rebinds the exception object.
        raise
    finally:
        session.close()
    return metadata_entity
def test_exportViewForArgs_noLowerBound(self):
    """With no lower bound, export should build only the upper-bound table,
    export a plain SELECT (no diff), and stamp export_time on the row."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(region)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='ingest_view',
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=_DATE_2)
    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name='normalized_file_name',
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,
        datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
    )
    # Export is expected to set export_time to the frozen "now" (_DATE_4).
    expected_metadata = attr.evolve(self.to_entity(metadata), export_time=_DATE_4)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    # Assert
    self.mock_client.create_table_from_query_async.assert_has_calls([
        mock.call(
            dataset_id='us_xx_ingest_views',
            overwrite=True,
            query=mock.ANY,
            query_parameters=[self.generate_query_params_for_date(export_args.upper_bound_datetime_to_export)],
            table_id='ingest_view_2020_07_20_00_00_00_upper_bound'),
    ])
    expected_query = \
        'SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound` ' \
        'ORDER BY colA, colC;'
    self.assert_exported_to_gcs_with_query(expected_query)
    # The intermediate upper-bound table should be cleaned up afterwards.
    self.mock_client.delete_table.assert_has_calls([
        mock.call(dataset_id='us_xx_ingest_views', table_id='ingest_view_2020_07_20_00_00_00_upper_bound')])
    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(one(assert_session.query(schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def test_exportViewForArgs_detectRowDeletionView_noLowerBound(
        self) -> None:
    """For a row-deletion detection view with no lower bound there is
    nothing to diff against, so no BQ queries run, but export_time is
    still recorded on the metadata row."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(
        region, is_detect_row_deletion_view=True)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=_DATE_2,
    )
    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name="normalized_file_name",
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,
        datetimes_contained_lower_bound_exclusive=export_args.
        upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.
        upper_bound_datetime_to_export,
    )
    # Export is expected to set export_time to the frozen "now" (_DATE_4).
    expected_metadata = attr.evolve(self.to_entity(metadata),
                                    export_time=_DATE_4)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    # Assert: no BQ work at all for this case.
    self.mock_client.run_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called(
    )
    self.mock_client.delete_table.assert_not_called()
    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(
        one(
            assert_session.query(
                schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def test_ingest_file_metadata_split_file_no_file_name_raises(self):
    """A split-file row with no normalized_file_name must violate the schema
    constraint and fail at commit time."""
    session = SessionFactory.for_schema_base(OperationsBase)
    split_row = schema.DirectIngestIngestFileMetadata(
        region_code='us_xx_yyyy',
        file_tag='file_tag',
        is_invalidated=False,
        is_file_split=True,
        job_creation_time=datetime.datetime.now(),
        datetimes_contained_lower_bound_exclusive=None,
        datetimes_contained_upper_bound_inclusive=datetime.datetime(2020, 5, 11),
    )
    session.add(split_row)
    with self.assertRaises(IntegrityError):
        session.commit()
def test_ingest_file_metadata(self):
    """Round-trips a valid metadata row and checks the autogenerated
    primary key is populated after commit."""
    session = SessionFactory.for_schema_base(OperationsBase)
    row = schema.DirectIngestIngestFileMetadata(
        region_code='us_xx_yyyy',
        file_tag='file_tag',
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=datetime.datetime.now(),
        datetimes_contained_lower_bound_exclusive=None,
        datetimes_contained_upper_bound_inclusive=datetime.datetime(2020, 5, 11),
    )
    session.add(row)
    session.commit()
    persisted = one(session.query(schema.DirectIngestIngestFileMetadata).all())
    self.assertEqual(persisted, row)
    self.assertIsNotNone(persisted.file_id)
def test_ingest_file_metadata_split_file_no_file_name_raises(self) -> None:
    """A split-file row with no normalized_file_name must violate the schema
    constraint and fail at commit time."""
    # autocommit=False so we control exactly when the constraint fires.
    with SessionFactory.using_database(self.database_key, autocommit=False) as session:
        split_row = schema.DirectIngestIngestFileMetadata(
            region_code="us_xx_yyyy",
            file_tag="file_tag",
            is_invalidated=False,
            is_file_split=True,
            job_creation_time=datetime.datetime.now(),
            datetimes_contained_lower_bound_exclusive=None,
            datetimes_contained_upper_bound_inclusive=datetime.datetime(2020, 5, 11),
            ingest_database_name=DEFAULT_DB_NAME,
        )
        session.add(split_row)
        with self.assertRaises(IntegrityError):
            session.commit()
def test_ingest_file_processed_time_no_discovery_time_raises(self):
    """A row with processed_time set but no discovery_time must violate the
    schema constraint and fail at commit time."""
    session = SessionFactory.for_schema_base(OperationsBase)
    bad_row = schema.DirectIngestIngestFileMetadata(
        region_code="us_xx_yyyy",
        file_tag="file_tag",
        is_invalidated=False,
        job_creation_time=datetime.datetime.now(),
        datetimes_contained_lower_bound_exclusive=None,
        datetimes_contained_upper_bound_inclusive=datetime.datetime(2020, 5, 11),
        export_time=datetime.datetime(2020, 5, 12),
        normalized_file_name="foo.txt",
        # processed_time without a discovery_time is the invalid combination.
        processed_time=datetime.datetime(2020, 5, 13),
    )
    session.add(bad_row)
    with self.assertRaises(IntegrityError):
        session.commit()
def register_ingest_file_export_job(
    self, ingest_view_job_args: GcsfsIngestViewExportArgs
) -> DirectIngestIngestFileMetadata:
    """Persists a metadata row marking the start of an ingest view export job.

    Returns the persisted row converted to its entity form.
    """
    args = ingest_view_job_args
    with SessionFactory.using_database(self.database_key) as session:
        job_row = schema.DirectIngestIngestFileMetadata(
            region_code=self.region_code,
            file_tag=args.ingest_view_name,
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=datetime.datetime.now(tz=pytz.UTC),
            datetimes_contained_lower_bound_exclusive=args.upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=args.upper_bound_datetime_to_export,
            ingest_database_name=self.ingest_database_name,
        )
        session.add(job_row)
        session.commit()
        return self._ingest_file_schema_metadata_as_entity(job_row)
def test_ingest_file_metadata(self) -> None:
    """Round-trips a valid metadata row and checks the autogenerated
    primary key is populated after commit."""
    # autocommit=False so the test drives commit explicitly.
    with SessionFactory.using_database(self.database_key, autocommit=False) as session:
        row = schema.DirectIngestIngestFileMetadata(
            region_code="us_xx_yyyy",
            file_tag="file_tag",
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=datetime.datetime.now(),
            datetimes_contained_lower_bound_exclusive=None,
            datetimes_contained_upper_bound_inclusive=datetime.datetime(2020, 5, 11),
            ingest_database_name=DEFAULT_DB_NAME,
        )
        session.add(row)
        session.commit()
        persisted = one(session.query(schema.DirectIngestIngestFileMetadata).all())
        self.assertEqual(persisted, row)
        self.assertIsNotNone(persisted.file_id)
def test_ingest_file_metadata_file_name_without_export_time_does_not_raise(
        self):
    """A normalized_file_name with no export_time is a valid combination
    and must commit cleanly."""
    session = SessionFactory.for_schema_base(OperationsBase)
    row = schema.DirectIngestIngestFileMetadata(
        region_code="us_xx_yyyy",
        file_tag="file_tag",
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=datetime.datetime.now(),
        datetimes_contained_lower_bound_exclusive=None,
        datetimes_contained_upper_bound_inclusive=datetime.datetime(2020, 5, 11),
        normalized_file_name="foo.txt",
    )
    session.add(row)
    session.commit()
    persisted = one(session.query(schema.DirectIngestIngestFileMetadata).all())
    self.assertEqual(persisted, row)
    self.assertIsNotNone(persisted.file_id)
def test_ingest_file_datetimes_contained_constraint(self) -> None:
    """A lower bound later than the upper bound must violate the datetimes
    ordering constraint and fail at commit time."""
    # autocommit=False so we control exactly when the constraint fires.
    with SessionFactory.using_database(self.database_key, autocommit=False) as session:
        bad_row = schema.DirectIngestIngestFileMetadata(
            region_code="us_xx_yyyy",
            file_tag="file_tag",
            is_invalidated=False,
            job_creation_time=datetime.datetime.now(),
            # Lower bound (June) deliberately AFTER upper bound (May).
            datetimes_contained_lower_bound_exclusive=datetime.datetime(2020, 6, 11),
            datetimes_contained_upper_bound_inclusive=datetime.datetime(2020, 5, 11),
            export_time=datetime.datetime(2020, 5, 12),
            discovery_time=datetime.datetime(2020, 5, 12),
            normalized_file_name="foo.txt",
            processed_time=datetime.datetime(2020, 5, 13),
            ingest_database_name=DEFAULT_DB_NAME,
        )
        session.add(bad_row)
        with self.assertRaises(IntegrityError):
            session.commit()
def test_exportViewForArgs_alreadyExported(self):
    """If the metadata row already carries an export_time, exporting again
    should be a no-op: no BQ calls and the row left unchanged."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(region)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='ingest_view',
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2)
    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name='normalized_file_name',
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=_DATE_2,  # non-null export_time marks it already exported
        datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
    )
    # Row should come back exactly as stored — no attr.evolve here.
    expected_metadata = self.to_entity(metadata)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    export_manager.export_view_for_args(export_args)

    # Assert
    self.mock_client.create_table_from_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
    self.mock_client.delete_table.assert_not_called()
    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(one(assert_session.query(schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def test_exportViewForArgs_detectRowDeletionView(self) -> None:
    """For a row-deletion detection view with both bounds present, export
    should materialize lower- and upper-bound tables, export their
    EXCEPT DISTINCT diff (lower minus upper), clean up both tables, and
    stamp export_time on the metadata row."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(
        region, is_detect_row_deletion_view=True)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2,
    )
    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name="normalized_file_name",
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,
        datetimes_contained_lower_bound_exclusive=export_args.
        upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.
        upper_bound_datetime_to_export,
    )
    # Export is expected to set export_time to the frozen "now" (_DATE_4).
    expected_metadata = attr.evolve(self.to_entity(metadata),
                                    export_time=_DATE_4)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    # Lower-bound script is the upper-bound script with the table name swapped.
    expected_upper_bound_query = _DATE_2_UPPER_BOUND_CREATE_TABLE_SCRIPT
    expected_lower_bound_query = expected_upper_bound_query.replace(
        "2020_07_20_00_00_00_upper_bound", "2019_07_20_00_00_00_lower_bound")

    # Assert
    self.mock_client.run_query_async.assert_has_calls([
        mock.call(
            query_str=expected_upper_bound_query,
            query_parameters=[
                self.generate_query_params_for_date(_DATE_2)
            ],
        ),
        mock.call(
            query_str=expected_lower_bound_query,
            query_parameters=[
                self.generate_query_params_for_date(_DATE_1)
            ],
        ),
    ])
    # Lower bound is the first part of the subquery, not upper bound.
    # NOTE(review): there is no space between "...upper_bound`)" and
    # "ORDER BY" in this expected string — confirm this matches the actual
    # query-builder output rather than being a typo in the expectation.
    expected_query = (
        "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2019_07_20_00_00_00_lower_bound`) "
        "EXCEPT DISTINCT "
        "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound`)"
        "ORDER BY colA, colC;")
    self.assert_exported_to_gcs_with_query(expected_query)
    # Both intermediate tables should be cleaned up afterwards.
    self.mock_client.delete_table.assert_has_calls([
        mock.call(
            dataset_id="us_xx_ingest_views",
            table_id="ingest_view_2020_07_20_00_00_00_upper_bound",
        ),
        mock.call(
            dataset_id="us_xx_ingest_views",
            table_id="ingest_view_2019_07_20_00_00_00_lower_bound",
        ),
    ])
    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(
        one(
            assert_session.query(
                schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()