def test_get_ingest_view_metadata_pending_export_all_exported_in_region(self):
    """Once a registered export job completes the full register ->
    register-file-name -> mark-exported progression, it no longer appears in
    get_ingest_view_metadata_pending_export() for this region, even though a
    job registered for the other region's manager is still pending there.
    """
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 3, 3, 3))

    args_other_region = GcsfsIngestViewExportArgs(
        ingest_view_name='other_file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 3, 3, 3))

    with freeze_time('2015-01-02T03:06:06'):
        self.metadata_manager.register_ingest_file_export_job(args)
        self.metadata_manager_other_region.register_ingest_file_export_job(
            args_other_region)

    with freeze_time('2015-01-02T03:07:07'):
        path = self._make_unprocessed_path(
            'bucket/file_tag.csv',
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime.utcnow())
        metadata = self.metadata_manager.get_ingest_view_metadata_for_export_job(args)
        self.metadata_manager.register_ingest_view_export_file_name(
            metadata, path)
        # ... export actually performed in here
        self.metadata_manager.mark_ingest_view_exported(metadata)

        # Only this region's manager is queried, so the other region's
        # still-pending job does not show up here.
        self.assertEqual(
            [], self.metadata_manager.get_ingest_view_metadata_pending_export())
def test_gcsfs_ingest_view_export_args(self) -> None:
    """task_id_tag() encodes the view name and both bound datetimes, with a
    missing lower bound rendered as the literal string 'None'."""
    lower = datetime.datetime(2019, 1, 22, 11, 22, 33, 444444)
    upper = datetime.datetime(2019, 11, 22, 11, 22, 33, 444444)

    # (upper_bound_datetime_prev, expected task id tag)
    cases = [
        (None,
         "ingest_view_export_my_file_tag-None-2019_11_22_11_22_33_444444"),
        (lower,
         "ingest_view_export_my_file_tag-2019_01_22_11_22_33_444444-2019_11_22_11_22_33_444444"),
    ]
    for prev_bound, expected_tag in cases:
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_file_tag",
            upper_bound_datetime_prev=prev_bound,
            upper_bound_datetime_to_export=upper,
        )
        self.assertEqual(expected_tag, export_args.task_id_tag())
def test_get_ingest_view_metadata_for_most_recent_valid_job(self):
    """get_ingest_view_metadata_for_most_recent_valid_job() returns the most
    recently created non-invalidated export job row for the given tag;
    after that row is invalidated, the next-most-recent job is returned.
    """
    with freeze_time('2015-01-02T03:05:05'):
        self.metadata_manager.register_ingest_file_export_job(
            GcsfsIngestViewExportArgs(
                ingest_view_name='file_tag',
                upper_bound_datetime_prev=None,
                upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 2, 2, 2, 2)))
    with freeze_time('2015-01-02T03:06:06'):
        self.metadata_manager.register_ingest_file_export_job(
            GcsfsIngestViewExportArgs(
                ingest_view_name='file_tag',
                upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
                upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 3, 3, 3)))
    with freeze_time('2015-01-02T03:07:07'):
        # A job for a different tag must never be returned for 'file_tag'.
        self.metadata_manager.register_ingest_file_export_job(
            GcsfsIngestViewExportArgs(
                ingest_view_name='another_tag',
                upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 3, 3, 3, 3),
                upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 4, 4, 4)))

    most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
        'file_tag')

    self.assertIsNotNone(most_recent_valid_job)
    self.assertEqual('file_tag', most_recent_valid_job.file_tag)
    self.assertEqual(
        datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
    self.assertEqual(
        datetime.datetime(2015, 1, 2, 3, 3, 3, 3),
        most_recent_valid_job.datetimes_contained_upper_bound_inclusive)

    # Invalidate the row that was just returned
    session = SessionFactory.for_schema_base(OperationsBase)
    results = session.query(schema.DirectIngestIngestFileMetadata).filter_by(
        file_id=most_recent_valid_job.file_id).all()
    result = one(results)
    result.is_invalidated = True
    session.commit()
    # FIX: release the session's connection after use, matching the session
    # hygiene of the other tests in this file (they call session.close()).
    session.close()

    # The invalidated row is skipped; the earlier job for the same tag wins.
    most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
        'file_tag')
    self.assertIsNotNone(most_recent_valid_job)
    self.assertEqual('file_tag', most_recent_valid_job.file_tag)
    self.assertEqual(
        None, most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
    self.assertEqual(
        datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        most_recent_valid_job.datetimes_contained_upper_bound_inclusive)
def test_ingest_view_file_same_args_after_invalidation(self):
    """After the original metadata row is invalidated, the exact same export
    args can be registered again and run through the full file progression.
    """
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 3, 3, 3))

    ingest_view_unprocessed_path = self._make_unprocessed_path(
        'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(args, self.metadata_manager,
                                          ingest_view_unprocessed_path)

    # Invalidate the previous row
    session = SessionFactory.for_schema_base(OperationsBase)
    results = session.query(schema.DirectIngestIngestFileMetadata).all()
    result = one(results)
    result.is_invalidated = True
    session.commit()
    # FIX: release the session's connection after use, matching the session
    # hygiene of the other tests in this file (they call session.close()).
    session.close()

    # Now we can rerun with the same args
    ingest_view_unprocessed_path = self._make_unprocessed_path(
        'bucket/file_tag.csv',
        GcsfsDirectIngestFileType.INGEST_VIEW,
        dt=datetime.datetime.now())
    self.run_ingest_view_file_progression(args, self.metadata_manager,
                                          ingest_view_unprocessed_path)
def test_ingest_view_export(self, mock_supported, mock_region, mock_environment):
    """POSTing serialized GcsfsIngestViewExportArgs to /ingest_view_export
    returns 200 and forwards the deserialized args to the region
    controller's do_ingest_view_export()."""
    mock_supported.return_value = ['us_xx']

    region_code = 'us_xx'

    mock_environment.return_value = 'staging'
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='staging',
                                           ingestor=mock_controller)

    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='my_ingest_view',
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))

    request_args = {
        'region': region_code,
    }
    body = {
        'cloud_task_args': export_args.to_serializable(),
        'args_type': 'GcsfsIngestViewExportArgs',
    }
    body_encoded = json.dumps(body).encode()
    # NOTE(review): header presumably makes the request look like it came
    # from the GAE cron service, which the endpoint requires -- confirm.
    headers = {'X-Appengine-Cron': 'test-cron'}
    response = self.client.post('/ingest_view_export',
                                query_string=request_args,
                                headers=headers,
                                data=body_encoded)
    self.assertEqual(200, response.status_code)
    mock_controller.do_ingest_view_export.assert_called_with(export_args)
def get_ingest_view_export_task_args(self) -> List[GcsfsIngestViewExportArgs]:
    """Looks at what files have been exported for a given region and returns args for all the export jobs that
    should be started, given what has updated in the raw data tables since the last time we exported data. Also
    returns any tasks that have not yet completed.

    Raises:
        ValueError: if ingest view exports are not enabled for this region's
            environment.
    """
    if not self.region.are_ingest_view_exports_enabled_in_env():
        raise ValueError(
            f'Ingest view exports not enabled for region [{self.region.region_code}]')

    logging.info('Gathering export state for each ingest tag')
    ingest_view_to_export_state = {}
    for ingest_view_tag, ingest_view in self.ingest_views_by_tag.items():
        export_state = self._get_export_state_for_ingest_view(ingest_view)
        # Raises if raw file update dates are out of order (e.g. a backfill)
        # in a way that would invalidate pending or past exports.
        self._validate_ascending_raw_file_update_dates(export_state)
        ingest_view_to_export_state[ingest_view_tag] = export_state
    logging.info('Done gathering export state for each ingest tag')

    # At this point we know that we have no new raw data backfills that should invalidate either pending or past
    # completed ingest view exports (checked in _validate_ascending_raw_file_update_dates()). We can now generate
    # any new jobs.

    jobs_to_schedule = []
    # First, re-queue jobs that were registered earlier but whose export
    # never completed.
    metadata_pending_export = self.file_metadata_manager.get_ingest_view_metadata_pending_export()
    if metadata_pending_export:
        args_list = self._export_args_from_metadata(metadata_pending_export)
        jobs_to_schedule.extend(args_list)

    logging.info('Found [%s] already pending jobs to schedule.',
                 len(jobs_to_schedule))

    logging.info('Generating new ingest jobs.')
    for ingest_view_tag, export_state in ingest_view_to_export_state.items():
        # The first new job's exclusive lower bound picks up where the last
        # completed export for this view left off (None if never exported).
        lower_bound_datetime_exclusive = \
            export_state.last_export_metadata.datetimes_contained_upper_bound_inclusive \
            if export_state.last_export_metadata else None

        ingest_args_list = []
        for _date, upper_bound_datetime_inclusive in export_state.max_update_datetime_by_date:
            args = GcsfsIngestViewExportArgs(
                ingest_view_name=ingest_view_tag,
                upper_bound_datetime_prev=lower_bound_datetime_exclusive,
                upper_bound_datetime_to_export=upper_bound_datetime_inclusive)

            logging.info('Generating job args for tag [%s]: [%s].',
                         ingest_view_tag, args)

            # Register the job immediately so it shows up as pending (and
            # will be re-queued above) if scheduling fails before export.
            self.file_metadata_manager.register_ingest_file_export_job(args)

            ingest_args_list.append(args)
            # Chain the windows: this job's upper bound becomes the next
            # job's exclusive lower bound.
            lower_bound_datetime_exclusive = upper_bound_datetime_inclusive

        jobs_to_schedule.extend(ingest_args_list)

    logging.info('Returning [%s] jobs to schedule.', len(jobs_to_schedule))
    return jobs_to_schedule
def _export_args_from_metadata(
        metadata_list: List[DirectIngestIngestFileMetadata]) -> List[GcsfsIngestViewExportArgs]:
    """Translates each ingest file metadata row into the export args that
    would regenerate that file (same view tag and datetime bounds)."""
    export_args = []
    for row in metadata_list:
        export_args.append(
            GcsfsIngestViewExportArgs(
                ingest_view_name=row.file_tag,
                upper_bound_datetime_prev=row.datetimes_contained_lower_bound_exclusive,
                upper_bound_datetime_to_export=row.datetimes_contained_upper_bound_inclusive,
            ))
    return export_args
def test_ingest_view_file_progression_same_args_twice_throws(self) -> None:
    """Running the full file progression twice with identical export args
    raises IntegrityError (DB uniqueness constraint) and leaves only the
    original metadata row behind."""
    args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 3, 3, 3),
    )

    ingest_view_unprocessed_path = self._make_unprocessed_path(
        "bucket/file_tag.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(args, self.metadata_manager,
                                          ingest_view_unprocessed_path)

    with self.assertRaises(IntegrityError):
        # Fresh path, same args -- the duplicate registration is what fails.
        ingest_view_unprocessed_path = self._make_unprocessed_path(
            "bucket/file_tag.csv",
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime.now(),
        )
        self.run_ingest_view_file_progression(
            args, self.metadata_manager, ingest_view_unprocessed_path)

    session = SessionFactory.for_schema_base(OperationsBase)
    results = session.query(schema.DirectIngestIngestFileMetadata).all()
    self.assertEqual(1, len(results))
def test_get_ingest_view_metadata_pending_export_basic(self):
    """A freshly registered export job appears in
    get_ingest_view_metadata_pending_export() with job_creation_time taken
    from the frozen clock."""
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 3, 3, 3))

    with freeze_time('2015-01-02T03:06:06'):
        self.metadata_manager.register_ingest_file_export_job(args)

    expected_list = [
        DirectIngestIngestFileMetadata.new_with_defaults(
            region_code='US_XX',
            file_tag='file_tag',
            is_invalidated=False,
            is_file_split=False,
            # Matches the freeze_time timestamp above.
            job_creation_time=datetime.datetime(2015, 1, 2, 3, 6, 6),
            datetimes_contained_lower_bound_exclusive=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
            datetimes_contained_upper_bound_inclusive=datetime.datetime(2015, 1, 2, 3, 3, 3, 3))
    ]

    self.assertEqual(
        expected_list,
        self.metadata_manager.get_ingest_view_metadata_pending_export())
def test_ingest_view_file_progression_two_files_same_tag(self) -> None:
    """Two consecutive export windows for the same file tag can each run
    the full progression; both metadata rows end up with export_time and
    processed_time set."""
    args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
    )
    ingest_view_unprocessed_path_1 = self._make_unprocessed_path(
        "bucket/file_tag.csv",
        GcsfsDirectIngestFileType.INGEST_VIEW,
        dt=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
    )
    self.run_ingest_view_file_progression(args, self.metadata_manager,
                                          ingest_view_unprocessed_path_1)

    # Second window starts where the first ended: prev == first upper bound.
    args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 3, 3, 3, 3, 3),
    )
    ingest_view_unprocessed_path_2 = self._make_unprocessed_path(
        "bucket/file_tag.csv",
        GcsfsDirectIngestFileType.INGEST_VIEW,
        dt=datetime.datetime(2015, 1, 3, 3, 3, 3, 3),
    )
    self.run_ingest_view_file_progression(args, self.metadata_manager,
                                          ingest_view_unprocessed_path_2)

    session = SessionFactory.for_schema_base(OperationsBase)
    results = session.query(schema.DirectIngestIngestFileMetadata).all()

    self.assertEqual(
        {
            ingest_view_unprocessed_path_1.file_name,
            ingest_view_unprocessed_path_2.file_name,
        },
        {r.normalized_file_name for r in results},
    )
    for r in results:
        self.assertTrue(r.export_time)
        self.assertTrue(r.processed_time)
def test_ingest_view_file_progression(self):
    """Happy path: a single ingest view file completes the standard
    register -> export -> discover -> process progression."""
    window_start = datetime.datetime(2015, 1, 2, 2, 2, 2, 2)
    window_end = datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=window_start,
        upper_bound_datetime_to_export=window_end)

    unprocessed_path = self._make_unprocessed_path(
        'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(
        export_args, self.metadata_manager, unprocessed_path)
def run_parse_file_test(self, expected: IngestInfo,
                        fixture_file_name: str) -> IngestInfo:
    """Runs a test that reads and parses a given fixture file. Returns the
    parsed IngestInfo object for tests to run further validations."""
    args = ingest_args_for_fixture_file(self.controller,
                                        f"{fixture_file_name}.csv")

    if not isinstance(self.controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(
            f"Controller fs must have type "
            f"FakeGCSFileSystem. Found instead "
            f"type [{type(self.controller.fs.gcs_file_system)}]")

    if self.controller.region.is_ingest_launched_in_env():
        # Launched regions produce the fixture through the real ingest view
        # export flow: register an export job, then export it.
        now = datetime.datetime.now()
        yesterday = now - datetime.timedelta(days=1)
        ingest_file_export_job_args = GcsfsIngestViewExportArgs(
            ingest_view_name=fixture_file_name,
            upper_bound_datetime_to_export=now,
            upper_bound_datetime_prev=yesterday,
            output_bucket_name=self.controller.ingest_bucket_path.bucket_name,
        )
        self.controller.file_metadata_manager.register_ingest_file_export_job(
            ingest_file_export_job_args)
        self.controller.ingest_view_export_manager.export_view_for_args(
            ingest_file_export_job_args)
    else:
        # Otherwise drop the fixture file directly into the fake GCS.
        fixture_util.add_direct_ingest_path(
            self.controller.fs.gcs_file_system,
            args.file_path,
            region_code=self.controller.region_code(),
        )

    # pylint:disable=protected-access
    fixture_contents_handle = self.controller._get_contents_handle(args)

    if fixture_contents_handle is None:
        self.fail("fixture_contents_handle should not be None")
    final_info = self.controller._parse(args, fixture_contents_handle)

    print_visible_header_label("FINAL")
    print(final_info)

    print_visible_header_label("EXPECTED")
    print(expected)

    self.assertEqual(expected, final_info)

    return final_info
def test_exportViewForArgs_detectRowDeletionView_noLowerBound(self) -> None:
    """For a detect-row-deletion view with no lower bound, no BigQuery work
    runs (no queries, no GCS export, no table deletes), but the metadata row
    is still stamped with an export_time."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(
        region, is_detect_row_deletion_view=True)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=_DATE_2,
    )

    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name="normalized_file_name",
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,
        datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export,
    )
    # The only expected change is export_time being set to the frozen clock.
    expected_metadata = attr.evolve(self.to_entity(metadata),
                                    export_time=_DATE_4)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    # Assert
    self.mock_client.run_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
    self.mock_client.delete_table.assert_not_called()

    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(
        one(assert_session.query(
            schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def test_exportViewForArgs_noLowerBound(self):
    """With no lower-bound datetime, the export creates only the upper-bound
    table, exports a plain SELECT from it (ordered by colA, colC), deletes
    the table, and stamps export_time on the metadata row."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(region)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='ingest_view',
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=_DATE_2)

    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name='normalized_file_name',
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,
        datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
    )
    # The only expected change is export_time being set to the frozen clock.
    expected_metadata = attr.evolve(self.to_entity(metadata),
                                    export_time=_DATE_4)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    # Assert
    self.mock_client.create_table_from_query_async.assert_has_calls([
        mock.call(
            dataset_id='us_xx_ingest_views',
            overwrite=True,
            query=mock.ANY,
            query_parameters=[self.generate_query_params_for_date(
                export_args.upper_bound_datetime_to_export)],
            table_id='ingest_view_2020_07_20_00_00_00_upper_bound'),
    ])
    expected_query = \
        'SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound` ' \
        'ORDER BY colA, colC;'
    self.assert_exported_to_gcs_with_query(expected_query)
    # The intermediate upper-bound table is cleaned up after the export.
    self.mock_client.delete_table.assert_has_calls([
        mock.call(dataset_id='us_xx_ingest_views',
                  table_id='ingest_view_2020_07_20_00_00_00_upper_bound')])

    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(
        one(assert_session.query(
            schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def test_create_direct_ingest_ingest_view_export_task(
        self, mock_client: mock.MagicMock,
        mock_uuid: mock.MagicMock) -> None:
    """create_direct_ingest_ingest_view_export_task() enqueues a Cloud Task
    on the BQ import/export queue whose relative URI carries the region and
    output bucket, and whose body is the serialized export args."""
    # Arrange
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="my_ingest_view",
        output_bucket_name="my_ingest_bucket",
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
    )

    body = {
        "cloud_task_args": export_args.to_serializable(),
        "args_type": "GcsfsIngestViewExportArgs",
    }
    body_encoded = json.dumps(body).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    # NOTE(review): this date presumably matches a frozen clock configured
    # in the test fixture -- confirm against setUp.
    date = "2019-07-20"
    queue_path = f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}-path"

    task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + "/{}-{}-{}".format(
        _REGION.region_code, date, uuid)
    url_params = {
        "region": _REGION.region_code,
        "output_bucket": "my_ingest_bucket",
    }
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": f"/direct/ingest_view_export?{urlencode(url_params)}",
            "body": body_encoded,
        },
    )

    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().create_direct_ingest_ingest_view_export_task(
        _REGION, DirectIngestInstance.PRIMARY, export_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION,
        DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_ingest_then_split_progression(self) -> None:
    """An ingest view file that is subsequently split (split_file=True)
    still completes the full file progression."""
    window_start = datetime.datetime(2015, 1, 2, 2, 2, 2, 2)
    window_end = datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=window_start,
        upper_bound_datetime_to_export=window_end,
    )

    unprocessed_path = self._make_unprocessed_path(
        "bucket/file_tag.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(export_args,
                                          self.metadata_manager,
                                          unprocessed_path,
                                          split_file=True)
def test_getIngestViewExportTaskArgs_happy(self) -> None:
    """When raw data newer than the last completed export exists,
    get_ingest_view_export_task_args() returns one new job spanning from the
    last export's upper bound (_DATE_1) to the new raw data's bound
    (_DATE_2)."""
    # Arrange
    region = self.create_fake_region(ingest_view_exports_enabled=True)
    export_manager = self.create_export_manager(region)
    # Last completed export for this view covered data up to _DATE_1.
    export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
        return_value=DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag="ingest_view",
            normalized_file_name="normalized_file_name",
            processed_time=_DATE_1,
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=_DATE_1,
            datetimes_contained_lower_bound_exclusive=_DATE_1,
            datetimes_contained_upper_bound_inclusive=_DATE_1,
            discovery_time=_DATE_1,
        ))
    # New raw data discovered after that export, with bound _DATE_2.
    export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
        return_value=[
            DirectIngestRawFileMetadata(
                file_id=2,
                region_code=region.region_code,
                file_tag="ingest_view",
                discovery_time=_DATE_2,
                normalized_file_name="unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                processed_time=None,
                datetimes_contained_upper_bound_inclusive=_DATE_2,
            )
        ])

    # Act
    args = export_manager.get_ingest_view_export_task_args()

    # Assert
    self.assertListEqual(
        args,
        [
            GcsfsIngestViewExportArgs(
                ingest_view_name="ingest_view",
                upper_bound_datetime_prev=_DATE_1,
                upper_bound_datetime_to_export=_DATE_2,
            )
        ],
    )
def _run_ingest_job_for_filename(self, filename: str) -> None:
    """Runs ingest for the ingest view file with the given unnormalized file name."""
    # Patch the region lookup used during entity matching so persistence
    # sees the same fake region as the controller.
    get_region_patcher = patch(
        "recidiviz.persistence.entity_matching.state."
        "base_state_matching_delegate.get_region")
    mock_get_region = get_region_patcher.start()
    mock_get_region.return_value = self._fake_region()

    environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
    environ_patcher.start()

    file_type = (GcsfsDirectIngestFileType.INGEST_VIEW
                 if self.controller.region.is_raw_vs_ingest_file_name_detection_enabled()
                 else None)

    if not isinstance(self.controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(
            f"Controller fs must have type "
            f"FakeGCSFileSystem. Found instead "
            f"type [{type(self.controller.fs.gcs_file_system)}]")

    if self.controller.region.are_ingest_view_exports_enabled_in_env():
        # Produce the file through the real ingest view export flow:
        # register an export job for the view, then export it.
        now = datetime.datetime.utcnow()
        yesterday = now - datetime.timedelta(days=1)
        ingest_file_export_job_args = GcsfsIngestViewExportArgs(
            ingest_view_name=os.path.splitext(filename)[0],
            upper_bound_datetime_to_export=now,
            upper_bound_datetime_prev=yesterday,
        )
        self.controller.file_metadata_manager.register_ingest_file_export_job(
            ingest_file_export_job_args)
        self.controller.ingest_view_export_manager.export_view_for_args(
            ingest_file_export_job_args)
    else:
        # Legacy flow: drop a normalized fixture path directly into fake GCS.
        file_path = path_for_fixture_file(self.controller,
                                          filename,
                                          file_type=file_type,
                                          should_normalize=True)
        self.controller.fs.gcs_file_system.test_add_path(file_path, filename)

    run_task_queues_to_empty(self.controller)

    get_region_patcher.stop()
    environ_patcher.stop()
def test_exportViewForArgs_noExistingMetadata(self):
    """Exporting args that were never registered (no metadata row) raises
    ValueError and performs no BigQuery work."""
    # Arrange
    test_region = self.create_fake_region()
    manager = self.create_export_manager(test_region)
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='ingest_view',
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2)

    # Act
    with pytest.raises(ValueError):
        manager.export_view_for_args(args)

    # Assert
    self.mock_client.create_table_from_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
    self.mock_client.delete_table.assert_not_called()
def test_create_direct_ingest_ingest_view_export_task(
        self, mock_client, mock_uuid):
    """create_direct_ingest_ingest_view_export_task() enqueues a Cloud Task
    whose relative URI carries the region code and whose body holds the
    serialized export args."""
    # Arrange
    project_id = 'recidiviz-456'
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='my_ingest_view',
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))
    body = {
        'cloud_task_args': export_args.to_serializable(),
        'args_type': 'GcsfsIngestViewExportArgs'
    }
    body_encoded = json.dumps(body).encode()
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid
    # NOTE(review): this date presumably matches a frozen clock configured
    # in the test fixture -- confirm against setUp.
    date = '2019-07-20'
    queue_path = _REGION.shared_queue + '-path'

    task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + '/{}-{}-{}'.format(
        _REGION.region_code, date, uuid)
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri':
                f'/direct/ingest_view_export?region={_REGION.region_code}',
            'body': body_encoded
        })

    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(
        project_id=project_id
    ).create_direct_ingest_ingest_view_export_task(_REGION, export_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_register_ingest_view_export_file_name_already_exists_raises(self):
    """Registering an export file name twice for the same metadata row
    raises ValueError."""
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(2015, 1, 2, 3, 3, 3, 3))
    metadata_entity = self.metadata_manager.register_ingest_file_export_job(args)
    self.metadata_manager.register_ingest_view_export_file_name(
        metadata_entity,
        self._make_unprocessed_path('bucket/file_tag.csv',
                                    GcsfsDirectIngestFileType.INGEST_VIEW))

    with self.assertRaises(ValueError):
        # Second registration for the same metadata row must fail.
        self.metadata_manager.register_ingest_view_export_file_name(
            metadata_entity,
            self._make_unprocessed_path('bucket/file_tag.csv',
                                        GcsfsDirectIngestFileType.INGEST_VIEW))
def test_exportViewForArgs_ingestViewExportsDisabled(self) -> None:
    """Exporting for a region with ingest view exports disabled raises
    ValueError and performs no BigQuery work at all."""
    # Arrange
    disabled_region = self.create_fake_region(
        ingest_view_exports_enabled=False)
    manager = self.create_export_manager(disabled_region)
    args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2,
    )

    # Act
    with pytest.raises(ValueError):
        manager.export_view_for_args(args)

    # Assert
    self.mock_client.create_dataset_if_necessary.assert_not_called()
    self.mock_client.run_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
    self.mock_client.delete_table.assert_not_called()
def test_ingest_view_export(
    self,
    mock_supported: mock.MagicMock,
    mock_region: mock.MagicMock,
    mock_environment: mock.MagicMock,
) -> None:
    """POSTing serialized GcsfsIngestViewExportArgs to /ingest_view_export
    returns 200 and forwards the deserialized args to the region
    controller's do_ingest_view_export()."""
    mock_supported.return_value = ["us_xx"]

    region_code = "us_xx"

    mock_environment.return_value = "staging"
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment="staging",
                                           ingestor=mock_controller)

    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="my_ingest_view",
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
    )

    request_args = {
        "region": region_code,
    }
    body = {
        "cloud_task_args": export_args.to_serializable(),
        "args_type": "GcsfsIngestViewExportArgs",
    }
    body_encoded = json.dumps(body).encode()
    # NOTE(review): header presumably makes the request look like it came
    # from the GAE cron service, which the endpoint requires -- confirm.
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.post(
        "/ingest_view_export",
        query_string=request_args,
        headers=headers,
        data=body_encoded,
    )
    self.assertEqual(200, response.status_code)
    mock_controller.do_ingest_view_export.assert_called_with(export_args)
def _run_ingest_job_for_filename(self, filename: str) -> None:
    """Runs ingest for the ingest view file with the given unnormalized file name."""
    environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
    environ_patcher.start()

    file_type = GcsfsDirectIngestFileType.INGEST_VIEW

    if not isinstance(self.controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(
            f"Controller fs must have type "
            f"FakeGCSFileSystem. Found instead "
            f"type [{type(self.controller.fs.gcs_file_system)}]")

    if self.controller.region.is_ingest_launched_in_env():
        # Launched regions produce the file through the real ingest view
        # export flow: register an export job, then export it.
        now = datetime.datetime.now(tz=pytz.UTC)
        yesterday = now - datetime.timedelta(days=1)
        ingest_file_export_job_args = GcsfsIngestViewExportArgs(
            ingest_view_name=os.path.splitext(filename)[0],
            upper_bound_datetime_to_export=now,
            upper_bound_datetime_prev=yesterday,
            output_bucket_name=self.controller.ingest_bucket_path.bucket_name,
        )
        self.controller.file_metadata_manager.register_ingest_file_export_job(
            ingest_file_export_job_args)
        self.controller.ingest_view_export_manager.export_view_for_args(
            ingest_file_export_job_args)
    else:
        # Otherwise drop a normalized fixture path directly into fake GCS.
        file_path = path_for_fixture_file(self.controller,
                                          filename,
                                          file_type=file_type,
                                          should_normalize=True)
        self.controller.fs.gcs_file_system.test_add_path(file_path, filename)

    run_task_queues_to_empty(self.controller)

    environ_patcher.stop()
def run_parse_file_test(self, expected: IngestInfo,
                        fixture_file_name: str) -> IngestInfo:
    """Runs a test that reads and parses a given fixture file. Returns the
    parsed IngestInfo object for tests to run further validations."""
    args = ingest_args_for_fixture_file(self.controller,
                                        f'{fixture_file_name}.csv')

    if not isinstance(self.controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(self.controller.fs)}]")

    if self.controller.region.are_ingest_view_exports_enabled_in_env():
        # Produce the fixture through the real ingest view export flow:
        # register an export job, then export it.
        ingest_file_export_job_args = GcsfsIngestViewExportArgs(
            ingest_view_name=fixture_file_name,
            upper_bound_datetime_to_export=datetime.datetime.utcnow(),
            upper_bound_datetime_prev=None)
        self.controller.file_metadata_manager.register_ingest_file_export_job(
            ingest_file_export_job_args)
        self.controller.ingest_view_export_manager.export_view_for_args(
            ingest_file_export_job_args)
    else:
        # Otherwise drop the fixture file directly into the fake filesystem.
        self.controller.fs.test_add_path(args.file_path)

    # pylint:disable=protected-access
    fixture_contents_handle = self.controller._get_contents_handle(args)
    # FIX: fail explicitly when the contents handle is missing instead of
    # passing None into _parse() and getting an opaque error (matches the
    # newer version of this helper elsewhere in the file).
    if fixture_contents_handle is None:
        self.fail('fixture_contents_handle should not be None')
    final_info = self.controller._parse(args, fixture_contents_handle)

    print_visible_header_label('FINAL')
    print(final_info)

    print_visible_header_label('EXPECTED')
    print(expected)

    self.assertEqual(expected, final_info)

    return final_info
def test_exportViewForArgs_alreadyExported(self):
    """If the metadata row already has an export_time set,
    export_view_for_args() is a no-op: no BigQuery calls are made and the
    row is unchanged."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(region)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='ingest_view',
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2)

    session = SessionFactory.for_schema_base(OperationsBase)
    # export_time is already set -- this export is considered complete.
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name='normalized_file_name',
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=_DATE_2,
        datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
    )
    expected_metadata = self.to_entity(metadata)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    export_manager.export_view_for_args(export_args)

    # Assert
    self.mock_client.create_table_from_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
    self.mock_client.delete_table.assert_not_called()

    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(
        one(assert_session.query(
            schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
# NOTE(review): the next two lines are the tail of a list comprehension whose
# opening lies outside this chunk (compare the complete
# `_export_args_from_metadata` elsewhere in this file); preserved verbatim.
for metadata in metadata_list
]


if __name__ == "__main__":
    # Update these variables and run to print an export query you can run in the BigQuery UI
    region_code_: str = "us_mo"
    ingest_view_name_: str = "tak001_offender_identification"
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(2020, 10, 15)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(2020, 12, 18)

    with local_project_id_override(GCP_PROJECT_STAGING):
        # Collect and build every ingest view for the region so the debug
        # query can look up the requested view by its tag.
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        view_collector_ = DirectIngestPreProcessedIngestViewCollector(region_, [])
        views_by_tag_ = {
            builder.file_tag: builder.build()
            for builder in view_collector_.collect_view_builders()
        }
        debug_query = DirectIngestIngestViewExportManager.debug_query_for_args(
            views_by_tag_,
            GcsfsIngestViewExportArgs(
                ingest_view_name=ingest_view_name_,
                upper_bound_datetime_prev=upper_bound_datetime_prev_,
                upper_bound_datetime_to_export=upper_bound_datetime_to_export_,
                output_bucket_name="any_bucket",
            ),
        )
        print(debug_query)
def test_exportViewForArgs_detectRowDeletionView(self) -> None:
    """Exporting a not-yet-exported job for a row-deletion-detection view runs
    both bound queries and builds the diff with the LOWER bound first, so the
    result is (lower EXCEPT upper) rather than the usual (upper EXCEPT lower).
    """
    # Arrange: a registered-but-unexported metadata row (export_time=None).
    region = self.create_fake_region()
    export_manager = self.create_export_manager(
        region, is_detect_row_deletion_view=True)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2,
    )

    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name="normalized_file_name",
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,  # not exported yet — the export should proceed
        datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export,
    )
    # After a successful export the only change to the row should be that
    # export_time is stamped with the frozen "now" (_DATE_4).
    expected_metadata = attr.evolve(self.to_entity(metadata),
                                    export_time=_DATE_4)
    session.add(metadata)
    session.commit()
    session.close()

    # Act: freeze time so the recorded export_time is deterministic.
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    expected_upper_bound_query = _DATE_2_UPPER_BOUND_CREATE_TABLE_SCRIPT
    # The lower-bound script is identical except for the temp-table name/date.
    expected_lower_bound_query = expected_upper_bound_query.replace(
        "2020_07_20_00_00_00_upper_bound", "2019_07_20_00_00_00_lower_bound")

    # Assert: both bound tables were materialized, upper bound first, each
    # parameterized with its own date.
    self.mock_client.run_query_async.assert_has_calls([
        mock.call(
            query_str=expected_upper_bound_query,
            query_parameters=[self.generate_query_params_for_date(_DATE_2)],
        ),
        mock.call(
            query_str=expected_lower_bound_query,
            query_parameters=[self.generate_query_params_for_date(_DATE_1)],
        ),
    ])

    # Lower bound is the first part of the subquery, not upper bound.
    expected_query = (
        "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2019_07_20_00_00_00_lower_bound`) "
        "EXCEPT DISTINCT "
        "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound`)"
        "ORDER BY colA, colC;")
    self.assert_exported_to_gcs_with_query(expected_query)

    # Both temp tables are cleaned up after the export.
    self.mock_client.delete_table.assert_has_calls([
        mock.call(
            dataset_id="us_xx_ingest_views",
            table_id="ingest_view_2020_07_20_00_00_00_upper_bound",
        ),
        mock.call(
            dataset_id="us_xx_ingest_views",
            table_id="ingest_view_2019_07_20_00_00_00_lower_bound",
        ),
    ])

    # The single metadata row now matches the expected post-export entity.
    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(
        one(
            assert_session.query(
                schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def test_debugQueryForArgs(self) -> None:
    """debug_query_for_args returns one self-contained script — two CREATE
    TEMP TABLE statements (upper bound then lower bound) followed by the
    EXCEPT DISTINCT diff — suitable for pasting directly into the BQ UI.

    NOTE(review): the internal line layout of the expected fixture below was
    reconstructed from a whitespace-mangled source — verify against the
    generator's actual output.
    """
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(region)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2,
    )

    # Act: time is frozen only for determinism; the query text itself is
    # driven by the args' date bounds, not "now".
    with freeze_time(_DATE_4.isoformat()):
        debug_query = DirectIngestIngestViewExportManager.debug_query_for_args(
            export_manager.ingest_views_by_tag, export_args)

    expected_debug_query = """CREATE TEMP TABLE ingest_view_2020_07_20_00_00_00_upper_bound AS (

WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT
            col_name_1a, col_name_1b,
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE
            update_datetime <= DATETIME(2020, 7, 20, 0, 0, 0)
    )
    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
),
tagFullHistoricalExport_generated_view AS (
    WITH max_update_datetime AS (
        SELECT
            MAX(update_datetime) AS update_datetime
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime <= DATETIME(2020, 7, 20, 0, 0, 0)
    ),
    max_file_id AS (
        SELECT
            MAX(file_id) AS file_id
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime = (SELECT update_datetime FROM max_update_datetime)
    ),
    rows_with_recency_rank AS (
        SELECT
            COL_1,
            ROW_NUMBER() OVER (PARTITION BY COL_1 ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            file_id = (SELECT file_id FROM max_file_id)
    )
    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
select * from file_tag_first_generated_view JOIN tagFullHistoricalExport_generated_view USING (COL_1)
ORDER BY colA, colC

);
CREATE TEMP TABLE ingest_view_2019_07_20_00_00_00_lower_bound AS (

WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT
            col_name_1a, col_name_1b,
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE
            update_datetime <= DATETIME(2019, 7, 20, 0, 0, 0)
    )
    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
),
tagFullHistoricalExport_generated_view AS (
    WITH max_update_datetime AS (
        SELECT
            MAX(update_datetime) AS update_datetime
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime <= DATETIME(2019, 7, 20, 0, 0, 0)
    ),
    max_file_id AS (
        SELECT
            MAX(file_id) AS file_id
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime = (SELECT update_datetime FROM max_update_datetime)
    ),
    rows_with_recency_rank AS (
        SELECT
            COL_1,
            ROW_NUMBER() OVER (PARTITION BY COL_1 ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            file_id = (SELECT file_id FROM max_file_id)
    )
    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
select * from file_tag_first_generated_view JOIN tagFullHistoricalExport_generated_view USING (COL_1)
ORDER BY colA, colC

);
(
SELECT * FROM ingest_view_2020_07_20_00_00_00_upper_bound
) EXCEPT DISTINCT (
SELECT * FROM ingest_view_2019_07_20_00_00_00_lower_bound
)
ORDER BY colA, colC;"""

    # Assert
    self.assertEqual(expected_debug_query, debug_query)
)
for metadata in metadata_list
]


if __name__ == "__main__":
    # Update these variables and run to print an export query you can run in the BigQuery UI
    region_code_: str = "us_mo"
    ingest_view_name_: str = "tak001_offender_identification"
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(2020, 10, 15)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(2020, 12, 18)

    # Always runs against staging so a debug run cannot touch production data.
    with local_project_id_override(GCP_PROJECT_STAGING):
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        view_collector_ = DirectIngestPreProcessedIngestViewCollector(region_, [])
        # Build every ingest view for the region, keyed by file tag, so the
        # query builder can look up the view named above.
        views_by_tag_ = {
            builder.file_tag: builder.build()
            for builder in view_collector_.collect_view_builders()
        }

        debug_query = DirectIngestIngestViewExportManager.debug_query_for_args(
            views_by_tag_,
            GcsfsIngestViewExportArgs(
                ingest_view_name=ingest_view_name_,
                upper_bound_datetime_prev=upper_bound_datetime_prev_,
                upper_bound_datetime_to_export=upper_bound_datetime_to_export_,
            ),
        )
        print(debug_query)