def test_get_ingest_view_metadata_pending_export_all_exported_in_region(
            self):
        """Once this region's only job is exported, nothing is pending for it,
        even though another region still has an unexported job."""
        lower_bound = datetime.datetime(2015, 1, 2, 2, 2, 2, 2)
        upper_bound = datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=lower_bound,
            upper_bound_datetime_to_export=upper_bound)
        other_region_args = GcsfsIngestViewExportArgs(
            ingest_view_name='other_file_tag',
            upper_bound_datetime_prev=lower_bound,
            upper_bound_datetime_to_export=upper_bound)

        # Register one export job in each region's metadata manager.
        with freeze_time('2015-01-02T03:06:06'):
            self.metadata_manager.register_ingest_file_export_job(export_args)
            self.metadata_manager_other_region.register_ingest_file_export_job(
                other_region_args)

        # Complete the export flow for this region only.
        with freeze_time('2015-01-02T03:07:07'):
            ingest_view_path = self._make_unprocessed_path(
                'bucket/file_tag.csv',
                file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=datetime.datetime.utcnow())
            job_metadata = self.metadata_manager.get_ingest_view_metadata_for_export_job(
                export_args)
            self.metadata_manager.register_ingest_view_export_file_name(
                job_metadata, ingest_view_path)
            # ... export actually performed in here
            self.metadata_manager.mark_ingest_view_exported(job_metadata)

        # This region should now report no pending exports.
        self.assertEqual(
            [],
            self.metadata_manager.get_ingest_view_metadata_pending_export())
# --- Code example #2 ---
    def test_gcsfs_ingest_view_export_args(self) -> None:
        """task_id_tag() encodes both bound datetimes, rendering a missing
        lower bound as the literal string 'None'."""
        lower = datetime.datetime(2019, 1, 22, 11, 22, 33, 444444)
        upper = datetime.datetime(2019, 11, 22, 11, 22, 33, 444444)

        # No lower bound: the tag embeds 'None'.
        no_lower_args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_file_tag",
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=upper,
        )
        self.assertEqual(
            "ingest_view_export_my_file_tag-None-2019_11_22_11_22_33_444444",
            no_lower_args.task_id_tag(),
        )

        # Both bounds present: both datetimes are formatted into the tag.
        bounded_args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_file_tag",
            upper_bound_datetime_prev=lower,
            upper_bound_datetime_to_export=upper,
        )
        self.assertEqual(
            "ingest_view_export_my_file_tag-2019_01_22_11_22_33_444444-2019_11_22_11_22_33_444444",
            bounded_args.task_id_tag(),
        )
    def test_get_ingest_view_metadata_for_most_recent_valid_job(self):
        """The newest non-invalidated job for a tag is returned; once that row
        is invalidated, the query falls back to the next-most-recent job."""
        # Oldest 'file_tag' job: no lower bound (first export for the tag).
        with freeze_time('2015-01-02T03:05:05'):
            self.metadata_manager.register_ingest_file_export_job(
                GcsfsIngestViewExportArgs(
                    ingest_view_name='file_tag',
                    upper_bound_datetime_prev=None,
                    upper_bound_datetime_to_export=datetime.datetime(
                        2015, 1, 2, 2, 2, 2, 2)))

        # Newer 'file_tag' job, chained off the first job's upper bound.
        with freeze_time('2015-01-02T03:06:06'):
            self.metadata_manager.register_ingest_file_export_job(
                GcsfsIngestViewExportArgs(
                    ingest_view_name='file_tag',
                    upper_bound_datetime_prev=datetime.datetime(
                        2015, 1, 2, 2, 2, 2, 2),
                    upper_bound_datetime_to_export=datetime.datetime(
                        2015, 1, 2, 3, 3, 3, 3)))

        # Job for a different tag — must not be returned for 'file_tag'.
        with freeze_time('2015-01-02T03:07:07'):
            self.metadata_manager.register_ingest_file_export_job(
                GcsfsIngestViewExportArgs(
                    ingest_view_name='another_tag',
                    upper_bound_datetime_prev=datetime.datetime(
                        2015, 1, 2, 3, 3, 3, 3),
                    upper_bound_datetime_to_export=datetime.datetime(
                        2015, 1, 2, 3, 4, 4, 4)))

        most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
            'file_tag')

        # Expect the second (newest) 'file_tag' job.
        self.assertIsNotNone(most_recent_valid_job)
        self.assertEqual('file_tag', most_recent_valid_job.file_tag)
        self.assertEqual(
            datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
            most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
        self.assertEqual(
            datetime.datetime(2015, 1, 2, 3, 3, 3, 3),
            most_recent_valid_job.datetimes_contained_upper_bound_inclusive)

        # Invalidate the row that was just returned
        session = SessionFactory.for_schema_base(OperationsBase)
        results = session.query(
            schema.DirectIngestIngestFileMetadata).filter_by(
                file_id=most_recent_valid_job.file_id).all()
        result = one(results)
        result.is_invalidated = True
        session.commit()

        # The query should now skip the invalidated row and return the oldest job.
        most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
            'file_tag')
        self.assertIsNotNone(most_recent_valid_job)
        self.assertEqual('file_tag', most_recent_valid_job.file_tag)
        self.assertEqual(
            None,
            most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
        self.assertEqual(
            datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
            most_recent_valid_job.datetimes_contained_upper_bound_inclusive)
    def test_ingest_view_file_same_args_after_invalidation(self):
        """After the metadata row from a completed run is invalidated, the
        exact same export args can be registered and run again."""
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))

        first_path = self._make_unprocessed_path(
            'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(export_args,
                                              self.metadata_manager,
                                              first_path)

        # Flip is_invalidated on the single metadata row produced above.
        session = SessionFactory.for_schema_base(OperationsBase)
        row = one(session.query(schema.DirectIngestIngestFileMetadata).all())
        row.is_invalidated = True
        session.commit()

        # With the old row invalidated, rerunning the same args succeeds.
        second_path = self._make_unprocessed_path(
            'bucket/file_tag.csv',
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime.now())
        self.run_ingest_view_file_progression(export_args,
                                              self.metadata_manager,
                                              second_path)
# --- Code example #5 ---
    def test_ingest_view_export(self, mock_supported, mock_region,
                                mock_environment):
        """POSTing to /ingest_view_export hands the deserialized export args
        to the region's controller."""
        region_code = 'us_xx'
        mock_supported.return_value = [region_code]
        mock_environment.return_value = 'staging'

        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='my_ingest_view',
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))

        # The endpoint expects the serialized args plus their type name.
        request_body = json.dumps({
            'cloud_task_args': export_args.to_serializable(),
            'args_type': 'GcsfsIngestViewExportArgs',
        }).encode()

        response = self.client.post('/ingest_view_export',
                                    query_string={'region': region_code},
                                    headers={'X-Appengine-Cron': 'test-cron'},
                                    data=request_body)

        self.assertEqual(200, response.status_code)
        mock_controller.do_ingest_view_export.assert_called_with(export_args)
# --- Code example #6 ---
    def get_ingest_view_export_task_args(
            self) -> List[GcsfsIngestViewExportArgs]:
        """Looks at what files have been exported for a given region and returns args for all the export jobs that
        should be started, given what has updated in the raw data tables since the last time we exported data. Also
        returns any tasks that have not yet completed.

        Raises:
            ValueError: if ingest view exports are not enabled for this region.
        """
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(
                f'Ingest view exports not enabled for region [{self.region.region_code}]'
            )

        logging.info('Gathering export state for each ingest tag')
        ingest_view_to_export_state = {}
        for ingest_view_tag, ingest_view in self.ingest_views_by_tag.items():
            export_state = self._get_export_state_for_ingest_view(ingest_view)
            self._validate_ascending_raw_file_update_dates(export_state)
            ingest_view_to_export_state[ingest_view_tag] = export_state
        logging.info('Done gathering export state for each ingest tag')

        # At this point we know that we have no new raw data backfills that should invalidate either pending or past
        # completed ingest view exports (checked in _validate_ascending_raw_file_update_dates()). We can now generate
        # any new jobs.

        # First, re-queue any jobs that were registered earlier but never exported.
        jobs_to_schedule = []
        metadata_pending_export = self.file_metadata_manager.get_ingest_view_metadata_pending_export(
        )
        if metadata_pending_export:
            args_list = self._export_args_from_metadata(
                metadata_pending_export)
            jobs_to_schedule.extend(args_list)

        logging.info('Found [%s] already pending jobs to schedule.',
                     len(jobs_to_schedule))

        logging.info('Generating new ingest jobs.')
        for ingest_view_tag, export_state in ingest_view_to_export_state.items(
        ):
            # The first new job's lower bound is the upper bound of the last
            # completed export (None if this tag has never been exported).
            lower_bound_datetime_exclusive = \
                export_state.last_export_metadata.datetimes_contained_upper_bound_inclusive \
                if export_state.last_export_metadata else None

            ingest_args_list = []
            # NOTE(review): assumes max_update_datetime_by_date yields
            # (date, max update datetime) pairs in ascending date order so the
            # generated jobs chain correctly — confirm against
            # _get_export_state_for_ingest_view().
            for _date, upper_bound_datetime_inclusive in export_state.max_update_datetime_by_date:
                args = GcsfsIngestViewExportArgs(
                    ingest_view_name=ingest_view_tag,
                    upper_bound_datetime_prev=lower_bound_datetime_exclusive,
                    upper_bound_datetime_to_export=
                    upper_bound_datetime_inclusive)
                logging.info('Generating job args for tag [%s]: [%s].',
                             ingest_view_tag, args)

                # Record the new job via the file metadata manager before
                # adding it to the schedule list.
                self.file_metadata_manager.register_ingest_file_export_job(
                    args)
                ingest_args_list.append(args)
                # Each job's upper bound becomes the next job's lower bound.
                lower_bound_datetime_exclusive = upper_bound_datetime_inclusive

            jobs_to_schedule.extend(ingest_args_list)

        logging.info('Returning [%s] jobs to schedule.', len(jobs_to_schedule))
        return jobs_to_schedule
# --- Code example #7 ---
 def _export_args_from_metadata(
         metadata_list: List[DirectIngestIngestFileMetadata]) -> List[GcsfsIngestViewExportArgs]:
     """Builds export args mirroring each metadata row's tag and datetime bounds."""
     args_list = []
     for metadata in metadata_list:
         args_list.append(
             GcsfsIngestViewExportArgs(
                 ingest_view_name=metadata.file_tag,
                 upper_bound_datetime_prev=metadata.datetimes_contained_lower_bound_exclusive,
                 upper_bound_datetime_to_export=metadata.datetimes_contained_upper_bound_inclusive))
     return args_list
# --- Code example #8 ---
    def test_ingest_view_file_progression_same_args_twice_throws(self) -> None:
        """Running the progression twice with identical args violates a DB
        uniqueness constraint, leaving only one metadata row."""
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3),
        )

        first_path = self._make_unprocessed_path(
            "bucket/file_tag.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(export_args,
                                              self.metadata_manager,
                                              first_path)

        # A second run with the same args must hit the integrity constraint.
        with self.assertRaises(IntegrityError):
            second_path = self._make_unprocessed_path(
                "bucket/file_tag.csv",
                GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=datetime.datetime.now(),
            )
            self.run_ingest_view_file_progression(
                export_args, self.metadata_manager, second_path)

        # Only the first run's metadata row should exist.
        session = SessionFactory.for_schema_base(OperationsBase)
        rows = session.query(schema.DirectIngestIngestFileMetadata).all()
        self.assertEqual(1, len(rows))
    def test_get_ingest_view_metadata_pending_export_basic(self):
        """A freshly registered export job is reported as pending export."""
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))
        with freeze_time('2015-01-02T03:06:06'):
            self.metadata_manager.register_ingest_file_export_job(export_args)

        # The pending row mirrors the args, with the frozen creation time.
        expected_pending = [
            DirectIngestIngestFileMetadata.new_with_defaults(
                region_code='US_XX',
                file_tag='file_tag',
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=datetime.datetime(2015, 1, 2, 3, 6, 6),
                datetimes_contained_lower_bound_exclusive=datetime.datetime(
                    2015, 1, 2, 2, 2, 2, 2),
                datetimes_contained_upper_bound_inclusive=datetime.datetime(
                    2015, 1, 2, 3, 3, 3, 3))
        ]

        self.assertEqual(
            expected_pending,
            self.metadata_manager.get_ingest_view_metadata_pending_export())
# --- Code example #10 ---
    def test_ingest_view_file_progression_two_files_same_tag(self) -> None:
        """Two sequential exports for one tag each complete fully, leaving two
        exported-and-processed metadata rows."""
        first_args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
        )
        first_path = self._make_unprocessed_path(
            "bucket/file_tag.csv",
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        )
        self.run_ingest_view_file_progression(first_args,
                                              self.metadata_manager,
                                              first_path)

        # The second export picks up at the first one's upper bound.
        second_args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 3, 3, 3, 3, 3),
        )
        second_path = self._make_unprocessed_path(
            "bucket/file_tag.csv",
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime(2015, 1, 3, 3, 3, 3, 3),
        )
        self.run_ingest_view_file_progression(second_args,
                                              self.metadata_manager,
                                              second_path)

        session = SessionFactory.for_schema_base(OperationsBase)
        rows = session.query(schema.DirectIngestIngestFileMetadata).all()

        # Both files are present, and both completed export + processing.
        self.assertEqual(
            {first_path.file_name, second_path.file_name},
            {row.normalized_file_name for row in rows},
        )
        for row in rows:
            self.assertTrue(row.export_time)
            self.assertTrue(row.processed_time)
    def test_ingest_view_file_progression(self):
        """A single ingest view file moves through the full file progression."""
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))

        unprocessed_path = self._make_unprocessed_path(
            'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(export_args,
                                              self.metadata_manager,
                                              unprocessed_path)
# --- Code example #12 ---
    def run_parse_file_test(self, expected: IngestInfo,
                            fixture_file_name: str) -> IngestInfo:
        """Runs a test that reads and parses a given fixture file. Returns the
        parsed IngestInfo object for tests to run further validations.

        Args:
            expected: the IngestInfo the parsed fixture must equal.
            fixture_file_name: fixture name without the '.csv' extension.

        Returns:
            The IngestInfo produced by the controller's parser.
        """
        args = ingest_args_for_fixture_file(self.controller,
                                            f"{fixture_file_name}.csv")

        # Guard: the test harness only works against the fake filesystem.
        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.is_ingest_launched_in_env():
            # Launched regions: register and run a real ingest view export
            # (covering the last day) so the file is produced through the
            # actual export path.
            now = datetime.datetime.now()
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=fixture_file_name,
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
                output_bucket_name=self.controller.ingest_bucket_path.
                bucket_name,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            # Otherwise, drop the fixture file into the fake filesystem directly.
            fixture_util.add_direct_ingest_path(
                self.controller.fs.gcs_file_system,
                args.file_path,
                region_code=self.controller.region_code(),
            )

        # pylint:disable=protected-access
        fixture_contents_handle = self.controller._get_contents_handle(args)

        if fixture_contents_handle is None:
            self.fail("fixture_contents_handle should not be None")
        final_info = self.controller._parse(args, fixture_contents_handle)

        # Print both objects to ease debugging when the assertion fails.
        print_visible_header_label("FINAL")
        print(final_info)

        print_visible_header_label("EXPECTED")
        print(expected)

        self.assertEqual(expected, final_info)

        return final_info
# --- Code example #13 ---
    def test_exportViewForArgs_detectRowDeletionView_noLowerBound(
            self) -> None:
        """For a row-deletion-detection view with no lower bound, exporting
        marks the metadata row exported without any BigQuery work."""
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(
            region, is_detect_row_deletion_view=True)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=_DATE_2,
        )

        # Seed a metadata row matching the args, not yet exported.
        session = SessionFactory.for_schema_base(OperationsBase)
        metadata = schema.DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag=export_args.ingest_view_name,
            normalized_file_name="normalized_file_name",
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=None,
            datetimes_contained_lower_bound_exclusive=export_args.
            upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=export_args.
            upper_bound_datetime_to_export,
        )
        # Same row, but with export_time stamped at the frozen time below.
        expected_metadata = attr.evolve(self.to_entity(metadata),
                                        export_time=_DATE_4)

        session.add(metadata)
        session.commit()
        session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        # Assert: no BigQuery queries, GCS exports, or table deletions happened.
        self.mock_client.run_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called(
        )
        self.mock_client.delete_table.assert_not_called()

        assert_session = SessionFactory.for_schema_base(OperationsBase)
        found_metadata = self.to_entity(
            one(
                assert_session.query(
                    schema.DirectIngestIngestFileMetadata).all()))
        self.assertEqual(expected_metadata, found_metadata)
        assert_session.close()
# --- Code example #14 ---
    def test_exportViewForArgs_noLowerBound(self):
        """With no lower bound, the export materializes a single upper-bound
        query table, exports it to GCS, drops the temp table, and stamps
        export_time on the metadata row."""
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(region)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='ingest_view',
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=_DATE_2)

        # Seed a not-yet-exported metadata row matching the args.
        session = SessionFactory.for_schema_base(OperationsBase)
        metadata = schema.DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag=export_args.ingest_view_name,
            normalized_file_name='normalized_file_name',
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=None,
            datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
        )
        # Same row, but with export_time stamped at the frozen time below.
        expected_metadata = attr.evolve(self.to_entity(metadata), export_time=_DATE_4)

        session.add(metadata)
        session.commit()
        session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.create_table_from_query_async.assert_has_calls([
            mock.call(
                dataset_id='us_xx_ingest_views',
                overwrite=True,
                query=mock.ANY,
                query_parameters=[self.generate_query_params_for_date(export_args.upper_bound_datetime_to_export)],
                table_id='ingest_view_2020_07_20_00_00_00_upper_bound'),
        ])
        expected_query = \
            'SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound` ' \
            'ORDER BY colA, colC;'
        self.assert_exported_to_gcs_with_query(expected_query)
        # The temporary upper-bound table is cleaned up after the export.
        self.mock_client.delete_table.assert_has_calls([
            mock.call(dataset_id='us_xx_ingest_views', table_id='ingest_view_2020_07_20_00_00_00_upper_bound')])
        assert_session = SessionFactory.for_schema_base(OperationsBase)
        found_metadata = self.to_entity(one(assert_session.query(schema.DirectIngestIngestFileMetadata).all()))
        self.assertEqual(expected_metadata, found_metadata)
        assert_session.close()
    def test_create_direct_ingest_ingest_view_export_task(
            self, mock_client: mock.MagicMock,
            mock_uuid: mock.MagicMock) -> None:
        """The cloud task manager enqueues a POST task on the BQ import/export
        queue whose URL carries the region and output bucket and whose body
        carries the serialized export args."""
        # Arrange
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_ingest_view",
            output_bucket_name="my_ingest_bucket",
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
        )
        body = {
            "cloud_task_args": export_args.to_serializable(),
            "args_type": "GcsfsIngestViewExportArgs",
        }
        body_encoded = json.dumps(body).encode()
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid
        date = "2019-07-20"
        queue_path = f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}-path"

        # Expected task name: <queue>/<region>-<date>-<uuid>.
        task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + "/{}-{}-{}".format(
            _REGION.region_code, date, uuid)
        url_params = {
            "region": _REGION.region_code,
            "output_bucket": "my_ingest_bucket",
        }
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri":
                f"/direct/ingest_view_export?{urlencode(url_params)}",
                "body": body_encoded,
            },
        )

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(
        ).create_direct_ingest_ingest_view_export_task(
            _REGION, DirectIngestInstance.PRIMARY, export_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            self.mock_project_id, QUEUES_REGION,
            DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)
# --- Code example #16 ---
    def test_ingest_then_split_progression(self) -> None:
        """The file progression also completes when the ingest view file is
        split after export."""
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3),
        )

        unprocessed_path = self._make_unprocessed_path(
            "bucket/file_tag.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(export_args,
                                              self.metadata_manager,
                                              unprocessed_path,
                                              split_file=True)
# --- Code example #17 ---
    def test_getIngestViewExportTaskArgs_happy(self) -> None:
        """With one prior export and one newly discovered raw file, exactly one
        new export job is generated, spanning from the last export's upper
        bound (_DATE_1) to the new file's upper bound (_DATE_2)."""
        # Arrange
        region = self.create_fake_region(ingest_view_exports_enabled=True)
        export_manager = self.create_export_manager(region)
        # The last completed export for the view covered data up to _DATE_1.
        export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
            return_value=DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag="ingest_view",
                normalized_file_name="normalized_file_name",
                processed_time=_DATE_1,
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_1,
                export_time=_DATE_1,
                datetimes_contained_lower_bound_exclusive=_DATE_1,
                datetimes_contained_upper_bound_inclusive=_DATE_1,
                discovery_time=_DATE_1,
            ))
        # One raw file discovered since then, containing data up to _DATE_2.
        export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
            return_value=[
                DirectIngestRawFileMetadata(
                    file_id=2,
                    region_code=region.region_code,
                    file_tag="ingest_view",
                    discovery_time=_DATE_2,
                    normalized_file_name=
                    "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                    processed_time=None,
                    datetimes_contained_upper_bound_inclusive=_DATE_2,
                )
            ])

        # Act
        args = export_manager.get_ingest_view_export_task_args()

        # Assert
        self.assertListEqual(
            args,
            [
                GcsfsIngestViewExportArgs(
                    ingest_view_name="ingest_view",
                    upper_bound_datetime_prev=_DATE_1,
                    upper_bound_datetime_to_export=_DATE_2,
                )
            ],
        )
# --- Code example #18 ---
    def _run_ingest_job_for_filename(self, filename: str) -> None:
        """Runs ingest for the ingest view file with the given unnormalized file name."""
        # Patch the region lookup used by entity matching to return the fake region.
        get_region_patcher = patch(
            "recidiviz.persistence.entity_matching.state."
            "base_state_matching_delegate.get_region")
        mock_get_region = get_region_patcher.start()
        mock_get_region.return_value = self._fake_region()

        environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
        environ_patcher.start()

        # The file type only applies when raw/ingest name detection is enabled.
        file_type = (GcsfsDirectIngestFileType.INGEST_VIEW
                     if self.controller.region.
                     is_raw_vs_ingest_file_name_detection_enabled() else None)

        # Guard: this harness only works against the fake filesystem.
        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.are_ingest_view_exports_enabled_in_env():
            # Export-enabled regions: register and run a real ingest view
            # export covering the last day.
            now = datetime.datetime.utcnow()
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=os.path.splitext(filename)[0],
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            # Otherwise, add a normalized fixture path to the fake filesystem.
            file_path = path_for_fixture_file(self.controller,
                                              filename,
                                              file_type=file_type,
                                              should_normalize=True)
            self.controller.fs.gcs_file_system.test_add_path(
                file_path, filename)

        # Drain all scheduled tasks so the ingest run completes.
        run_task_queues_to_empty(self.controller)

        get_region_patcher.stop()
        environ_patcher.stop()
# --- Code example #19 ---
    def test_exportViewForArgs_noExistingMetadata(self):
        """Exporting for args that have no registered metadata row raises a
        ValueError and performs no BigQuery calls."""
        fake_region_obj = self.create_fake_region()
        export_manager = self.create_export_manager(fake_region_obj)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='ingest_view',
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2)

        with pytest.raises(ValueError):
            export_manager.export_view_for_args(export_args)

        # Nothing should have been queried, exported, or deleted.
        self.mock_client.create_table_from_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
        self.mock_client.delete_table.assert_not_called()
# --- Code example #20 ---
    def test_create_direct_ingest_ingest_view_export_task(
            self, mock_client, mock_uuid):
        """The task manager enqueues a POST task on the BQ import/export queue
        with the region in the URL and the serialized export args in the body."""
        # Arrange
        project_id = 'recidiviz-456'
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='my_ingest_view',
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))
        body = {
            'cloud_task_args': export_args.to_serializable(),
            'args_type': 'GcsfsIngestViewExportArgs'
        }
        body_encoded = json.dumps(body).encode()
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid
        date = '2019-07-20'
        queue_path = _REGION.shared_queue + '-path'

        # Expected task name: <queue>/<region>-<date>-<uuid>.
        task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + '/{}-{}-{}'.format(
            _REGION.region_code, date, uuid)
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                'http_method': 'POST',
                'relative_uri':
                f'/direct/ingest_view_export?region={_REGION.region_code}',
                'body': body_encoded
            })

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(
            project_id=project_id
        ).create_direct_ingest_ingest_view_export_task(_REGION, export_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)
    def test_register_ingest_view_export_file_name_already_exists_raises(self):
        """Registering the same export file name twice against one metadata
        entity must raise a ValueError on the second registration."""
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))
        metadata = self.metadata_manager.register_ingest_file_export_job(
            export_args)

        def register() -> None:
            # Helper so the duplicate registration is literally identical to
            # the first one.
            self.metadata_manager.register_ingest_view_export_file_name(
                metadata,
                self._make_unprocessed_path(
                    'bucket/file_tag.csv',
                    GcsfsDirectIngestFileType.INGEST_VIEW))

        register()
        with self.assertRaises(ValueError):
            register()
# ---- コード例 #22 (code example #22 — scraper separator, converted to a comment) ----
    def test_exportViewForArgs_ingestViewExportsDisabled(self) -> None:
        """If ingest view exports are disabled for the region, exporting must
        raise a ValueError and leave the BigQuery client untouched."""
        disabled_region = self.create_fake_region(
            ingest_view_exports_enabled=False)
        manager = self.create_export_manager(disabled_region)
        args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2,
        )

        with pytest.raises(ValueError):
            manager.export_view_for_args(args)

        # None of the BigQuery operations should have been attempted.
        for mock_fn in (
                self.mock_client.create_dataset_if_necessary,
                self.mock_client.run_query_async,
                self.mock_client.export_query_results_to_cloud_storage,
                self.mock_client.delete_table,
        ):
            mock_fn.assert_not_called()
# ---- コード例 #23 (code example #23 — scraper separator, converted to a comment) ----
    def test_ingest_view_export(
        self,
        mock_supported: mock.MagicMock,
        mock_region: mock.MagicMock,
        mock_environment: mock.MagicMock,
    ) -> None:
        """POSTing serialized export args to /ingest_view_export should hand
        them to the region controller's do_ingest_view_export."""
        region_code = "us_xx"
        mock_supported.return_value = [region_code]
        mock_environment.return_value = "staging"
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment="staging",
                                               ingestor=mock_controller)

        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_ingest_view",
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
        )

        # Body mirrors what the cloud task manager serializes onto the queue.
        response = self.client.post(
            "/ingest_view_export",
            query_string={"region": region_code},
            headers={"X-Appengine-Cron": "test-cron"},
            data=json.dumps({
                "cloud_task_args": export_args.to_serializable(),
                "args_type": "GcsfsIngestViewExportArgs",
            }).encode(),
        )

        self.assertEqual(200, response.status_code)
        mock_controller.do_ingest_view_export.assert_called_with(export_args)
# ---- コード例 #24 (code example #24 — scraper separator, converted to a comment) ----
    def _run_ingest_job_for_filename(self, filename: str) -> None:
        """Runs ingest for the ingest view file with the given unnormalized
        file name.

        If ingest is launched for the controller's region in this environment,
        an ingest view export job is registered and exported; otherwise the
        fixture file is added directly to the fake GCS file system. In both
        cases all task queues are drained before returning.
        """
        environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
        environ_patcher.start()
        # Bug fix: the original called environ_patcher.stop() only on the
        # success path, so any exception leaked PERSIST_LOCALLY into os.environ
        # for every later test. try/finally guarantees cleanup.
        try:
            file_type = GcsfsDirectIngestFileType.INGEST_VIEW

            if not isinstance(self.controller.fs.gcs_file_system,
                              FakeGCSFileSystem):
                raise ValueError(
                    f"Controller fs must have type "
                    f"FakeGCSFileSystem. Found instead "
                    f"type [{type(self.controller.fs.gcs_file_system)}]")

            if self.controller.region.is_ingest_launched_in_env():
                # Register and export an ingest view job covering the last day.
                now = datetime.datetime.now(tz=pytz.UTC)
                yesterday = now - datetime.timedelta(days=1)
                ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                    ingest_view_name=os.path.splitext(filename)[0],
                    upper_bound_datetime_to_export=now,
                    upper_bound_datetime_prev=yesterday,
                    output_bucket_name=self.controller.ingest_bucket_path.
                    bucket_name,
                )

                self.controller.file_metadata_manager.register_ingest_file_export_job(
                    ingest_file_export_job_args)
                self.controller.ingest_view_export_manager.export_view_for_args(
                    ingest_file_export_job_args)
            else:
                # Ingest not launched: drop the fixture file straight into the
                # fake file system instead of exporting a view.
                file_path = path_for_fixture_file(self.controller,
                                                  filename,
                                                  file_type=file_type,
                                                  should_normalize=True)
                self.controller.fs.gcs_file_system.test_add_path(
                    file_path, filename)

            run_task_queues_to_empty(self.controller)
        finally:
            # Always undo the os.environ patch so a failing test cannot leak
            # PERSIST_LOCALLY into subsequent tests.
            environ_patcher.stop()
# ---- コード例 #25 (code example #25 — scraper separator, converted to a comment) ----
    def run_parse_file_test(self, expected: IngestInfo,
                            fixture_file_name: str) -> IngestInfo:
        """Runs a test that reads and parses a given fixture file. Returns the
        parsed IngestInfo object for tests to run further validations."""
        args = ingest_args_for_fixture_file(self.controller,
                                            f'{fixture_file_name}.csv')

        # The controller must be wired up with the fake GCS file system so no
        # real cloud storage is touched during the test.
        if not isinstance(self.controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(self.controller.fs)}]")

        if self.controller.region.are_ingest_view_exports_enabled_in_env():
            # Exports enabled: register and export an ingest view job whose
            # view name matches the fixture file's base name.
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=fixture_file_name,
                upper_bound_datetime_to_export=datetime.datetime.utcnow(),
                upper_bound_datetime_prev=None)

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            # Exports disabled: add the fixture path directly instead.
            self.controller.fs.test_add_path(args.file_path)

        # pylint:disable=protected-access
        fixture_contents_handle = self.controller._get_contents_handle(args)

        final_info = self.controller._parse(args, fixture_contents_handle)

        # Print both objects so a failing comparison below is easy to debug.
        print_visible_header_label('FINAL')
        print(final_info)

        print_visible_header_label('EXPECTED')
        print(expected)

        self.assertEqual(expected, final_info)

        return final_info
# ---- コード例 #26 (code example #26 — scraper separator, converted to a comment) ----
    def test_exportViewForArgs_alreadyExported(self):
        """If metadata shows the file was already exported, export_view_for_args
        is a no-op for BigQuery and the metadata row is left unchanged."""
        region = self.create_fake_region()
        manager = self.create_export_manager(region)
        args = GcsfsIngestViewExportArgs(
            ingest_view_name='ingest_view',
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2)

        # Seed the operations DB with a row that already has an export_time.
        seed_session = SessionFactory.for_schema_base(OperationsBase)
        seeded_metadata = schema.DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag=args.ingest_view_name,
            normalized_file_name='normalized_file_name',
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=_DATE_2,
            datetimes_contained_lower_bound_exclusive=args.upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=args.upper_bound_datetime_to_export
        )
        expected_metadata = self.to_entity(seeded_metadata)
        seed_session.add(seeded_metadata)
        seed_session.commit()
        seed_session.close()

        manager.export_view_for_args(args)

        # No BigQuery work should have happened ...
        self.mock_client.create_table_from_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
        self.mock_client.delete_table.assert_not_called()
        # ... and the metadata row should be exactly what we seeded.
        check_session = SessionFactory.for_schema_base(OperationsBase)
        found_metadata = self.to_entity(
            one(check_session.query(
                schema.DirectIngestIngestFileMetadata).all()))
        self.assertEqual(expected_metadata, found_metadata)
        check_session.close()
# ---- コード例 #27 (code example #27 — scraper separator, converted to a comment) ----
            for metadata in metadata_list
        ]


if __name__ == "__main__":

    # Update these variables and run to print an export query you can run in
    # the BigQuery UI.
    region_code_: str = "us_mo"
    ingest_view_name_: str = "tak001_offender_identification"
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(2020, 10, 15)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(2020, 12, 18)

    with local_project_id_override(GCP_PROJECT_STAGING):
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        collector_ = DirectIngestPreProcessedIngestViewCollector(region_, [])
        views_by_tag_ = {
            view_builder.file_tag: view_builder.build()
            for view_builder in collector_.collect_view_builders()
        }

        export_args_ = GcsfsIngestViewExportArgs(
            ingest_view_name=ingest_view_name_,
            upper_bound_datetime_prev=upper_bound_datetime_prev_,
            upper_bound_datetime_to_export=upper_bound_datetime_to_export_,
            output_bucket_name="any_bucket",
        )
        print(
            DirectIngestIngestViewExportManager.debug_query_for_args(
                views_by_tag_, export_args_))
# ---- コード例 #28 (code example #28 — scraper separator, converted to a comment) ----
    def test_exportViewForArgs_detectRowDeletionView(self) -> None:
        """Exporting a detect-row-deletion view materializes both bound tables,
        diffs lower-bound EXCEPT upper-bound, cleans up both temp tables, and
        stamps export_time on the metadata row."""
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(
            region, is_detect_row_deletion_view=True)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2,
        )

        # Seed a metadata row with export_time=None so the export proceeds.
        session = SessionFactory.for_schema_base(OperationsBase)
        metadata = schema.DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag=export_args.ingest_view_name,
            normalized_file_name="normalized_file_name",
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=None,
            datetimes_contained_lower_bound_exclusive=export_args.
            upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=export_args.
            upper_bound_datetime_to_export,
        )
        # After export the row should be identical except export_time=_DATE_4
        # (the frozen "now" below).
        expected_metadata = attr.evolve(self.to_entity(metadata),
                                        export_time=_DATE_4)
        session.add(metadata)
        session.commit()
        session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        expected_upper_bound_query = _DATE_2_UPPER_BOUND_CREATE_TABLE_SCRIPT
        expected_lower_bound_query = expected_upper_bound_query.replace(
            "2020_07_20_00_00_00_upper_bound",
            "2019_07_20_00_00_00_lower_bound")

        # Assert
        # Both bound tables are materialized, upper bound first.
        self.mock_client.run_query_async.assert_has_calls([
            mock.call(
                query_str=expected_upper_bound_query,
                query_parameters=[
                    self.generate_query_params_for_date(_DATE_2)
                ],
            ),
            mock.call(
                query_str=expected_lower_bound_query,
                query_parameters=[
                    self.generate_query_params_for_date(_DATE_1)
                ],
            ),
        ])
        # Lower bound is the first part of the subquery, not upper bound.
        expected_query = (
            "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2019_07_20_00_00_00_lower_bound`) "
            "EXCEPT DISTINCT "
            "(SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound`)"
            "ORDER BY colA, colC;")
        self.assert_exported_to_gcs_with_query(expected_query)
        # Both temp tables must be deleted after the export completes.
        self.mock_client.delete_table.assert_has_calls([
            mock.call(
                dataset_id="us_xx_ingest_views",
                table_id="ingest_view_2020_07_20_00_00_00_upper_bound",
            ),
            mock.call(
                dataset_id="us_xx_ingest_views",
                table_id="ingest_view_2019_07_20_00_00_00_lower_bound",
            ),
        ])

        # The single metadata row in the DB now matches the expected entity.
        assert_session = SessionFactory.for_schema_base(OperationsBase)
        found_metadata = self.to_entity(
            one(
                assert_session.query(
                    schema.DirectIngestIngestFileMetadata).all()))
        self.assertEqual(expected_metadata, found_metadata)
        assert_session.close()
# ---- コード例 #29 (code example #29 — scraper separator, converted to a comment) ----
    def test_debugQueryForArgs(self) -> None:
        """debug_query_for_args should render one runnable script: two CREATE
        TEMP TABLE statements (upper- and lower-bound materializations of the
        view) followed by an EXCEPT DISTINCT diff of the two temp tables."""
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(region)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2,
        )

        # Act
        # Freeze time so any timestamp-derived parts of the query are stable.
        with freeze_time(_DATE_4.isoformat()):
            debug_query = DirectIngestIngestViewExportManager.debug_query_for_args(
                export_manager.ingest_views_by_tag, export_args)

        # The expected script, verbatim; it is left-aligned because it lives
        # inside a triple-quoted string literal.
        expected_debug_query = """CREATE TEMP TABLE ingest_view_2020_07_20_00_00_00_upper_bound AS (

WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT
            col_name_1a, col_name_1b,
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE
            update_datetime <= DATETIME(2020, 7, 20, 0, 0, 0)
    )

    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
),
tagFullHistoricalExport_generated_view AS (
    WITH max_update_datetime AS (
        SELECT
            MAX(update_datetime) AS update_datetime
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime <= DATETIME(2020, 7, 20, 0, 0, 0)
    ),
    max_file_id AS (
        SELECT
            MAX(file_id) AS file_id
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime = (SELECT update_datetime FROM max_update_datetime)
    ),
    rows_with_recency_rank AS (
        SELECT
            COL_1,
            ROW_NUMBER() OVER (PARTITION BY COL_1
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            file_id = (SELECT file_id FROM max_file_id)
    )
    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
select * from file_tag_first_generated_view JOIN tagFullHistoricalExport_generated_view USING (COL_1)
ORDER BY colA, colC

);
CREATE TEMP TABLE ingest_view_2019_07_20_00_00_00_lower_bound AS (

WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT
            col_name_1a, col_name_1b,
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE
            update_datetime <= DATETIME(2019, 7, 20, 0, 0, 0)
    )

    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
),
tagFullHistoricalExport_generated_view AS (
    WITH max_update_datetime AS (
        SELECT
            MAX(update_datetime) AS update_datetime
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime <= DATETIME(2019, 7, 20, 0, 0, 0)
    ),
    max_file_id AS (
        SELECT
            MAX(file_id) AS file_id
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            update_datetime = (SELECT update_datetime FROM max_update_datetime)
    ),
    rows_with_recency_rank AS (
        SELECT
            COL_1,
            ROW_NUMBER() OVER (PARTITION BY COL_1
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE
            file_id = (SELECT file_id FROM max_file_id)
    )
    SELECT *
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
select * from file_tag_first_generated_view JOIN tagFullHistoricalExport_generated_view USING (COL_1)
ORDER BY colA, colC

);
(
SELECT * FROM ingest_view_2020_07_20_00_00_00_upper_bound
) EXCEPT DISTINCT (
SELECT * FROM ingest_view_2019_07_20_00_00_00_lower_bound
)
ORDER BY colA, colC;"""

        # Assert
        self.assertEqual(expected_debug_query, debug_query)
# ---- コード例 #30 (code example #30 — scraper separator, converted to a comment) ----
            )
            for metadata in metadata_list
        ]


if __name__ == "__main__":

    # Update these variables and run to print an export query you can run in the BigQuery UI
    region_code_: str = "us_mo"
    ingest_view_name_: str = "tak001_offender_identification"
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(2020, 10, 15)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(2020, 12, 18)

    # Collect the region's ingest view builders against the staging project so
    # the printed query references staging tables.
    with local_project_id_override(GCP_PROJECT_STAGING):
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        view_collector_ = DirectIngestPreProcessedIngestViewCollector(region_, [])
        views_by_tag_ = {
            builder.file_tag: builder.build()
            for builder in view_collector_.collect_view_builders()
        }

        # NOTE(review): unlike the earlier copy of this script in the file, no
        # output_bucket_name is passed here — presumably this variant predates
        # (or postdates) that field; confirm against the args class definition.
        debug_query = DirectIngestIngestViewExportManager.debug_query_for_args(
            views_by_tag_,
            GcsfsIngestViewExportArgs(
                ingest_view_name=ingest_view_name_,
                upper_bound_datetime_prev=upper_bound_datetime_prev_,
                upper_bound_datetime_to_export=upper_bound_datetime_to_export_,
            ),
        )
        print(debug_query)