Example #1
    def test_start_scrape_no_docket_item(
        self,
        mock_pubsub: Mock,
        mock_get_region: Mock,
        mock_tracker: Mock,
        mock_sessions: Mock,
        mock_task_manager: Mock,
    ) -> None:
        region = "us_nd"
        scrape_type = constants.ScrapeType.BACKGROUND
        queue_name = "us_nd_scraper"
        initial_task = "trash_it"

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = None
        mock_sessions.return_value = None

        scraper = FakeScraper(region, initial_task)
        scraper.start_scrape(scrape_type)

        mock_get_region.assert_called_with(region)
        mock_tracker.assert_called_with(ScrapeKey(region, scrape_type))
        mock_sessions.assert_called_with(ScrapeKey(region, scrape_type))
        mock_pubsub.assert_called_with(ScrapeKey(region, scrape_type),
                                       BATCH_PUBSUB_TYPE)
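        # With no docket item available, no scrape task should have been created.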
        mock_task_manager.return_value.create_scrape_task.assert_not_called()
Example #2
    def test_stop_no_session(
        self, mock_sessions, mock_task_manager, mock_region, mock_supported
    ):
        mock_sessions.return_value = None
        mock_scraper = create_autospec(BaseScraper)
        mock_region.return_value = fake_region(ingestor=mock_scraper)
        mock_supported.return_value = ["us_ca", "us_ut"]

        request_args = {
            "region": "all",
            "scrape_type": "all",
            "respect_is_stoppable": "false",
        }

        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get("/stop", query_string=request_args, headers=headers)
        assert response.status_code == 200

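        # A session lookup should happen for every region/scrape-type combination.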
        mock_sessions.assert_has_calls(
            [
                call(ScrapeKey("us_ca", constants.ScrapeType.BACKGROUND)),
                call(ScrapeKey("us_ca", constants.ScrapeType.SNAPSHOT)),
                call(ScrapeKey("us_ut", constants.ScrapeType.BACKGROUND)),
                call(ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)),
            ],
            any_order=True,
        )
        mock_scraper.stop_scrape.assert_not_called()
        mock_supported.assert_called_with(stripes=[], timezone=None)
        mock_task_manager.return_value.create_scraper_phase_task.assert_not_called()
Example #3
def test_repr() -> None:
    scrape_key = ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)

    representation = scrape_key.__repr__()

    assert (representation == "<ScrapeKey region_code: us_ut, "
            "scrape_type: ScrapeType.SNAPSHOT>")
Example #4
    def test_stop_no_session(self, mock_sessions, mock_enqueue, mock_region,
                             mock_supported, client):
        mock_sessions.return_value = []
        mock_region.return_value = fake_region()
        mock_supported.return_value = ['us_ca', 'us_ut']

        request_args = {
            'region': 'all',
            'scrape_type': 'all',
            'respect_is_stoppable': 'false'
        }

        headers = {'X-Appengine-Cron': "test-cron"}
        response = client.get('/stop',
                              query_string=request_args,
                              headers=headers)
        assert response.status_code == 200

        mock_sessions.assert_has_calls([
            call(ScrapeKey('us_ca', constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey('us_ca', constants.ScrapeType.SNAPSHOT)),
            call(ScrapeKey('us_ut', constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey('us_ut', constants.ScrapeType.SNAPSHOT))
        ],
                                       any_order=True)
        assert not mock_region.return_value.get_ingestor().stop_scrape.called
        mock_supported.assert_called_with(timezone=None)
        assert not mock_enqueue.called
Example #5
    def test_stop_no_session(self, mock_sessions, mock_task_manager,
                             mock_region, mock_supported):
        mock_sessions.return_value = None
        mock_scraper = create_autospec(BaseScraper)
        mock_region.return_value = fake_region(ingestor=mock_scraper)
        mock_supported.return_value = ['us_ca', 'us_ut']

        request_args = {
            'region': 'all',
            'scrape_type': 'all',
            'respect_is_stoppable': 'false'
        }

        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get('/stop',
                                   query_string=request_args,
                                   headers=headers)
        assert response.status_code == 200

        mock_sessions.assert_has_calls([
            call(ScrapeKey('us_ca', constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey('us_ca', constants.ScrapeType.SNAPSHOT)),
            call(ScrapeKey('us_ut', constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey('us_ut', constants.ScrapeType.SNAPSHOT))
        ])
        mock_scraper.stop_scrape.assert_not_called()
        mock_supported.assert_called_with(stripes=[], timezone=None)
        mock_task_manager.return_value.create_scraper_phase_task.assert_not_called()
Example #6
    def test_iterate_docket_item_no_matching_items(self):
        docket_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        docket.add_to_query_docket(docket_key, get_payload()).result()

        session_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)
        self.create_session(session_key)
        assert not tracker.iterate_docket_item(session_key)
Example #7
    def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                             mock_session_return):
        scrape_key1 = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        scrape_key2 = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        ii2 = ingest_info.IngestInfo()
        ii2.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME2).create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        t2 = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        mock_session_1 = mock_session_return.return_value = create_mock_session()

        batch_persistence.write(ii, scrape_key1, t)
        expected_proto = serialization.convert_ingest_info_to_proto(ii)
        batch_persistence.persist_to_database(scrape_key1.region_code,
                                              mock_session_1.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # We expect the region that we persisted to have no more ingest infos.
        ingest_infos_1 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session_1.start)
        self.assertEqual(len(ingest_infos_1), 0)

        mock_session_2 = mock_session_return.return_value = create_mock_session()

        batch_persistence.write(ii2, scrape_key2, t2)
        ingest_infos_2 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[1], mock_session_2.start)
        self.assertEqual(len(ingest_infos_2), 1)

        expected_proto = serialization.convert_ingest_info_to_proto(ii2)
        batch_persistence.persist_to_database(scrape_key2.region_code,
                                              mock_session_2.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        self.assertEqual(mock_write.call_count, 2)
Example #8
    def test_stop_respects_region_is_not_stoppable(
        self,
        mock_sessions,
        mock_close,
        mock_phase,
        mock_task_manager,
        mock_region,
        mock_supported,
    ):
        session = sessions.ScrapeSession.new(
            key=None,
            region="us_xx",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE,
        )
        mock_sessions.return_value = session
        mock_close.return_value = [session]
        mock_scraper = create_autospec(BaseScraper)
        mock_region.return_value = fake_region(ingestor=mock_scraper)
        mock_region.return_value.is_stoppable = False
        mock_supported.return_value = ["us_ca", "us_ut"]

        request_args = {"region": "all", "scrape_type": "all"}
        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get("/stop", query_string=request_args, headers=headers)
        assert response.status_code == 200

        mock_sessions.assert_has_calls(
            [
                call(ScrapeKey("us_ca", constants.ScrapeType.BACKGROUND)),
                call(ScrapeKey("us_ca", constants.ScrapeType.SNAPSHOT)),
                call(ScrapeKey("us_ut", constants.ScrapeType.BACKGROUND)),
                call(ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)),
            ]
        )
        mock_phase.assert_has_calls(
            [call(session, scrape_phase.ScrapePhase.PERSIST)] * 4
        )
        assert mock_scraper.stop_scrape.mock_calls == [
            call(constants.ScrapeType.BACKGROUND, None),
            call().__bool__(),
            call(constants.ScrapeType.SNAPSHOT, None),
            call().__bool__(),
            call(constants.ScrapeType.BACKGROUND, None),
            call().__bool__(),
            call(constants.ScrapeType.SNAPSHOT, None),
            call().__bool__(),
        ]

        mock_supported.assert_called_with(stripes=[], timezone=None)
        mock_task_manager.return_value.create_scraper_phase_task.assert_has_calls(
            [
                call(region_code="us_ca", url="/read_and_persist"),
                call(region_code="us_ut", url="/read_and_persist"),
            ],
            any_order=True,
        )
Example #9
    def test_check_for_finished_scrapers(
        self,
        mock_region: Mock,
        mock_validate_regions: Mock,
        mock_session: Mock,
        mock_task_manager: Mock,
    ) -> None:
        mock_validate_regions.return_value = ["region_x", "region_y", "region_z"]
        mock_session.side_effect = [
            # Session still in START, shouldn't be stopped
            sessions.ScrapeSession.new(
                key=None,
                region="region_x",
                scrape_type=constants.ScrapeType.BACKGROUND,
                phase=scrape_phase.ScrapePhase.START,
            ),
            # Session in SCRAPE, should be stopped
            sessions.ScrapeSession.new(
                key=None,
                region="region_y",
                scrape_type=constants.ScrapeType.BACKGROUND,
                phase=scrape_phase.ScrapePhase.SCRAPE,
            ),
            # No session, shouldn't be stopped
            None,
        ]

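        # Only region_y, whose session is in the SCRAPE phase with no tasks
        # remaining, should be told to stop.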
        fake_region_y = create_autospec(Region)
        fake_region_y.region_code = "region_y"
        fake_region_y.get_queue_name.return_value = "queue"
        mock_region.side_effect = [fake_region_y]

        mock_task_manager.return_value.list_scrape_tasks.return_value = []

        request_args = {"region": "all"}
        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get(
            "/check_finished", query_string=request_args, headers=headers
        )
        assert response.status_code == 200

        mock_validate_regions.assert_called_with(["all"])
        mock_session.assert_has_calls(
            [
                call(ScrapeKey("region_x", constants.ScrapeType.BACKGROUND)),
                call(ScrapeKey("region_y", constants.ScrapeType.BACKGROUND)),
                call(ScrapeKey("region_z", constants.ScrapeType.BACKGROUND)),
            ]
        )
        mock_region.assert_called_with("region_y")
        mock_task_manager.return_value.list_scrape_tasks.assert_called_with(
            region_code="region_y", queue_name="queue"
        )
        mock_task_manager.return_value.create_scraper_phase_task.assert_called_with(
            region_code="region_y", url="/stop"
        )
Example #10
    def test_get_new_docket_item_no_matching_items(self):
        write_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        read_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        pubsub_helper.create_topic_and_subscription(write_key,
                                                    docket.PUBSUB_TYPE)
        docket.add_to_query_docket(write_key, get_payload()).result()

        docket_item = docket.get_new_docket_item(read_key,
                                                 return_immediately=True)
        assert not docket_item
Example #11
    def test_purge_query_docket_nothing_matching(self):
        scrape_key_purge = ScrapeKey(REGIONS[0],
                                     constants.ScrapeType.BACKGROUND)
        scrape_key_add = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        pubsub_helper.create_topic_and_subscription(scrape_key_add,
                                                    docket.PUBSUB_TYPE)
        docket.add_to_query_docket(scrape_key_add, get_payload()).result()

        docket.purge_query_docket(scrape_key_purge)
        assert not docket.get_new_docket_item(scrape_key_purge,
                                              return_immediately=True)
Example #12
    def test_stop_timezone(
        self,
        mock_sessions,
        mock_close,
        mock_phase,
        mock_task_manager,
        mock_region,
        mock_supported,
    ):
        session = sessions.ScrapeSession.new(
            key=None,
            region="us_ut",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE,
        )
        mock_sessions.return_value = session
        mock_close.return_value = [session]
        mock_scraper = create_autospec(BaseScraper)
        mock_region.return_value = fake_region(ingestor=mock_scraper)
        mock_supported.side_effect = _MockSupported

        request_args = {
            "region": "all",
            "scrape_type": "all",
            "timezone": "America/New_York",
            "respect_is_stoppable": "false",
        }
        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get("/stop", query_string=request_args, headers=headers)
        assert response.status_code == 200

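        # Only us_ut matches the America/New_York timezone filter.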
        mock_sessions.assert_has_calls(
            [
                call(ScrapeKey("us_ut", constants.ScrapeType.BACKGROUND)),
                call(ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)),
            ]
        )
        mock_phase.assert_has_calls(
            [call(session, scrape_phase.ScrapePhase.PERSIST)] * 2
        )
        mock_region.assert_has_calls([call("us_ut")])
        mock_scraper.stop_scrape.assert_called_with(
            constants.ScrapeType.SNAPSHOT, "false"
        )
        mock_supported.assert_called_with(
            stripes=[], timezone=pytz.timezone("America/New_York")
        )
        mock_task_manager.return_value.create_scraper_phase_task.assert_called_with(
            region_code="us_ut", url="/read_and_persist"
        )
Example #13
    def test_stop_respects_region_is_not_stoppable(self, mock_sessions,
                                                   mock_close, mock_phase,
                                                   mock_task_manager,
                                                   mock_region,
                                                   mock_supported):
        session = sessions.ScrapeSession.new(
            key=None,
            region='us_xx',
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE)
        mock_sessions.return_value = session
        mock_close.return_value = [session]
        mock_scraper = create_autospec(BaseScraper)
        mock_region.return_value = fake_region(ingestor=mock_scraper)
        mock_region.return_value.is_stoppable = False
        mock_supported.return_value = ['us_ca', 'us_ut']

        request_args = {'region': 'all', 'scrape_type': 'all'}
        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get('/stop',
                                   query_string=request_args,
                                   headers=headers)
        assert response.status_code == 200

        mock_sessions.assert_has_calls([
            call(ScrapeKey('us_ca', constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey('us_ca', constants.ScrapeType.SNAPSHOT)),
            call(ScrapeKey('us_ut', constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey('us_ut', constants.ScrapeType.SNAPSHOT))
        ])
        mock_phase.assert_has_calls(
            [call(session, scrape_phase.ScrapePhase.PERSIST)] * 4)
        assert mock_scraper.stop_scrape.mock_calls == [
            call(constants.ScrapeType.BACKGROUND, None),
            call().__bool__(),
            call(constants.ScrapeType.SNAPSHOT, None),
            call().__bool__(),
            call(constants.ScrapeType.BACKGROUND, None),
            call().__bool__(),
            call(constants.ScrapeType.SNAPSHOT, None),
            call().__bool__(),
        ]

        mock_supported.assert_called_with(stripes=[], timezone=None)
        mock_task_manager.return_value.create_scraper_phase_task.assert_has_calls(
            [
                call(region_code='us_ca', url='/read_and_persist'),
                call(region_code='us_ut', url='/read_and_persist'),
            ], any_order=True)
Example #14
def test_check_for_finished_scrapers_not_done(mock_region,
                                              mock_validate_regions,
                                              mock_session,
                                              mock_list_scrape_tasks,
                                              mock_enqueue, client):
    region_code = 'region_x'

    mock_session.return_value = sessions.ScrapeSession.new(
        key=None,
        region=region_code,
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.SCRAPE)
    mock_validate_regions.return_value = [region_code]

    fake_region = create_autospec(Region)
    fake_region.region_code = region_code
    fake_region.get_queue_name.return_value = 'queue'
    mock_region.return_value = fake_region

    mock_list_scrape_tasks.return_value = ['fake_task']

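    # A scrape task is still pending, so the region is not finished yet.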
    request_args = {'region': 'all'}
    headers = {'X-Appengine-Cron': "test-cron"}
    response = client.get('/check_finished',
                          query_string=request_args,
                          headers=headers)
    assert response.status_code == 200

    mock_validate_regions.assert_called_with(['all'])
    mock_session.assert_called_with(
        ScrapeKey(region_code, constants.ScrapeType.BACKGROUND))
    mock_region.assert_called_with(region_code)
    mock_list_scrape_tasks.assert_called_with(region_code=region_code,
                                              queue_name='queue')
    mock_enqueue.assert_not_called()
Example #15
    def test_get_current_session(self):
        # older
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 17)))
        current = self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 18)))
        # closed
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)),
            end=fix_dt(datetime(2009, 6, 21)))
        # different scrape type
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.SNAPSHOT,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)))
        # different region
        self.create_session(
            region_code="us_fl", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)))

        result = sessions.get_current_session(
            ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND))

        assert result.to_entity() == current.to_entity()
Example #16
    def test_get_more_tasks_failure_batch(
        self,
        mock_flask: Mock,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_batch_error: Mock,
    ) -> None:
        mock_fetch.return_value = ("TEST", {})
        mock_get_more.side_effect = ValueError("TEST ERROR")
        mock_flask_get = Mock()
        mock_flask_get.return_value = "TRACE ID"
        mock_flask.headers.get = mock_flask_get

        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )
        scraper = FakeScraper("test")
        with self.assertRaises(ScraperGetMoreTasksError):
            scraper._generic_scrape(req)
        self.assertEqual(mock_batch_error.call_count, 1)

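        # The failure should have been reported once, tagged with the scrape key.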
        scrape_key = ScrapeKey(
            region_code="test", scrape_type=constants.ScrapeType.BACKGROUND
        )
        mock_batch_error.assert_called_once_with(
            error="TEST ERROR",
            trace_id="TRACE ID",
            task=TEST_TASK,
            scrape_key=scrape_key,
        )
Example #17
    def test_scrape_data_no_more_tasks_batch(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
        mock_batch_write: Mock,
    ) -> None:
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper._generic_scrape(req)

        scrape_key = ScrapeKey("test", constants.ScrapeType.BACKGROUND)
        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_batch_write.assert_called_once_with(
            ingest_info=self.ii,
            task=t,
            scrape_key=scrape_key,
        )
        self.assertEqual(len(scraper.tasks), 0)
Example #18
    def test_write_to_datastore(self, mock_session_return):
        mock_session = mock_session_return.return_value = create_mock_session()

        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        ii = ingest_info.IngestInfo()
        ii.create_person(full_name=TEST_NAME).create_booking(
            booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )
        task_hash = hash(json.dumps(t.to_serializable(), sort_keys=True))

        expected_batch = BatchIngestInfoData(ingest_info=ii,
                                             task_hash=task_hash)

        batch_persistence.write(ii, scrape_key, t)

        batch_ingest_info_list = batch_persistence._get_batch_ingest_info_list(
            scrape_key.region_code, mock_session.start)

        self.assertEqual(len(batch_ingest_info_list), 1)
        self.assertEqual(expected_batch, batch_ingest_info_list[0])
Example #19
    def test_load_target_list_full_names(self, mock_region: Mock) -> None:
        mock_region.return_value.names_file = (
            "../recidiviz/tests/ingest/testdata/docket/names/last_and_first.csv"
        )
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        docket.load_target_list(scrape_key)

        names = []
        for _ in range(8):
            item = docket.get_new_docket_item(scrape_key)
            assert item is not None
            name_serialized = item.message.data.decode()
            names.append(json.loads(name_serialized))
        assert names == [
            ["Smith", "James"],
            ["Smith", "Michael"],
            ["Smith", "Robert"],
            ["Smith", "David"],
            ["Johnson", "James"],
            ["Johnson", "Michael"],
            ["Smith", "William"],
            ["Williams", "James"],
        ]
        assert not docket.get_new_docket_item(scrape_key)
Example #20
    def test_persist_to_db(self, mock_write, _mock_region,
                           mock_session_return):
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

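        # Persist the batch and verify the proto handed to the database writer.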
        batch_persistence.write(ii, scrape_key, t)

        expected_proto = serialization.convert_ingest_info_to_proto(ii)

        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # After we persist, there should no longer be ingest infos on Datastore
        ingest_infos = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session.start)
        self.assertEqual(len(ingest_infos), 0)
Example #21
    def test_get_recent_sessions(self):
        first = self.create_session(
            region_code="us_ny",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 17)),
        )
        # different scrape type
        self.create_session(
            region_code="us_ny",
            scrape_type=constants.ScrapeType.SNAPSHOT,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 18)),
        )
        third = self.create_session(
            region_code="us_ny",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)),
            end=fix_dt(datetime(2009, 6, 21)),
        )
        # different region, scrape type
        self.create_session(
            region_code="us_fl",
            scrape_type=constants.ScrapeType.SNAPSHOT,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)),
        )

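        # Only us_ny BACKGROUND sessions should be returned, most recent first.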
        results = sessions.get_recent_sessions(
            ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND)
        )
        assert to_entities(results) == to_entities([third, first])
Example #22
    def test_create_session_with_existing(self, mock_datetime, mock_client, mock_query):
        mock_datetime.now.return_value = fixed_now

        existing_session = ScrapeSession.new(
            key=datastore.key.Key("session", "existing", project=0),
            start=fixed_now,
            scrape_type=constants.ScrapeType.BACKGROUND,
            region="us_ny",
            phase=scrape_phase.ScrapePhase.START,
        )
        new_key = datastore.key.Key("session", "new", project=0)
        new_session = ScrapeSession.new(
            key=new_key,
            start=fixed_now,
            scrape_type=constants.ScrapeType.BACKGROUND,
            region="us_wy",
            phase=scrape_phase.ScrapePhase.START,
        )

        client = mock_client.return_value
        client.key.return_value = new_key
        wire_sessions_to_query(mock_client, mock_query, [existing_session])

        scrape_key = ScrapeKey("us_wy", constants.ScrapeType.BACKGROUND)
        sessions.create_session(scrape_key)

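        # The existing session should be closed and both sessions persisted.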
        existing_session.end = fixed_now
        client.put.assert_any_call(existing_session.to_entity())
        client.put.assert_any_call(new_session.to_entity())
        assert client.put.call_count == 2
Example #23
    def teardown_method(self, _test_method):
        for region in REGIONS:
            docket.purge_query_docket(
                ScrapeKey(region, constants.ScrapeType.BACKGROUND)
            )
        sessions.ds().delete_multi(self.sessions_to_delete)
        self.project_id_patcher.stop()
Example #24
    def start_scrape(self, scrape_type):
        """Start new scrape session / query against corrections site

        Retrieves first docket item, enqueues task for initial search
        page scrape to start the new scraping session.

        Args:
            scrape_type: (ScrapeType) The type of scrape to start

        Returns:
            N/A

        """
        docket_item = self.iterate_docket_item(scrape_type)
        scrape_key = ScrapeKey(self.get_region().region_code, scrape_type)
        # Ensure that the topic and subscription are created on start.
        pubsub_helper.create_topic_and_subscription(scrape_key, BATCH_PUBSUB_TYPE)
        if not docket_item:
            logging.error(
                "Found no %s docket items for %s, shutting down.",
                scrape_type,
                self.get_region().region_code,
            )
            sessions.close_session(scrape_key)
            return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
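
A minimal usage sketch (hypothetical: FakeScraper and the region/task names are borrowed from Example #1 rather than defined here):

    scraper = FakeScraper("us_nd", "initial_task")
    scraper.start_scrape(constants.ScrapeType.BACKGROUND)
    # With an empty docket, start_scrape logs an error and closes the session;
    # otherwise it enqueues the initial task from get_initial_task().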
Example #25
    def test_persist_duplicates_to_db(self, mock_write, _mock_region,
                                      mock_session_return):
        """Tests that duplicate ingest_info.Person objects are merged before
        write."""
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        # Arrange
        ii = IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        ii_2 = IngestInfo()
        ii_2.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

        ii_1_dup = copy.deepcopy(ii)

        t1, t2, t3 = (Task(task_type=constants.TaskType.SCRAPE_DATA,
                           endpoint=TEST_ENDPOINT + str(i),
                           response_type=constants.ResponseType.TEXT)
                      for i in range(3))

        batch_persistence.write(ii, scrape_key, t1)
        batch_persistence.write(ii_2, scrape_key, t2)
        batch_persistence.write(ii_1_dup, scrape_key, t3)

        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start)

        expected_ii = IngestInfo(people=ii.people + ii_2.people)
        expected_proto = ingest_utils.convert_ingest_info_to_proto(expected_ii)
        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)
Example #26
    def test_load_target_list_last_names(self, mock_region):
        mock_region.return_value.names_file = \
            '../recidiviz/tests/ingest/testdata/docket/names/last_only.csv'
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        docket.load_target_list(scrape_key)

        names = []
        for _ in range(12):
            item = docket.get_new_docket_item(scrape_key)
            name_serialized = item.message.data.decode()
            names.append(json.loads(name_serialized))
        assert names == [
            ['SMITH', ''],
            ['JOHNSON', ''],
            ['WILLIAMS', ''],
            ['BROWN', ''],
            ['JONES', ''],
            ['MILLER', ''],
            ['DAVIS', ''],
            ['GARCIA', ''],
            ['RODRIGUEZ', ''],
            ['WILSON', ''],
            ['MARTINEZ', ''],
            ['ANDERSON', ''],
        ]
        assert not docket.get_new_docket_item(scrape_key)
Example #27
    def test_get_sessions_with_leased_none_for_scrape_type(self):
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2016, 11, 20)), docket_ack_id="a")
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2016, 11, 20)), docket_ack_id="b")
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.SNAPSHOT,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2016, 11, 20)), docket_ack_id="c")
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2016, 11, 20)), docket_ack_id=None)
        self.create_session(
            region_code="us_fl", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2016, 11, 20)), docket_ack_id="d")

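        # No us_fl SNAPSHOT session holds a leased docket item.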
        results = sessions.get_sessions_with_leased_docket_items(
            ScrapeKey("us_fl", constants.ScrapeType.SNAPSHOT))
        assert not to_entities(results)
Example #28
    def test_iterate_docket_item_no_matching_items(self, mock_docket):
        mock_docket.return_value = None

        payload = tracker.iterate_docket_item(
            ScrapeKey("us_fl", constants.ScrapeType.BACKGROUND)
        )
        assert not payload
Example #29
    def test_load_target_list_last_names(self, mock_region):
        mock_region.return_value.names_file = (
            "../recidiviz/tests/ingest/testdata/docket/names/last_only.csv")
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        docket.load_target_list(scrape_key)

        names = []
        for _ in range(12):
            item = docket.get_new_docket_item(scrape_key)
            name_serialized = item.message.data.decode()
            names.append(json.loads(name_serialized))
        assert names == [
            ["SMITH", ""],
            ["JOHNSON", ""],
            ["WILLIAMS", ""],
            ["BROWN", ""],
            ["JONES", ""],
            ["MILLER", ""],
            ["DAVIS", ""],
            ["GARCIA", ""],
            ["RODRIGUEZ", ""],
            ["WILSON", ""],
            ["MARTINEZ", ""],
            ["ANDERSON", ""],
        ]
        assert not docket.get_new_docket_item(scrape_key)
Example #30
    def test_add_item_happy_path(self, mock_client, mock_query):
        current_session_key = datastore.key.Key("session", "current", project=0)
        current_session_vars = {
            "region": "us_va",
            "scrape_type": constants.ScrapeType.SNAPSHOT,
            "phase": scrape_phase.ScrapePhase.START,
            "start": fix_dt(datetime(2014, 8, 31)),
        }
        current_session = ScrapeSession.new(current_session_key, **current_session_vars)
        prior_session = ScrapeSession.new(
            datastore.key.Key("session", "prior", project=0),
            region="us_ny",
            scrape_type=constants.ScrapeType.SNAPSHOT,
            start=fix_dt(datetime(2014, 8, 17)),
            phase=scrape_phase.ScrapePhase.SCRAPE,
        )

        wire_sessions_to_query(
            mock_client, mock_query, [current_session, prior_session]
        )

        assert sessions.add_docket_item_to_current_session(
            "alpha", ScrapeKey("us_va", constants.ScrapeType.SNAPSHOT)
        )

        current_session_vars.update({"docket_ack_id": "alpha"})
        expected_session = ScrapeSession.new(
            current_session_key, **current_session_vars
        )
        mock_client.return_value.put.assert_called_with(expected_session.to_entity())