Example #1
    def test_scrape_data_and_more_no_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
            ingest_info=self.ii
        )]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #2
    def test_get_more_multiple_tasks_returned(
        self, mock_get_more: Mock, mock_fetch: Mock
    ) -> None:
        mock_get_more.return_value = [TEST_TASK, TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, None)
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=TEST_TASK,
                scraper_start_time=start_time,
            )
        ] * 2

        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #3
    def test_fetch_sends_all_args(self, mock_get_more, mock_fetch):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, None)
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, headers='TEST_HEADERS', cookies='TEST_COOKIES',
            params='TEST_PARAMS', post_data='TEST_POST', json='TEST_JSON'
        )
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )]

        mock_fetch.assert_called_once_with(
            t.endpoint, t.response_type, headers=t.headers, cookies=t.cookies,
            params=t.params, post_data=t.post_data, json_data=t.json
        )
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #4
    def test_get_more_and_updates_cookies(
        self, mock_get_more: Mock, mock_fetch: Mock
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {1: 1})
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )

        t = Task.evolve(TEST_TASK, cookies={1: 1})

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=t,
                scraper_start_time=start_time,
            )
        ]

        self.assertCountEqual(expected_tasks, scraper.tasks)
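
For reference, the behavior verified above is the cookie-merge rule in _generic_scrape (Example #23): cookies returned by the fetch are merged with each next task's own cookies, and the merged dict is attached via Task.evolve. A minimal sketch of that rule, assuming plain-dict cookies (as in the {1: 1} fixture) and that TEST_TASK.cookies defaults to an empty dict:

response_cookies = {1: 1}  # second element of mock_fetch's return value
if response_cookies:
    # The next task's own cookies take precedence on key conflicts.
    response_cookies.update(TEST_TASK.cookies)
    merged_task = Task.evolve(TEST_TASK, cookies=response_cookies)
# merged_task matches the `t` built above with Task.evolve(TEST_TASK, cookies={1: 1})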
Example #5
    def test_scrape_data_and_more_no_persist_second_time_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        populate_task = Task.evolve(TEST_TASK,
                                    task_type=constants.TaskType.SCRAPE_DATA)
        mock_get_more.return_value = [populate_task]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK,
                        task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=populate_task,
                scraper_start_time=start_time,
                ingest_info=self.ii,
            )
        ]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)

        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        scraper._generic_scrape(scraper.tasks[0])
        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 2)
        self.assertEqual(mock_write.call_count, 1)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
Example #6
    def test_scrape_data_and_more_yes_persist(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should not send the ii since we chose to persist.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=TEST_TASK,
                scraper_start_time=start_time,
            )
        ]
        expected_metadata = IngestMetadata(
            region=scraper.region.region_code,
            jurisdiction_id=scraper.region.jurisdiction_id,
            ingest_time=start_time,
            enum_overrides=scraper.get_enum_overrides(),
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #7
    def test_scrape_data_no_more_tasks(self, mock_get_more, mock_fetch,
                                       mock_populate, mock_write):
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertEqual(len(scraper.tasks), 0)
Example #8
    def test_scrape_data_no_more_tasks_batch(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
        mock_batch_write: Mock,
    ) -> None:
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper._generic_scrape(req)

        scrape_key = ScrapeKey("test", constants.ScrapeType.BACKGROUND)
        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_batch_write.assert_called_once_with(
            ingest_info=self.ii,
            task=t,
            scrape_key=scrape_key,
        )
        self.assertEqual(len(scraper.tasks), 0)
Example #9
    def test_get_more_tasks_failure_batch(
        self,
        mock_flask: Mock,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_batch_error: Mock,
    ) -> None:
        mock_fetch.return_value = ("TEST", {})
        mock_get_more.side_effect = ValueError("TEST ERROR")
        mock_flask_get = Mock()
        mock_flask_get.return_value = "TRACE ID"
        mock_flask.headers.get = mock_flask_get

        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )
        scraper = FakeScraper("test")
        with self.assertRaises(ScraperGetMoreTasksError):
            scraper._generic_scrape(req)
        self.assertEqual(mock_batch_error.call_count, 1)

        scrape_key = ScrapeKey(
            region_code="test", scrape_type=constants.ScrapeType.BACKGROUND
        )
        mock_batch_error.assert_called_once_with(
            error="TEST ERROR",
            trace_id="TRACE ID",
            task=TEST_TASK,
            scrape_key=scrape_key,
        )
Example #10
    def start_scrape(self, scrape_type):
        """Start new scrape session / query against corrections site

        Retrieves first docket item, enqueues task for initial search
        page scrape to start the new scraping session.

        Args:
            scrape_type: (ScrapeType) The type of scrape to start

        Returns:
            N/A

        """
        docket_item = self.iterate_docket_item(scrape_type)
        scrape_key = ScrapeKey(self.get_region().region_code, scrape_type)
        # Ensure that the topic and subscription are created on start.
        pubsub_helper.create_topic_and_subscription(scrape_key, BATCH_PUBSUB_TYPE)
        if not docket_item:
            logging.error(
                "Found no %s docket items for %s, shutting down.",
                scrape_type,
                self.get_region().region_code,
            )
            sessions.close_session(scrape_key)
            return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
Example #11
    def test_resume_scrape_snapshot(self, mock_get_region, mock_tracker,
                                    mock_task_manager, mock_datetime):
        docket_item = (41620, ['daft', 'punk'])
        region = 'us_nd'
        scrape_type = constants.ScrapeType.SNAPSHOT
        queue_name = 'us_nd_scraper'
        initial_task = 'press_it'

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.resume_scrape(scrape_type)

        mock_get_region.assert_called_with(region)

        queue_params = QueueRequest(
            scrape_type=scrape_type.value,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=(83240, ['dagt', 'punk']),
        )
        request_body = {
            'region': region,
            'task': initial_task,
            'params': queue_params.to_serializable()
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body)
Example #12
    def test_resume_scrape_background(
        self,
        mock_get_region: Mock,
        mock_sessions: Mock,
        mock_task_manager: Mock,
        mock_datetime: Mock,
    ) -> None:
        """Tests the resume_scrape flow for background scraping."""
        region = "us_nd"
        scrape_type = constants.ScrapeType.BACKGROUND
        queue_name = "us_nd_scraper"
        initial_task = "charge_it"

        mock_get_region.return_value = mock_region(region, queue_name)
        recent_session_none_scraped = ScrapeSession.new(
            key=None,
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE,
        )
        recent_session = ScrapeSession.new(
            key=None,
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE,
            last_scraped="Bangalter, Thomas",
        )
        mock_sessions.return_value = [
            recent_session_none_scraped, recent_session
        ]
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.resume_scrape(scrape_type)

        mock_get_region.assert_called_with(region)
        mock_sessions.assert_called_with(ScrapeKey(region, scrape_type))

        queue_params = QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=['Bangalter', 'Thomas'],
        )
        request_body = {
            "region": region,
            "task": initial_task,
            "params": queue_params.to_serializable(),
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body,
        )
Example #13
def start_scrape(queue, self, scrape_type):
    add_task(
        queue,
        self,
        self.get_initial_task_method(),
        QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=datetime.now(),
            next_task=self.get_initial_task(),
        ),
    )
Example #14
    def test_fetch_failure(self, mock_fetch: Mock) -> None:
        mock_fetch.side_effect = ValueError("TEST ERROR")

        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=datetime.datetime.now(),
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        with self.assertRaises(ScraperFetchError):
            scraper._generic_scrape(req)
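
As a reminder, assigning an exception instance to a Mock's side_effect makes the mock raise it when called; return_value alone would merely hand the exception object back to the caller. A self-contained two-liner:

from mock import Mock

failing_fetch = Mock(side_effect=ValueError("TEST ERROR"))
failing_fetch()  # raises ValueError("TEST ERROR")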
Example #15
def start_scrape(
    queue: deque, self: Scraper, scrape_type: constants.ScrapeType
) -> None:
    add_task(
        queue,
        self,
        self.get_initial_task_method(),
        QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=datetime.now(),
            next_task=self.get_initial_task(),
        ),
    )
Example #16
    def test_get_more_tasks_failure(self, mock_get_more, mock_fetch):
        mock_fetch.return_value = ("TEST", {})
        mock_get_more.side_effect = ValueError("TEST ERROR")

        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=datetime.datetime.now(),
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        with self.assertRaises(ScraperGetMoreTasksError):
            scraper._generic_scrape(req)
Example #17
    def test_create_scrape_task(self, mock_client: Mock,
                                mock_uuid: Mock) -> None:
        # Arrange
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid

        region_code = "us_ca_san_francisco"
        project_id = "recidiviz-456"

        queue_name = "test-queue-name"
        queue_path = f"queue_path/{project_id}/{QUEUES_REGION}/{queue_name}"
        task_id = "us_ca_san_francisco-random-uuid"
        task_path = f"{queue_path}/{task_id}"
        url = "/my_scrape/task"

        body = {
            "region":
            region_code,
            "params":
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint="www.google.com"),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now(),
            ).to_serializable(),
        }
        task = tasks_v2.types.task_pb2.Task(
            name=task_path,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri": url,
                "body": json.dumps(body).encode(),
            },
        )

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id).create_scrape_task(
            region_code=region_code, queue_name=queue_name, url=url, body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)
Example #18
    def test_create_scrape_task(self, mock_client, mock_uuid):
        # Arrange
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid

        region_code = 'us_ca_san_francisco'
        project_id = 'recidiviz-456'

        queue_name = 'test-queue-name'
        queue_path = f'queue_path/{project_id}/{QUEUES_REGION}/{queue_name}'
        task_id = 'us_ca_san_francisco-random-uuid'
        task_path = f'{queue_path}/{task_id}'
        url = '/my_scrape/task'

        body = {
            'region':
            region_code,
            'params':
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint='www.google.com'),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now()).to_serializable()
        }
        task = tasks_v2.types.task_pb2.Task(name=task_path,
                                            app_engine_http_request={
                                                'http_method': 'POST',
                                                'relative_uri': url,
                                                'body':
                                                json.dumps(body).encode()
                                            })

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id). \
            create_scrape_task(region_code=region_code,
                               queue_name=queue_name,
                               url=url,
                               body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            queue_path, task)
Example #19
    def test_content_no_fetch(self, mock_get_more: Mock, mock_fetch: Mock) -> None:
        t = Task.evolve(TEST_TASK, content=TEST_HTML)
        mock_get_more.return_value = [t]
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [req]

        self.assertEqual(mock_fetch.call_count, 0)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #20
    def test_start_scrape_background(
        self,
        mock_pubsub: Mock,
        mock_get_region: Mock,
        mock_tracker: Mock,
        mock_task_manager: Mock,
        mock_datetime: Mock,
    ) -> None:
        docket_item = ("Dog", "Cat")
        region = "us_nd"
        scrape_type = constants.ScrapeType.BACKGROUND
        queue_name = "us_nd_scraper"
        initial_task = "use_it"

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.start_scrape(scrape_type)

        mock_get_region.assert_called_with(region)
        mock_tracker.assert_called_with(ScrapeKey(region, scrape_type))
        mock_pubsub.assert_called_with(ScrapeKey(region, scrape_type),
                                       BATCH_PUBSUB_TYPE)

        queue_params = QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=docket_item,
        )
        request_body = {
            "region": region,
            "task": initial_task,
            "params": queue_params.to_serializable(),
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body,
        )
Example #21
    def test_resume_scrape_snapshot(
        self,
        mock_get_region: Mock,
        mock_tracker: Mock,
        mock_task_manager: Mock,
        mock_datetime: Mock,
    ) -> None:
        docket_item = (41620, ["daft", "punk"])
        region = "us_nd"
        scrape_type = constants.ScrapeType.SNAPSHOT
        queue_name = "us_nd_scraper"
        initial_task = "press_it"

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.resume_scrape(scrape_type)

        mock_get_region.assert_called_with(region)

        queue_params = QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=(83240, ['dagt', 'punk']),
        )
        request_body = {
            "region": region,
            "task": initial_task,
            "params": queue_params.to_serializable(),
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body,
        )
Example #22
    def test_start_scrape_background(self, mock_pubsub, mock_get_region,
                                     mock_tracker, mock_task_manager,
                                     mock_datetime):
        docket_item = ('Dog', 'Cat')
        region = 'us_nd'
        scrape_type = constants.ScrapeType.BACKGROUND
        queue_name = 'us_nd_scraper'
        initial_task = 'use_it'

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.start_scrape(scrape_type)

        mock_get_region.assert_called_with(region)
        mock_tracker.assert_called_with(ScrapeKey(region, scrape_type))
        mock_pubsub.assert_called_with(ScrapeKey(region, scrape_type),
                                       BATCH_PUBSUB_TYPE)

        queue_params = QueueRequest(
            scrape_type=scrape_type.value,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=docket_item,
        )
        request_body = {
            'region': region,
            'task': initial_task,
            'params': queue_params.to_serializable()
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body)
Example #23
    def _generic_scrape(self, request: QueueRequest):
        """
        General handler for all scrape tasks.  This function is a generic entry
        point into all types of scrapes.  It decides what to call based on
        params.

        Args:
            params: dict of parameters passed from the last scrape session.
        """
        try:
            task = request.next_task

            # Here we handle a special case where we weren't really sure
            # we were going to get data when we submitted a task, but then
            # we ended up with data, so no more requests are required,
            # just the content we already have.
            # TODO(#680): remove this
            if task.content is not None:
                content = self._parse_html_content(task.content)
                cookies = None
            else:
                post_data = task.post_data

                # Let the child transform the post_data if it wants before
                # sending the requests.  This hook is in here in case the
                # child did something like compress the post_data before
                # it put it on the queue.
                self.transform_post_data(post_data)

                # We always fetch some content before doing anything.
                # post_data defaults to None if the scraper doesn't set it.
                try:
                    content, cookies = self._fetch_content(
                        task.endpoint,
                        task.response_type,
                        headers=task.headers,
                        cookies=task.cookies,
                        params=task.params,
                        post_data=post_data,
                        json_data=task.json)
                except Exception as e:
                    raise ScraperFetchError(str(e)) from e

            scraped_data = None
            if self.should_scrape_data(task.task_type):
                # If we want to scrape data, we should either create an
                # ingest_info object or get the one that already exists.
                logging.info("Scraping data for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)
                try:
                    scraped_data = self.populate_data(
                        content, task, request.ingest_info or IngestInfo())
                except Exception as e:
                    raise ScraperPopulateDataError(str(e)) from e

            if self.should_get_more_tasks(task.task_type):
                logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)

                # Only send along ingest info if it will not be persisted now.
                ingest_info_to_send = None
                if scraped_data is not None and not scraped_data.persist:
                    ingest_info_to_send = scraped_data.ingest_info

                try:
                    # pylint: disable=assignment-from-no-return
                    next_tasks = self.get_more_tasks(content, task)
                except Exception as e:
                    raise ScraperGetMoreTasksError(str(e)) from e
                for next_task in next_tasks:
                    # Include cookies received from response, if any
                    if cookies:
                        cookies.update(next_task.cookies)
                        next_task = Task.evolve(next_task, cookies=cookies)
                    self.add_task(
                        '_generic_scrape',
                        QueueRequest(
                            scrape_type=request.scrape_type,
                            scraper_start_time=request.scraper_start_time,
                            next_task=next_task,
                            ingest_info=ingest_info_to_send,
                        ))

            if scraped_data is not None and scraped_data.persist:
                if scraped_data.ingest_info:
                    logging.info("Logging at most 4 people (were %d):",
                                 len(scraped_data.ingest_info.people))
                    loop_count = min(len(scraped_data.ingest_info.people),
                                     constants.MAX_PEOPLE_TO_LOG)
                    for i in range(loop_count):
                        logging.info("[%s]",
                                     str(scraped_data.ingest_info.people[i]))
                    logging.info("Last seen time of person being set as: [%s]",
                                 request.scraper_start_time)
                    metadata = IngestMetadata(self.region.region_code,
                                              self.region.jurisdiction_id,
                                              request.scraper_start_time,
                                              self.get_enum_overrides())
                    if self.BATCH_WRITES:
                        logging.info(
                            "Queuing ingest_info ([%d] people) to "
                            "batch_persistence for [%s]",
                            len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        scrape_key = ScrapeKey(self.region.region_code,
                                               request.scrape_type)
                        batch_persistence.write(
                            ingest_info=scraped_data.ingest_info,
                            scrape_key=scrape_key,
                            task=task,
                        )
                    else:
                        logging.info(
                            "Writing ingest_info ([%d] people) to the database"
                            " for [%s]", len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        persistence.write(
                            ingest_utils.convert_ingest_info_to_proto(
                                scraped_data.ingest_info), metadata)
                for sc in scraped_data.single_counts:
                    if not sc.date:
                        scrape_key = ScrapeKey(self.region.region_code,
                                               constants.ScrapeType.BACKGROUND)
                        session = sessions.get_current_session(scrape_key)
                        if session:
                            sc = attr.evolve(sc, date=session.start.date())
                    single_count.store_single_count(
                        sc, self.region.jurisdiction_id)
        except Exception as e:
            if self.BATCH_WRITES:
                scrape_key = ScrapeKey(self.region.region_code,
                                       request.scrape_type)
                batch_persistence.write_error(
                    error=str(e),
                    trace_id=get_trace_id_from_flask(),
                    task=task,
                    scrape_key=scrape_key,
                )
            raise e
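
To make the flow above concrete, here is a minimal sketch of the kind of child scraper _generic_scrape drives. The class name, endpoint, roster XPath, and the exact Scraper module path are illustrative assumptions; populate_data and get_more_tasks are the hooks the handler above actually calls:

from recidiviz.ingest.scrape import constants
from recidiviz.ingest.scrape.scraper import Scraper  # assumed module path
from recidiviz.ingest.scrape.task_params import ScrapedData, Task


class RosterScraper(Scraper):  # hypothetical example scraper
    def get_initial_task(self) -> Task:
        # SCRAPE_DATA_AND_MORE runs both populate_data and get_more_tasks
        # on the fetched content.
        return Task(
            task_type=constants.TaskType.SCRAPE_DATA_AND_MORE,
            endpoint="https://example.com/roster",  # placeholder endpoint
        )

    def get_more_tasks(self, content, task):
        # `content` is the parsed HTML tree; queue one SCRAPE_DATA task
        # per detail-page link found on the roster page.
        return [
            Task(task_type=constants.TaskType.SCRAPE_DATA, endpoint=href)
            for href in content.xpath("//a[@class='detail']/@href")
        ]

    def populate_data(self, content, task, ingest_info):
        # Fill the IngestInfo carried on the request (or a fresh one) and
        # let _generic_scrape persist it; persist=False would instead keep
        # it on the next QueueRequest, as the tests above exercise.
        ingest_info.create_person(person_id=task.endpoint)
        return ScrapedData(ingest_info=ingest_info, persist=True)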
Example #24
import datetime
from typing import Callable

import pytest
import pytz
from flask import Flask
from mock import Mock, create_autospec, patch

from recidiviz.ingest.scrape import constants, scrape_phase, sessions, worker
from recidiviz.ingest.scrape.task_params import QueueRequest, Task
from recidiviz.utils.regions import Region

PATH = "/work/us_ca"
FAKE_QUEUE_PARAMS = QueueRequest(
    scrape_type=constants.ScrapeType.BACKGROUND,
    scraper_start_time=datetime.datetime.now(tz=pytz.UTC),
    next_task=Task(
        task_type=constants.TaskType.INITIAL,
        endpoint="some.endpoint",
    ),
)

app = Flask(__name__)
app.register_blueprint(worker.worker)
app.config["TESTING"] = True


@patch("recidiviz.utils.metadata.project_id", Mock(return_value="test-project"))
@patch("recidiviz.utils.metadata.project_number", Mock(return_value="123456789"))
class TestWorker:
    """Tests for requests to the Worker API."""

    # noinspection PyAttributeOutsideInit
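
Given these fixtures, a request to the worker endpoint could look roughly like the sketch below. The body shape mirrors the request_body dicts used in the scraper tests above, while the task name and the expected status code are illustrative assumptions:

    def test_post_work_sketch(self) -> None:
        client = app.test_client()
        response = client.post(
            PATH,  # "/work/us_ca"
            json={
                "region": "us_ca",
                "task": "fake_task",  # hypothetical task-method name
                "params": FAKE_QUEUE_PARAMS.to_serializable(),
            },
        )
        assert response.status_code == 200  # assumes auth checks are mocked out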
Example #25
    def resume_scrape(self, scrape_type):
        """Resume a stopped scrape from where it left off

        Starts the scraper up again at the same place (roughly) as it had been
        stopped previously. This allows for cron jobs to start/stop scrapers at
        different times of day.

        Args:
            scrape_type: (ScrapeType) Type of scraping to resume

        Returns:
            N/A
        """
        # Note: None of the current scrapers support resumes, so this function
        # doesn't fully work. For instance, content is thrown away.
        if scrape_type is constants.ScrapeType.BACKGROUND:
            # Background scrape

            # In most scrapers, background scrapes will use
            # short-lived docket items. However, some background
            # scrapes use only one docket item to run a giant scrape,
            # which may run for months. Limitations in GAE Pull Queues
            # make it difficult to keep track of a leased task for
            # that long, so we don't try. Resuming a background scrape
            # simply resumes from session data, and the task stays in
            # the docket un-leased. It will get deleted the next time
            # we start a new background scrape.

            recent_sessions = sessions.get_recent_sessions(
                ScrapeKey(self.get_region().region_code, scrape_type)
            )

            last_scraped = None
            for session in recent_sessions:
                if session.last_scraped:
                    last_scraped = session.last_scraped
                    break

            if last_scraped:
                content = last_scraped.split(", ")
            else:
                logging.error(
                    "No earlier session with last_scraped found; " "cannot resume."
                )
                return

        else:
            # Snapshot scrape

            # Get an item from the docket and continue from there. These queries
            # are very quick, so we don't bother trying to resume the same task
            # we left off on.

            content = self.iterate_docket_item(scrape_type)
            if not content:
                sessions.close_session(
                    ScrapeKey(self.get_region().region_code, scrape_type)
                )
                return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
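
For context, last_scraped stores a "Last, First" name string, so the split above recovers the resume content; compare the "Bangalter, Thomas" fixture in Example #12:

last_scraped = "Bangalter, Thomas"  # fixture value from Example #12
content = last_scraped.split(", ")  # -> ["Bangalter", "Thomas"]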