Example #1
    def test_resume_scrape_snapshot(self, mock_get_region, mock_tracker,
                                    mock_task_manager, mock_datetime):
        docket_item = (41620, ['daft', 'punk'])
        region = 'us_nd'
        scrape_type = constants.ScrapeType.SNAPSHOT
        queue_name = 'us_nd_scraper'
        initial_task = 'press_it'

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.resume_scrape(scrape_type)

        mock_get_region.assert_called_with(region)

        queue_params = QueueRequest(
            scrape_type=scrape_type.value,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=(83240, ['dagt', 'punk']),
        )
        request_body = {
            'region': region,
            'task': initial_task,
            'params': queue_params.to_serializable()
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body)
Example #2
    def test_scrape_data_and_more_no_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
            ingest_info=self.ii
        )]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #3
    def test_fetch_sends_all_args(self, mock_get_more, mock_fetch):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, None)
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, headers='TEST_HEADERS', cookies='TEST_COOKIES',
            params='TEST_PARAMS', post_data='TEST_POST', json='TEST_JSON'
        )
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )]

        mock_fetch.assert_called_once_with(
            t.endpoint, t.response_type, headers=t.headers, cookies=t.cookies,
            params=t.params, post_data=t.post_data, json_data=t.json
        )
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #4
    def test_get_more_multiple_tasks_returned(
        self, mock_get_more: Mock, mock_fetch: Mock
    ) -> None:
        mock_get_more.return_value = [TEST_TASK, TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, None)
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=TEST_TASK,
                scraper_start_time=start_time,
            )
        ] * 2

        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #5
    def test_get_more_and_updates_cookies(
        self, mock_get_more: Mock, mock_fetch: Mock
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {1: 1})
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )

        t = Task.evolve(TEST_TASK, cookies={1: 1})

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=t,
                scraper_start_time=start_time,
            )
        ]

        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #6
    def test_scrape_data_and_more_no_persist_second_time_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        populate_task = Task.evolve(TEST_TASK,
                                    task_type=constants.TaskType.SCRAPE_DATA)
        mock_get_more.return_value = [populate_task]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK,
                        task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=populate_task,
                scraper_start_time=start_time,
                ingest_info=self.ii,
            )
        ]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)

        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        scraper._generic_scrape(scraper.tasks[0])
        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 2)
        self.assertEqual(mock_write.call_count, 1)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
Example #7
    def test_resume_scrape_background(
        self,
        mock_get_region: Mock,
        mock_sessions: Mock,
        mock_task_manager: Mock,
        mock_datetime: Mock,
    ) -> None:
        """Tests the resume_scrape flow for background scraping."""
        region = "us_nd"
        scrape_type = constants.ScrapeType.BACKGROUND
        queue_name = "us_nd_scraper"
        initial_task = "charge_it"

        mock_get_region.return_value = mock_region(region, queue_name)
        recent_session_none_scraped = ScrapeSession.new(
            key=None,
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE,
        )
        recent_session = ScrapeSession.new(
            key=None,
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE,
            last_scraped="Bangalter, Thomas",
        )
        mock_sessions.return_value = [
            recent_session_none_scraped, recent_session
        ]
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.resume_scrape(scrape_type)

        mock_get_region.assert_called_with(region)
        mock_sessions.assert_called_with(ScrapeKey(region, scrape_type))

        queue_params = QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=['Bangalter', 'Thomas'],
        )
        request_body = {
            "region": region,
            "task": initial_task,
            "params": queue_params.to_serializable(),
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body,
        )
Example #8
def add_task(
    queue: deque, _self: Scraper, task_name: str, request: QueueRequest
) -> None:
    """Overwritten version of `add_task` which adds the task to an in-memory
    queue.
    """

    # Serialize and deserialize the request, simply to replicate production
    # behavior and catch any potential serialization issues.
    serialized = json.dumps(request.to_serializable())
    request = QueueRequest.from_serializable(json.loads(serialized))

    # Add it to the queue
    queue.append((task_name, request))
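
A minimal usage sketch of the helper above. FakeScraper, TEST_TASK, constants, and datetime are assumed to come from the surrounding test module; this is illustrative only and not part of the original examples.

from collections import deque

# Hypothetical sketch: exercise the in-memory add_task directly.
queue: deque = deque()
scraper = FakeScraper("test")
request = QueueRequest(
    scrape_type=constants.ScrapeType.BACKGROUND,
    scraper_start_time=datetime.datetime.now(),
    next_task=TEST_TASK,
)

add_task(queue, scraper, "_generic_scrape", request)

# The request should survive the JSON round trip unchanged.
task_name, round_tripped = queue.popleft()
assert task_name == "_generic_scrape"
assert round_tripped == request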
Example #9
    def test_scrape_data_and_more_yes_persist(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should not send the ii since we chose to persist.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=TEST_TASK,
                scraper_start_time=start_time,
            )
        ]
        expected_metadata = IngestMetadata(
            region=scraper.region.region_code,
            jurisdiction_id=scraper.region.jurisdiction_id,
            ingest_time=start_time,
            enum_overrides=scraper.get_enum_overrides(),
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #10
    def test_get_more_tasks_failure_batch(
        self,
        mock_flask: Mock,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_batch_error: Mock,
    ) -> None:
        mock_fetch.return_value = ("TEST", {})
        mock_get_more.side_effect = ValueError("TEST ERROR")
        mock_flask_get = Mock()
        mock_flask_get.return_value = "TRACE ID"
        mock_flask.headers.get = mock_flask_get

        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )
        scraper = FakeScraper("test")
        with self.assertRaises(ScraperGetMoreTasksError):
            scraper._generic_scrape(req)
        self.assertEqual(mock_batch_error.call_count, 1)

        scrape_key = ScrapeKey(
            region_code="test", scrape_type=constants.ScrapeType.BACKGROUND
        )
        mock_batch_error.assert_called_once_with(
            error="TEST ERROR",
            trace_id="TRACE ID",
            task=TEST_TASK,
            scrape_key=scrape_key,
        )
Example #11
    def test_scrape_data_no_more_tasks(self, mock_get_more, mock_fetch,
                                       mock_populate, mock_write):
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertEqual(len(scraper.tasks), 0)
Example #12
    def test_scrape_data_no_more_tasks_batch(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
        mock_batch_write: Mock,
    ) -> None:
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper._generic_scrape(req)

        scrape_key = ScrapeKey("test", constants.ScrapeType.BACKGROUND)
        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_batch_write.assert_called_once_with(
            ingest_info=self.ii,
            task=t,
            scrape_key=scrape_key,
        )
        self.assertEqual(len(scraper.tasks), 0)
Example #13
    def start_scrape(self, scrape_type):
        """Start new scrape session / query against corrections site

        Retrieves first docket item, enqueues task for initial search
        page scrape to start the new scraping session.

        Args:
            scrape_type: (ScrapeType) The type of scrape to start

        Returns:
            N/A

        """
        docket_item = self.iterate_docket_item(scrape_type)
        scrape_key = ScrapeKey(self.get_region().region_code, scrape_type)
        # Ensure that the topic and subscription are created on start.
        pubsub_helper.create_topic_and_subscription(scrape_key, BATCH_PUBSUB_TYPE)
        if not docket_item:
            logging.error(
                "Found no %s docket items for %s, shutting down.",
                scrape_type,
                self.get_region().region_code,
            )
            sessions.close_session(scrape_key)
            return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
Example #14
    def test_start_scrape_background(
        self,
        mock_pubsub: Mock,
        mock_get_region: Mock,
        mock_tracker: Mock,
        mock_task_manager: Mock,
        mock_datetime: Mock,
    ) -> None:
        docket_item = ("Dog", "Cat")
        region = "us_nd"
        scrape_type = constants.ScrapeType.BACKGROUND
        queue_name = "us_nd_scraper"
        initial_task = "use_it"

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.start_scrape(scrape_type)

        mock_get_region.assert_called_with(region)
        mock_tracker.assert_called_with(ScrapeKey(region, scrape_type))
        mock_pubsub.assert_called_with(ScrapeKey(region, scrape_type),
                                       BATCH_PUBSUB_TYPE)

        queue_params = QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=docket_item,
        )
        request_body = {
            "region": region,
            "task": initial_task,
            "params": queue_params.to_serializable(),
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body,
        )
Example #15
def start_scrape(queue, self, scrape_type):
    add_task(
        queue,
        self,
        self.get_initial_task_method(),
        QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=datetime.now(),
            next_task=self.get_initial_task(),
        ),
    )
Example #16
    def test_resume_scrape_snapshot(
        self,
        mock_get_region: Mock,
        mock_tracker: Mock,
        mock_task_manager: Mock,
        mock_datetime: Mock,
    ) -> None:
        docket_item = (41620, ["daft", "punk"])
        region = "us_nd"
        scrape_type = constants.ScrapeType.SNAPSHOT
        queue_name = "us_nd_scraper"
        initial_task = "press_it"

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.resume_scrape(scrape_type)

        mock_get_region.assert_called_with(region)

        queue_params = QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=(83240, ['dagt', 'punk']),
        )
        request_body = {
            "region": region,
            "task": initial_task,
            "params": queue_params.to_serializable(),
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body,
        )
Example #17
    def test_fetch_failure(self, mock_fetch: Mock) -> None:
        mock_fetch.side_effect = ValueError("TEST ERROR")

        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=datetime.datetime.now(),
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        with self.assertRaises(ScraperFetchError):
            scraper._generic_scrape(req)
Example #18
def start_scrape(
    queue: deque, self: Scraper, scrape_type: constants.ScrapeType
) -> None:
    add_task(
        queue,
        self,
        self.get_initial_task_method(),
        QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=datetime.now(),
            next_task=self.get_initial_task(),
        ),
    )
Example #19
    def test_get_more_tasks_failure(self, mock_get_more, mock_fetch):
        mock_fetch.return_value = ("TEST", {})
        mock_get_more.side_effect = ValueError("TEST ERROR")

        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=datetime.datetime.now(),
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        with self.assertRaises(ScraperGetMoreTasksError):
            scraper._generic_scrape(req)
Example #20
    def test_start_scrape_background(self, mock_pubsub, mock_get_region,
                                     mock_tracker, mock_task_manager,
                                     mock_datetime):
        docket_item = ('Dog', 'Cat')
        region = 'us_nd'
        scrape_type = constants.ScrapeType.BACKGROUND
        queue_name = 'us_nd_scraper'
        initial_task = 'use_it'

        mock_get_region.return_value = mock_region(region, queue_name)
        mock_tracker.return_value = docket_item
        mock_task_manager.return_value.create_scrape_task.return_value = None
        mock_datetime.now.return_value = _DATETIME

        scraper = FakeScraper(region, initial_task)
        scraper.start_scrape(scrape_type)

        mock_get_region.assert_called_with(region)
        mock_tracker.assert_called_with(ScrapeKey(region, scrape_type))
        mock_pubsub.assert_called_with(ScrapeKey(region, scrape_type),
                                       BATCH_PUBSUB_TYPE)

        queue_params = QueueRequest(
            scrape_type=scrape_type.value,
            scraper_start_time=_DATETIME,
            next_task=FAKE_TASK,
            # content=docket_item,
        )
        request_body = {
            'region': region,
            'task': initial_task,
            'params': queue_params.to_serializable()
        }

        mock_task_manager.return_value.create_scrape_task.assert_called_with(
            region_code=region,
            queue_name=queue_name,
            url=scraper.scraper_work_url,
            body=request_body)
Example #21
    def test_create_scrape_task(self, mock_client, mock_uuid):
        # Arrange
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid

        region_code = 'us_ca_san_francisco'
        project_id = 'recidiviz-456'

        queue_name = 'test-queue-name'
        queue_path = f'queue_path/{project_id}/{QUEUES_REGION}/{queue_name}'
        task_id = 'us_ca_san_francisco-random-uuid'
        task_path = f'{queue_path}/{task_id}'
        url = '/my_scrape/task'

        body = {
            'region':
            region_code,
            'params':
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint='www.google.com'),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now()).to_serializable()
        }
        task = tasks_v2.types.task_pb2.Task(name=task_path,
                                            app_engine_http_request={
                                                'http_method': 'POST',
                                                'relative_uri': url,
                                                'body':
                                                json.dumps(body).encode()
                                            })

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id). \
            create_scrape_task(region_code=region_code,
                               queue_name=queue_name,
                               url=url,
                               body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            queue_path, task)
Example #22
    def test_create_scrape_task(self, mock_client: Mock,
                                mock_uuid: Mock) -> None:
        # Arrange
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid

        region_code = "us_ca_san_francisco"
        project_id = "recidiviz-456"

        queue_name = "test-queue-name"
        queue_path = f"queue_path/{project_id}/{QUEUES_REGION}/{queue_name}"
        task_id = "us_ca_san_francisco-random-uuid"
        task_path = f"{queue_path}/{task_id}"
        url = "/my_scrape/task"

        body = {
            "region":
            region_code,
            "params":
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint="www.google.com"),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now(),
            ).to_serializable(),
        }
        task = tasks_v2.types.task_pb2.Task(
            name=task_path,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri": url,
                "body": json.dumps(body).encode(),
            },
        )

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id).create_scrape_task(
            region_code=region_code, queue_name=queue_name, url=url, body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)
Example #23
    def add_task(self, task_name, request: QueueRequest):
        """ Add a task to the task queue.

        Args:
            task_name: (string) name of the function in the scraper class to
                       be invoked
            request: (QueueRequest) parameters to be passed to the function
        """
        self.cloud_task_manager.create_scrape_task(
            region_code=self.get_region().region_code,
            queue_name=self.get_region().get_queue_name(),
            url=self.scraper_work_url,
            body={
                'region': self.get_region().region_code,
                'task': task_name,
                'params': request.to_serializable(),
            })
Example #24
    def test_content_no_fetch(self, mock_get_more: Mock, mock_fetch: Mock) -> None:
        t = Task.evolve(TEST_TASK, content=TEST_HTML)
        mock_get_more.return_value = [t]
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [req]

        self.assertEqual(mock_fetch.call_count, 0)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #25
    def _generic_scrape(self, request: QueueRequest):
        """
        General handler for all scrape tasks.  This function is a generic entry
        point into all types of scrapes.  It decides what to call based on
        the request contents.

        Args:
            request: (QueueRequest) parameters passed from the previous scrape
                task, including the next Task to run.
        """
        try:
            task = request.next_task

            # Here we handle a special case where we weren't really sure
            # we were going to get data when we submitted a task, but then
            # we ended up with data, so no more requests are required,
            # just the content we already have.
            # TODO(#680): remove this
            if task.content is not None:
                content = self._parse_html_content(task.content)
                cookies = None
            else:
                post_data = task.post_data

                # Let the child transform the post_data if it wants before
                # sending the requests.  This hook is in here in case the
                # child did something like compress the post_data before
                # it put it on the queue.
                self.transform_post_data(post_data)

                # We always fetch some content before doing anything.
                # Note that we use get here for the post_data to return a
                # default value of None if this scraper doesn't set it.
                try:
                    content, cookies = self._fetch_content(
                        task.endpoint,
                        task.response_type,
                        headers=task.headers,
                        cookies=task.cookies,
                        params=task.params,
                        post_data=post_data,
                        json_data=task.json)
                except Exception as e:
                    raise ScraperFetchError(str(e)) from e

            scraped_data = None
            if self.should_scrape_data(task.task_type):
                # If we want to scrape data, we should either create an
                # ingest_info object or get the one that already exists.
                logging.info("Scraping data for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)
                try:
                    scraped_data = self.populate_data(
                        content, task, request.ingest_info or IngestInfo())
                except Exception as e:
                    raise ScraperPopulateDataError(str(e)) from e

            if self.should_get_more_tasks(task.task_type):
                logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)

                # Only send along ingest info if it will not be persisted now.
                ingest_info_to_send = None
                if scraped_data is not None and not scraped_data.persist:
                    ingest_info_to_send = scraped_data.ingest_info

                try:
                    # pylint: disable=assignment-from-no-return
                    next_tasks = self.get_more_tasks(content, task)
                except Exception as e:
                    raise ScraperGetMoreTasksError(str(e)) from e
                for next_task in next_tasks:
                    # Include cookies received from response, if any
                    if cookies:
                        cookies.update(next_task.cookies)
                        next_task = Task.evolve(next_task, cookies=cookies)
                    self.add_task(
                        '_generic_scrape',
                        QueueRequest(
                            scrape_type=request.scrape_type,
                            scraper_start_time=request.scraper_start_time,
                            next_task=next_task,
                            ingest_info=ingest_info_to_send,
                        ))

            if scraped_data is not None and scraped_data.persist:
                if scraped_data.ingest_info:
                    logging.info("Logging at most 4 people (were %d):",
                                 len(scraped_data.ingest_info.people))
                    loop_count = min(len(scraped_data.ingest_info.people),
                                     constants.MAX_PEOPLE_TO_LOG)
                    for i in range(loop_count):
                        logging.info("[%s]",
                                     str(scraped_data.ingest_info.people[i]))
                    logging.info("Last seen time of person being set as: [%s]",
                                 request.scraper_start_time)
                    metadata = IngestMetadata(self.region.region_code,
                                              self.region.jurisdiction_id,
                                              request.scraper_start_time,
                                              self.get_enum_overrides())
                    if self.BATCH_WRITES:
                        logging.info(
                            "Queuing ingest_info ([%d] people) to "
                            "batch_persistence for [%s]",
                            len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        scrape_key = ScrapeKey(self.region.region_code,
                                               request.scrape_type)
                        batch_persistence.write(
                            ingest_info=scraped_data.ingest_info,
                            scrape_key=scrape_key,
                            task=task,
                        )
                    else:
                        logging.info(
                            "Writing ingest_info ([%d] people) to the database"
                            " for [%s]", len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        persistence.write(
                            ingest_utils.convert_ingest_info_to_proto(
                                scraped_data.ingest_info), metadata)
                for sc in scraped_data.single_counts:
                    if not sc.date:
                        scrape_key = ScrapeKey(self.region.region_code,
                                               constants.ScrapeType.BACKGROUND)
                        session = sessions.get_current_session(scrape_key)
                        if session:
                            sc = attr.evolve(sc, date=session.start.date())
                    single_count.store_single_count(
                        sc, self.region.jurisdiction_id)
        except Exception as e:
            if self.BATCH_WRITES:
                scrape_key = ScrapeKey(self.region.region_code,
                                       request.scrape_type)
                batch_persistence.write_error(
                    error=str(e),
                    trace_id=get_trace_id_from_flask(),
                    task=task,
                    scrape_key=scrape_key,
                )
            raise e
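
For orientation, a hypothetical minimal subclass sketching the hooks that _generic_scrape calls (populate_data, get_more_tasks, add_task, and the initial-task accessors). This is not the FakeScraper used in the tests above, and the abstract Scraper base class may require additional overrides.

class MinimalScraper(Scraper):
    """Hypothetical sketch of the hooks _generic_scrape relies on."""

    def __init__(self, region_name):
        # Base-class constructor signature assumed to take the region name.
        super().__init__(region_name)
        self.tasks = []

    def get_initial_task_method(self):
        # Name of the scraper method the first queued task should invoke.
        return "_generic_scrape"

    def get_initial_task(self):
        # The first Task to enqueue when a scrape starts.
        return Task(task_type=constants.TaskType.INITIAL, endpoint="some.endpoint")

    def get_more_tasks(self, content, task):
        # Parse follow-up Tasks out of the fetched content; none in this sketch.
        return []

    def populate_data(self, content, task, ingest_info):
        # Fill the IngestInfo from the fetched content; persist=True writes it out.
        return ScrapedData(ingest_info=ingest_info, persist=True)

    def add_task(self, task_name, request):
        # Collect requests in memory instead of creating real Cloud Tasks.
        self.tasks.append((task_name, request))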
Example #26
import datetime
from typing import Callable

import pytest
import pytz
from flask import Flask
from mock import Mock, create_autospec, patch

from recidiviz.ingest.scrape import constants, scrape_phase, sessions, worker
from recidiviz.ingest.scrape.task_params import QueueRequest, Task
from recidiviz.utils.regions import Region

PATH = "/work/us_ca"
FAKE_QUEUE_PARAMS = QueueRequest(
    scrape_type=constants.ScrapeType.BACKGROUND,
    scraper_start_time=datetime.datetime.now(tz=pytz.UTC),
    next_task=Task(
        task_type=constants.TaskType.INITIAL,
        endpoint="some.endpoint",
    ),
)

app = Flask(__name__)
app.register_blueprint(worker.worker)
app.config["TESTING"] = True


@patch("recidiviz.utils.metadata.project_id", Mock(return_value="test-project"))
@patch("recidiviz.utils.metadata.project_number", Mock(return_value="123456789"))
class TestWorker:
    """Tests for requests to the Worker API."""

    # noinspection PyAttributeOutsideInit
Example #27
def work(region):
    """POST request handler to route chunk of scraper work

    Very thin shim to receive a chunk of work from the task queue, and call
    the relevant part of the specified scraper to execute it.

    All scraper work that hits a third-party website goes through this handler
    as small discrete tasks, so that we leverage the taskqueue's throttling and
    retry support for network requests to the sites (and don't DOS them).

    Because scraping will vary so significantly by region, this taskqueue
    handler is very lightweight - it really just accepts the POST for the task,
    and calls the relevant regional scraper to do whatever was asked. This
    allows it to stay agnostic to regional variation.

    Never called manually, so authentication is enforced in app.yaml.

    Form data must be a bytes-encoded JSON object with parameters listed below.

    URL Parameters:
        region: (string) Region code for the scraper in question.
        task: (string) Name of the function to call in the scraper
        params: (dict) Parameter payload to give the function being called
            (optional)

    Returns:
        Response code 200 if successful

        Any other response code will make taskqueue consider the task
        failed, and it will retry the task until it expires or succeeds
        (handling backoff logic, etc.)
    """
    # Verify this was actually a task queued by our app
    if "X-AppEngine-QueueName" not in request.headers:
        logging.error("Couldn't validate task was legit, exiting.")
        return ("", HTTPStatus.INTERNAL_SERVER_ERROR)
    queue_name = request.headers.get("X-AppEngine-QueueName")

    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    task = data["task"]
    params = QueueRequest.from_serializable(data["params"])

    if region != data["region"]:
        raise ValueError(
            "Region specified in task {} does not match region from url {}.".
            format(data["region"], region))

    task_tags = {monitoring.TagKey.STATUS: "COMPLETED"}
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags(
        {monitoring.TagKey.REGION:
         region}), monitoring.measurements(task_tags) as measurements:
        measurements.measure_int_put(m_tasks, 1)
        if not sessions.get_current_session(
                ScrapeKey(region, params.scrape_type)):
            task_tags[monitoring.TagKey.STATUS] = "SKIPPED"
            logging.info(
                "Queue [%s], skipping task [%s] for [%s] because it "
                "is not in the current session.",
                queue_name,
                task,
                region,
            )
            return ("", HTTPStatus.OK)
        logging.info("Queue [%s], processing task [%s] for [%s].", queue_name,
                     task, region)

        scraper = regions.get_region(region).get_ingestor()
        scraper_task = getattr(scraper, task)

        try:
            scraper_task(params)
        except Exception as e:
            task_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(
                type(e).__name__)
            raise RequestProcessingError(region, task, params) from e

        # Respond to the task queue to mark this task as done
        return ("", HTTPStatus.OK)
Example #28
    def __init__(self, region: str, task: str, queue_request: QueueRequest):
        request_string = pprint.pformat(queue_request.to_serializable())
        msg = "Error when running '{}' for '{}' with request:\n{}".format(
            task, region, request_string)
        super().__init__(msg)
Example #29
    def resume_scrape(self, scrape_type):
        """Resume a stopped scrape from where it left off

        Starts the scraper up again at the same place (roughly) as it had been
        stopped previously. This allows for cron jobs to start/stop scrapers at
        different times of day.

        Args:
            scrape_type: (ScrapeType) Type of scraping to resume

        Returns:
            N/A
        """
        # Note: None of the current scrapers support resumes, so this function
        # doesn't fully work. For instance, content is thrown away.
        if scrape_type is constants.ScrapeType.BACKGROUND:
            # Background scrape

            # In most scrapers, background scrapes will use
            # short-lived docket items. However, some background
            # scrapes use only one docket item to run a giant scrape,
            # which may run for months. Limitations in GAE Pull Queues
            # make it difficult to keep track of a leased task for
            # that long, so we don't try. Resuming a background scrape
            # simply resumes from session data, and the task stays in
            # the docket un-leased. It will get deleted the next time
            # we start a new background scrape.

            recent_sessions = sessions.get_recent_sessions(
                ScrapeKey(self.get_region().region_code, scrape_type)
            )

            last_scraped = None
            for session in recent_sessions:
                if session.last_scraped:
                    last_scraped = session.last_scraped
                    break

            if last_scraped:
                content = last_scraped.split(", ")
            else:
                logging.error(
                    "No earlier session with last_scraped found; cannot resume."
                )
                return

        else:
            # Snapshot scrape

            # Get an item from the docket and continue from there. These queries
            # are very quick, so we don't bother trying to resume the same task
            # we left off on.

            content = self.iterate_docket_item(scrape_type)
            if not content:
                sessions.close_session(
                    ScrapeKey(self.get_region().region_code, scrape_type)
                )
                return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )