Example #1
    def __init__(self, region_name):
        """Initialize the parent scraper object.

        Args:
            region_name: (string) name of the region of the child scraper.

        """

        # Passing verify=False to requests produces an InsecureRequestWarning;
        # disable it here.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        self.region = regions.get_region(region_name)
        self.scraper_work_url = "/scraper/work/{}".format(region_name)
        self.cloud_task_manager = ScraperCloudTaskManager()
    def test_list_scrape_tasks(self, mock_client):
        # Arrange
        region_code = 'us_ca_san_francisco'
        project_id = 'recidiviz-456'

        queue_name = 'test-queue-name'
        queue_path = f'queue_path/{project_id}/{QUEUES_REGION}/{queue_name}'

        mock_client.return_value.task_path.side_effect = self.task_path
        mock_client.return_value.queue_path.return_value = queue_path

        task1 = tasks_v2.types.task_pb2.Task(
            name=self.task_path(project_id, QUEUES_REGION, queue_name,
                                'us_ca_san_francisco-12345'))
        task2 = tasks_v2.types.task_pb2.Task(name=self.task_path(
            project_id, QUEUES_REGION, queue_name, 'us_ca_san_mateo-12345'))

        mock_client.return_value.list_tasks.return_value = [task1, task2]

        # Act
        tasks = ScraperCloudTaskManager(project_id=project_id). \
            list_scrape_tasks(region_code=region_code,
                              queue_name=queue_name)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.list_tasks.assert_called_with(queue_path)

        self.assertCountEqual(tasks, [task1])
    def test_create_scraper_phase_task(self, mock_client, mock_uuid):
        # Arrange
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid

        region_code = 'us_ca_san_francisco'
        project_id = 'recidiviz-456'

        queue_path = f'queue_path/{project_id}/{QUEUES_REGION}'
        task_id = 'us_ca_san_francisco-random-uuid'
        task_path = f'{queue_path}/{task_id}'
        url = '/my_enqueue/phase'

        task = tasks_v2.types.task_pb2.Task(name=task_path,
                                            app_engine_http_request={
                                                'http_method':
                                                'GET',
                                                'relative_uri':
                                                f'{url}?region={region_code}',
                                            })

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id). \
            create_scraper_phase_task(region_code=region_code, url=url)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, SCRAPER_PHASE_QUEUE_V2)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, SCRAPER_PHASE_QUEUE_V2, task_id)
        mock_client.return_value.create_task.assert_called_with(
            queue_path, task)
Example #4
    def test_purge_scrape_tasks(self, mock_client):
        # Arrange
        region_code = 'us_ca_san_francisco'
        project_id = 'recidiviz-456'

        queue_name = 'test-queue-name'
        queue_path = f'queue_path/{project_id}/{QUEUES_REGION}/{queue_name}'

        task1 = tasks_v2.types.task_pb2.Task(
            name=self.task_path(project_id, QUEUES_REGION, queue_name,
                                'us_ca_san_francisco-12345'))
        task2 = tasks_v2.types.task_pb2.Task(
            name=self.task_path(project_id, QUEUES_REGION, queue_name,
                                'us_ca_san_francisco-12345'))

        mock_client.return_value.list_tasks.return_value = [task1, task2]

        mock_client.return_value.task_path.side_effect = self.task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id). \
            purge_scrape_tasks(region_code=region_code,
                               queue_name=queue_name)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.list_tasks.assert_called_with(
            parent=queue_path)
        self.assertEqual(mock_client.return_value.delete_task.mock_calls, [
            call(name=task1.name),
            call(name=task2.name),
        ])
Example #5
    def _stop_scraper(region: str):
        logging.info("Trying to stop scraper for region [%s].", region)
        for scrape_type in scrape_types:
            key = ScrapeKey(region_code=region, scrape_type=scrape_type)
            session = sessions.get_current_session(key)
            if not session:
                logging.info(
                    "No [%s] scrape to stop for region: [%s]", scrape_type,
                    region)
                continue

            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_type,
                                                     respect_is_stoppable)
            if was_stopped:
                closed_sessions = sessions.close_session(key)
                for closed_session in closed_sessions:
                    sessions.update_phase(closed_session,
                                          scrape_phase.ScrapePhase.PERSIST)
                if next_phase:
                    logging.info("Enqueueing %s for region [%s].",
                                 next_phase, region)
                    ScraperCloudTaskManager().create_scraper_phase_task(
                        region_code=region,
                        url=next_phase_url)
Example #6
def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occured when checking region [%s]", region_code
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
def read_and_persist() -> Tuple[str, HTTPStatus]:
    """Reads all of the messages from Datastore for a region and persists
    them to the database.
    """

    region = request.args.get("region")

    if not isinstance(region, str):
        raise ValueError(f"Expected string region, found [{region}]")

    batch_tags = {
        monitoring.TagKey.STATUS: "COMPLETED",
        monitoring.TagKey.PERSISTED: False,
    }
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags(
        {monitoring.TagKey.REGION: region}
    ), monitoring.measurements(batch_tags) as measurements:
        measurements.measure_int_put(m_batch_count, 1)

        session = sessions.get_most_recent_completed_session(
            region, ScrapeType.BACKGROUND
        )

        if not session:
            raise ValueError(
                f"Most recent session for region [{region}] is unexpectedly None"
            )

        scrape_type = session.scrape_type

        try:
            did_persist = persist_to_database(region, session.start)
            batch_tags[monitoring.TagKey.PERSISTED] = did_persist
        except Exception as e:
            logging.exception(
                "An exception occurred in read and persist: %s", type(e).__name__
            )
            batch_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(type(e).__name__)
            sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
            raise BatchPersistError(region, scrape_type) from e

        if did_persist:
            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                logging.info("Enqueueing %s for region %s.", next_phase, region)
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region, url=url_for(next_phase)
                )
            return "", HTTPStatus.OK

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
        return "", HTTPStatus.ACCEPTED
    def test_create_scrape_task(self, mock_client, mock_uuid):
        # Arrange
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid

        region_code = 'us_ca_san_francisco'
        project_id = 'recidiviz-456'

        queue_name = 'test-queue-name'
        queue_path = f'queue_path/{project_id}/{QUEUES_REGION}/{queue_name}'
        task_id = 'us_ca_san_francisco-random-uuid'
        task_path = f'{queue_path}/{task_id}'
        url = '/my_scrape/task'

        body = {
            'region':
            region_code,
            'params':
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint='www.google.com'),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now()).to_serializable()
        }
        task = tasks_v2.types.task_pb2.Task(name=task_path,
                                            app_engine_http_request={
                                                'http_method': 'POST',
                                                'relative_uri': url,
                                                'body':
                                                json.dumps(body).encode()
                                            })

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id). \
            create_scrape_task(region_code=region_code,
                               queue_name=queue_name,
                               url=url,
                               body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            queue_path, task)
    def test_create_scrape_task(self, mock_client: Mock,
                                mock_uuid: Mock) -> None:
        # Arrange
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid

        region_code = "us_ca_san_francisco"
        project_id = "recidiviz-456"

        queue_name = "test-queue-name"
        queue_path = f"queue_path/{project_id}/{QUEUES_REGION}/{queue_name}"
        task_id = "us_ca_san_francisco-random-uuid"
        task_path = f"{queue_path}/{task_id}"
        url = "/my_scrape/task"

        body = {
            "region":
            region_code,
            "params":
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint="www.google.com"),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now(),
            ).to_serializable(),
        }
        task = tasks_v2.types.task_pb2.Task(
            name=task_path,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri": url,
                "body": json.dumps(body).encode(),
            },
        )

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id).create_scrape_task(
            region_code=region_code, queue_name=queue_name, url=url, body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND))
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info("Enqueueing [%s] for region [%s].",
                             next_phase, region_code)
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url)
    def test_create_scraper_phase_task(self, mock_client: Mock,
                                       mock_uuid: Mock) -> None:
        # Arrange
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid

        region_code = "us_ca_san_francisco"
        project_id = "recidiviz-456"

        queue_path = f"queue_path/{project_id}/{QUEUES_REGION}"
        task_id = "us_ca_san_francisco-random-uuid"
        task_path = f"{queue_path}/{task_id}"
        url = "/my_enqueue/phase"

        task = tasks_v2.types.task_pb2.Task(
            name=task_path,
            app_engine_http_request={
                "http_method": "GET",
                "relative_uri": f"{url}?region={region_code}",
            },
        )

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(
            project_id=project_id).create_scraper_phase_task(
                region_code=region_code, url=url)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, SCRAPER_PHASE_QUEUE_V2)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, SCRAPER_PHASE_QUEUE_V2, task_id)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)
Example #12
def is_scraper_finished(region_code: str, cloud_task_manager: ScraperCloudTaskManager):
    region = regions.get_region(region_code)
    return not cloud_task_manager.list_scrape_tasks(
        region_code=region_code, queue_name=region.get_queue_name()
    )
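The function above treats an empty scrape queue as the signal that a region has finished. Below is a hedged sketch of exercising that contract with a mocked task manager, mirroring the Mock style of the tests above; the region code is illustrative only, and the sketch assumes regions.get_region can resolve it outside a test harness.

from unittest.mock import Mock


def example_finished_check() -> None:
    manager = Mock(spec=ScraperCloudTaskManager)

    manager.list_scrape_tasks.return_value = []        # empty queue
    assert is_scraper_finished("us_ca_san_francisco", manager)

    manager.list_scrape_tasks.return_value = ["task"]  # work remaining
    assert not is_scraper_finished("us_ca_san_francisco", manager)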
Example #13
class Scraper(Ingestor, metaclass=abc.ABCMeta):
    """The base for all scraper objects. It handles basic setup, scrape
    process control (start, resume, stop), web requests, task
    queueing, state tracking, and a bunch of static convenience
    methods for data manipulation.

    Note that all child classes must implement the person_id_to_record_id
    method, which is used to iterate docket items.

    """

    def __init__(self, region_name):
        """Initialize the parent scraper object.

        Args:
            region_name: (string) name of the region of the child scraper.

        """

        # Passing verify=False to requests produces an InsecureRequestWarning;
        # disable it here.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        self.region = regions.get_region(region_name)
        self.scraper_work_url = "/scraper/work/{}".format(region_name)
        self.cloud_task_manager = ScraperCloudTaskManager()

    @abc.abstractmethod
    def get_initial_task_method(self):
        """Abstract method for child classes to specify the name of the first
        task to run in the scraper.

        Returns:
            The name of the function to run as the first task.

        """

    @abc.abstractmethod
    def get_initial_task(self) -> Task:
        """Returns the initial task to use for the first call."""

    def get_region(self):
        """Retrieve the region object associated with this scraper.

        Returns:
            the region object

        """
        return self.region

    def start_scrape(self, scrape_type):
        """Start new scrape session / query against corrections site

        Retrieves first docket item, enqueues task for initial search
        page scrape to start the new scraping session.

        Args:
            scrape_type: (ScrapeType) The type of scrape to start

        Returns:
            N/A

        """
        docket_item = self.iterate_docket_item(scrape_type)
        scrape_key = ScrapeKey(self.get_region().region_code, scrape_type)
        # Ensure that the topic and subscription are created on start.
        pubsub_helper.create_topic_and_subscription(scrape_key, BATCH_PUBSUB_TYPE)
        if not docket_item:
            logging.error(
                "Found no %s docket items for %s, shutting down.",
                scrape_type,
                self.get_region().region_code,
            )
            sessions.close_session(scrape_key)
            return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )

    def stop_scrape(self, scrape_type, respect_is_stoppable=False) -> bool:
        """Stops all active scraping tasks, resume non-targeted scrape types
        Stops the scraper, even if in the middle of a session. In
        production, this is called by a cron job scheduled to prevent
        interference with the normal operation of the scraped site.
        We share the scraping taskqueue between snapshot and
        background scraping to be certain of our throttling for the
        third-party service. As a result, cleaning up / purging the
        taskqueue necessarily kills all scrape types.  We kick off
        resume_scrape for any ongoing scraping types that aren't
        targets.
        Args:
            scrape_type: Scrape type to terminate
            respect_is_stoppable: Defaults to false, in which case the scraper
                will be stopped regardless of whether `is_stoppable` is set to
                true. Otherwise, stops the region's scraper only if its
                `is_stoppable` is set to true.
        Returns:
            A bool indicating whether or not the scrape was stopped.
        """
        region = self.get_region()

        if respect_is_stoppable and not region.is_stoppable:
            logging.info(
                "Stop scrape was called and ignored for the region: %s "
                "because the region's manifest is flagged as not stoppable",
                region.region_code,
            )
            return False

        logging.info("Stopping scrape for the region: %s", region.region_code)

        try:
            self.cloud_task_manager.purge_scrape_tasks(
                region_code=region.region_code, queue_name=region.get_queue_name()
            )
        except Exception as e:
            logging.error(
                "Caught an exception while trying to purge scrape "
                "tasks. The message was:\n%s",
                str(e),
            )
            return False

        # Check for other running scrapes, and if found kick off a delayed
        # resume for them since the taskqueue purge will kill them.
        other_scrapes = set()
        open_sessions = sessions.get_sessions(region.region_code, include_closed=False)
        for session in open_sessions:
            if session.scrape_type != scrape_type:
                other_scrapes.add(session.scrape_type)

        for scrape in other_scrapes:
            logging.info("Resuming unaffected scrape type: %s.", str(scrape))
            self.resume_scrape(scrape)

        return True

    def resume_scrape(self, scrape_type):
        """Resume a stopped scrape from where it left off

        Starts the scraper up again at the same place (roughly) as it had been
        stopped previously. This allows for cron jobs to start/stop scrapers at
        different times of day.

        Args:
            scrape_type: (ScrapeType) Type of scraping to resume

        Returns:
            N/A
        """
        # Note: None of the current scrapers support resumes, so this function
        # doesn't fully work. For instance, content is thrown away.
        if scrape_type is constants.ScrapeType.BACKGROUND:
            # Background scrape

            # In most scrapers, background scrapes will use
            # short-lived docket items. However, some background
            # scrapes use only one docket item to run a giant scrape,
            # which may run for months. Limitations in GAE Pull Queues
            # make it difficult to keep track of a leased task for
            # that long, so we don't try. Resuming a background scrape
            # simply resumes from session data, and the task stays in
            # the docket un-leased. It will get deleted the next time
            # we start a new background scrape.

            recent_sessions = sessions.get_recent_sessions(
                ScrapeKey(self.get_region().region_code, scrape_type)
            )

            last_scraped = None
            for session in recent_sessions:
                if session.last_scraped:
                    last_scraped = session.last_scraped
                    break

            if last_scraped:
                content = last_scraped.split(", ")
            else:
                logging.error(
                    "No earlier session with last_scraped found; cannot resume."
                )
                return

        else:
            # Snapshot scrape

            # Get an item from the docket and continue from there. These queries
            # are very quick, so we don't bother trying to resume the same task
            # we left off on.

            content = self.iterate_docket_item(scrape_type)
            if not content:
                sessions.close_session(
                    ScrapeKey(self.get_region().region_code, scrape_type)
                )
                return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )

    @staticmethod
    def fetch_page(
        url,
        headers=None,
        cookies=None,
        params=None,
        post_data=None,
        json_data=None,
        should_proxy=True,
    ):
        """Fetch content from a URL. If data is None (the default), we perform
        a GET for the page. If the data is set, it must be a dict of parameters
        to use as POST data in a POST request to the url.

        Args:
            url: (string) URL to fetch content from
            headers: (dict) any headers to send in addition to the default
            cookies: (dict) any cookies to send in the request.
            params: dict of parameters to pass in the url of a GET request
            post_data: dict of parameters to pass into the html POST request
            json_data: dict of parameters in JSON format to pass into the html
                       POST request
            extra_headers: dict of parameters to add to the headers of this
                           request
            should_proxy: (bool) whether or not to use a proxy.

        Returns:
            The content.

        """
        if should_proxy:
            proxies = scraper_utils.get_proxies()
        else:
            proxies = None
        headers = headers.copy() if headers else {}
        if "User-Agent" not in headers:
            headers.update(scraper_utils.get_headers())

        try:
            if post_data is None and json_data is None:
                page = requests.get(
                    url,
                    proxies=proxies,
                    headers=headers,
                    cookies=cookies,
                    params=params,
                    verify=False,
                )
            elif params is None:
                page = requests.post(
                    url,
                    proxies=proxies,
                    headers=headers,
                    cookies=cookies,
                    data=post_data,
                    json=json_data,
                    verify=False,
                )
            else:
                raise ValueError(
                    "Both params ({}) for a GET request and either post_data "
                    "({}) or json_data ({}) for a POST request were set.".format(
                        params, post_data, json_data
                    )
                )
            page.raise_for_status()
        except requests.exceptions.RequestException as ce:
            raise FetchPageError(ce.request, ce.response) from ce

        return page

    def add_task(self, task_name, request: QueueRequest):
        """Add a task to the task queue.

        Args:
            task_name: (string) name of the function in the scraper class to
                       be invoked
            request: (dict) parameters to be passed to the function
        """
        self.cloud_task_manager.create_scrape_task(
            region_code=self.get_region().region_code,
            queue_name=self.get_region().get_queue_name(),
            url=self.scraper_work_url,
            body={
                "region": self.get_region().region_code,
                "task": task_name,
                "params": request.to_serializable(),
            },
        )

    def iterate_docket_item(self, scrape_type):
        """Leases new docket item, updates current session, returns item
        contents

        Returns an entity to scrape as provided by the docket item.

        Args:
            scrape_type: (ScrapeType) Type of docket item to retrieve

        Returns:
            False if there was any failure to retrieve a new docket item.
            If successful:
                Background scrape: ("surname", "given names")
        """

        item_content = tracker.iterate_docket_item(
            ScrapeKey(self.get_region().region_code, scrape_type)
        )

        if item_content is None:
            return False

        return item_content
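As a rough illustration of how the abstract hooks above fit together, here is a minimal, hypothetical child scraper. UsXxScraper, the 'us_xx' region code, the example.com endpoint, and scrape_roster are invented for this sketch; it assumes the Scraper, Task, and TaskType APIs exactly as shown in these examples.

class UsXxScraper(Scraper):
    """Hypothetical example scraper for an imaginary 'us_xx' region."""

    def __init__(self):
        super().__init__('us_xx')

    def get_initial_task_method(self):
        # Name of the scraper method to run as the first task.
        return 'scrape_roster'

    def get_initial_task(self) -> Task:
        return Task(task_type=TaskType.INITIAL,
                    endpoint='https://example.com/roster')

    def scrape_roster(self, content, task):
        # Parse the roster page and enqueue any follow-up work via add_task();
        # fetch_page() can be used for additional requests, e.g.:
        #     page = self.fetch_page('https://example.com/roster?page=2')
        pass

A handler would then call UsXxScraper().start_scrape(constants.ScrapeType.BACKGROUND) to lease the first docket item and enqueue scrape_roster as the initial task.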