Code example #1
    def test_get_current_session(self):
        # older
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 17)))
        current = self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 18)))
        # closed
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)),
            end=fix_dt(datetime(2009, 6, 21)))
        # different scrape type
        self.create_session(
            region_code="us_ny", scrape_type=constants.ScrapeType.SNAPSHOT,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)))
        # different region
        self.create_session(
            region_code="us_fl", scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)))

        result = sessions.get_current_session(
            ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND))

        assert result.to_entity() == current.to_entity()
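
The test above pins down the selection semantics of sessions.get_current_session: among open sessions (no end time) that match both the region code and the scrape type of the ScrapeKey, the most recently started one is returned. A minimal standalone sketch of a filter with those semantics, assuming a simple in-memory session object (illustrative only, not the pulse-data implementation):

from datetime import datetime
from typing import List, Optional


class _Session:
    """Stand-in for a scrape session entity (illustrative only)."""

    def __init__(self, region_code: str, scrape_type: str,
                 start: datetime, end: Optional[datetime] = None):
        self.region_code = region_code
        self.scrape_type = scrape_type
        self.start = start
        self.end = end


def current_session_sketch(all_sessions: List[_Session], region_code: str,
                           scrape_type: str) -> Optional[_Session]:
    """Return the most recently started open session for the scrape key."""
    matching = [
        s for s in all_sessions
        if s.region_code == region_code
        and s.scrape_type == scrape_type
        and s.end is None  # still open; the "closed" session above is excluded
    ]
    # Most recent start wins, mirroring `current` in the test above.
    return max(matching, key=lambda s: s.start, default=None)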
Code example #2
    def _stop_scraper(region: str):
        logging.info("Trying to stop scraper for region [%s].", region)
        for scrape_type in scrape_types:
            key = ScrapeKey(region_code=region, scrape_type=scrape_type)
            session = sessions.get_current_session(key)
            if not session:
                logging.info(
                    "No [%s] scrape to stop for region: [%s]", scrape_type,
                    region)
                continue

            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_type,
                                                     respect_is_stoppable)
            if was_stopped:
                closed_sessions = sessions.close_session(key)
                for closed_session in closed_sessions:
                    sessions.update_phase(closed_session,
                                          scrape_phase.ScrapePhase.PERSIST)
                if next_phase:
                    logging.info("Enqueueing %s for region [%s].",
                                 next_phase, region)
                    ScraperCloudTaskManager().create_scraper_phase_task(
                        region_code=region,
                        url=next_phase_url)
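
Example #2 is a nested helper: scrape_types, respect_is_stoppable, next_phase, and next_phase_url are free variables captured from an enclosing request handler rather than globals. A hedged sketch of how such an enclosing scope might supply them (the wrapper name and its signature are assumptions, not the actual pulse-data handler):

def stop_scrapers_sketch(region_codes, scrape_types, respect_is_stoppable=True,
                         next_phase=None, next_phase_url=None):
    """Illustrative enclosing scope for the _stop_scraper closure."""

    def _stop_scraper(region: str):
        # Body as shown in example #2; it closes over scrape_types,
        # respect_is_stoppable, next_phase and next_phase_url above.
        ...

    for region in region_codes:
        _stop_scraper(region)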
Code example #3
File: batch_persistence.py  Project: dxy/pulse-data
def write(ingest_info: IngestInfo, scrape_key: ScrapeKey, task: Task):
    session = sessions.get_current_session(scrape_key)
    if not session:
        raise DatastoreError(scrape_key.region_code, "write")
    datastore_ingest_info.write_ingest_info(
        region=scrape_key.region_code,
        session_start_time=session.start,
        ingest_info=ingest_info,
        task_hash=hash(json.dumps(task.to_serializable(), sort_keys=True)))
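
Examples #3 and #5 both derive task_hash from hash(json.dumps(task.to_serializable(), sort_keys=True)); sort_keys=True is what makes the serialization, and therefore the hash, independent of the order in which the task's fields were assembled. A standalone illustration with hypothetical payload dicts (note that Python's built-in hash() of a string is only stable within one process unless PYTHONHASHSEED is fixed):

import json

# Two logically identical payloads whose keys were inserted in different order.
payload_a = {"endpoint": "https://example.com/roster", "task_type": 1}
payload_b = {"task_type": 1, "endpoint": "https://example.com/roster"}

# Without sort_keys the dumps would follow insertion order and differ;
# with sort_keys the two serializations are byte-for-byte identical.
assert json.dumps(payload_a, sort_keys=True) == json.dumps(payload_b, sort_keys=True)
assert (hash(json.dumps(payload_a, sort_keys=True))
        == hash(json.dumps(payload_b, sort_keys=True)))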
Code example #4
    def test_remove_item_from_session_and_docket(self):
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        docket.add_to_query_docket(scrape_key, get_payload()).result()
        self.create_session(scrape_key)
        tracker.iterate_docket_item(scrape_key)

        tracker.remove_item_from_session_and_docket(scrape_key)

        assert not sessions.get_current_session(scrape_key).docket_ack_id
Code example #5
def write_error(error: str, trace_id: Optional[str], task: Task,
                scrape_key: ScrapeKey):
    session = sessions.get_current_session(scrape_key)
    if not session:
        raise DatastoreError(scrape_key.region_code, "write_error")

    datastore_ingest_info.write_error(region=scrape_key.region_code,
                                      error=error, trace_id=trace_id,
                                      task_hash=hash(json.dumps(
                                          task.to_serializable(),
                                          sort_keys=True)),
                                      session_start_time=session.start)
Code example #6
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND))
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info("Enqueueing [%s] for region [%s].",
                             next_phase, region_code)
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url)
Code example #7
def remove_item_from_session_and_docket(scrape_key):
    """Deletes currently leased docket item, removes from scrape session

    Fetches the current session, determines which item from the docket is
    currently being worked on, then deletes that item and resets the session
    item to blank.

    Args:
        scrape_key: (ScrapeKey) The scraper to remove the currently leased
            item for

    Returns:
        N/A
    """
    session = sessions.get_current_session(scrape_key)

    if not session:
        logging.warning("No open sessions found to remove docket item.")
        return

    docket_ack_id = sessions.remove_docket_item_from_session(session)

    if docket_ack_id:
        docket.ack_docket_item(scrape_key, docket_ack_id)
Code example #8
File: base_scraper.py  Project: xgenie-007/pulse-data
    def _generic_scrape(self, request: QueueRequest):
        """
        General handler for all scrape tasks.  This function is a generic entry
        point into all types of scrapes.  It decides what to call based on
        the task type carried in the request.

        Args:
            request: QueueRequest carrying the next task to run, along with
                the scrape type, scraper start time, and any ingest info from
                the previous scrape iteration.
        """
        try:
            task = request.next_task

            # Here we handle a special case where we weren't really sure
            # we were going to get data when we submitted a task, but then
            # we ended up with data, so no more requests are required,
            # just the content we already have.
            # TODO(#680): remove this
            if task.content is not None:
                content = self._parse_html_content(task.content)
                cookies = None
            else:
                post_data = task.post_data

                # Let the child transform the post_data if it wants before
                # sending the requests.  This hook is in here in case the
                # child did something like compress the post_data before
                # it put it on the queue.
                self.transform_post_data(post_data)

                # We always fetch some content before doing anything.
                # Note that we use get here for the post_data to return a
                # default value of None if this scraper doesn't set it.
                try:
                    content, cookies = self._fetch_content(
                        task.endpoint,
                        task.response_type,
                        headers=task.headers,
                        cookies=task.cookies,
                        params=task.params,
                        post_data=post_data,
                        json_data=task.json)
                except Exception as e:
                    raise ScraperFetchError(str(e)) from e

            scraped_data = None
            if self.should_scrape_data(task.task_type):
                # If we want to scrape data, we should either create an
                # ingest_info object or get the one that already exists.
                logging.info("Scraping data for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)
                try:
                    scraped_data = self.populate_data(
                        content, task, request.ingest_info or IngestInfo())
                except Exception as e:
                    raise ScraperPopulateDataError(str(e)) from e

            if self.should_get_more_tasks(task.task_type):
                logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)

                # Only send along ingest info if it will not be persisted now.
                ingest_info_to_send = None
                if scraped_data is not None and not scraped_data.persist:
                    ingest_info_to_send = scraped_data.ingest_info

                try:
                    # pylint: disable=assignment-from-no-return
                    next_tasks = self.get_more_tasks(content, task)
                except Exception as e:
                    raise ScraperGetMoreTasksError(str(e)) from e
                for next_task in next_tasks:
                    # Include cookies received from response, if any
                    if cookies:
                        cookies.update(next_task.cookies)
                        next_task = Task.evolve(next_task, cookies=cookies)
                    self.add_task(
                        '_generic_scrape',
                        QueueRequest(
                            scrape_type=request.scrape_type,
                            scraper_start_time=request.scraper_start_time,
                            next_task=next_task,
                            ingest_info=ingest_info_to_send,
                        ))

            if scraped_data is not None and scraped_data.persist:
                if scraped_data.ingest_info:
                    logging.info("Logging at most 4 people (were %d):",
                                 len(scraped_data.ingest_info.people))
                    loop_count = min(len(scraped_data.ingest_info.people),
                                     constants.MAX_PEOPLE_TO_LOG)
                    for i in range(loop_count):
                        logging.info("[%s]",
                                     str(scraped_data.ingest_info.people[i]))
                    logging.info("Last seen time of person being set as: [%s]",
                                 request.scraper_start_time)
                    metadata = IngestMetadata(self.region.region_code,
                                              self.region.jurisdiction_id,
                                              request.scraper_start_time,
                                              self.get_enum_overrides())
                    if self.BATCH_WRITES:
                        logging.info(
                            "Queuing ingest_info ([%d] people) to "
                            "batch_persistence for [%s]",
                            len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        scrape_key = ScrapeKey(self.region.region_code,
                                               request.scrape_type)
                        batch_persistence.write(
                            ingest_info=scraped_data.ingest_info,
                            scrape_key=scrape_key,
                            task=task,
                        )
                    else:
                        logging.info(
                            "Writing ingest_info ([%d] people) to the database"
                            " for [%s]", len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        persistence.write(
                            ingest_utils.convert_ingest_info_to_proto(
                                scraped_data.ingest_info), metadata)
                for sc in scraped_data.single_counts:
                    if not sc.date:
                        scrape_key = ScrapeKey(self.region.region_code,
                                               constants.ScrapeType.BACKGROUND)
                        session = sessions.get_current_session(scrape_key)
                        if session:
                            sc = attr.evolve(sc, date=session.start.date())
                    single_count.store_single_count(
                        sc, self.region.jurisdiction_id)
        except Exception as e:
            if self.BATCH_WRITES:
                scrape_key = ScrapeKey(self.region.region_code,
                                       request.scrape_type)
                batch_persistence.write_error(
                    error=str(e),
                    trace_id=get_trace_id_from_flask(),
                    task=task,
                    scrape_key=scrape_key,
                )
            raise e
Code example #9
File: worker.py  Project: jazzPouls/pulse-data
def work(region):
    """POST request handler to route chunk of scraper work

    Very thin shim to receive a chunk of work from the task queue, and call
    the relevant part of the specified scraper to execute it.

    All scraper work that hits a third-party website goes through this handler
    as small discrete tasks, so that we leverage the taskqueue's throttling and
    retry support for network requests to the sites (and don't DOS them).

    Because scraping will vary so significantly by region, this taskqueue
    handler is very lightweight - it really just accepts the POST for the task,
    and calls the relevant regional scraper to do whatever was asked. This
    allows it to stay agnostic to regional variation.

    Never called manually, so authentication is enforced in app.yaml.

    Form data must be a bytes-encoded JSON object with parameters listed below.

    URL Parameters:
        region: (string) Region code for the scraper in question.
        task: (string) Name of the function to call in the scraper
        params: (dict) Parameter payload to give the function being called
            (optional)

    Returns:
        Response code 200 if successful

        Any other response code will make taskqueue consider the task
        failed, and it will retry the task until it expires or succeeds
        (handling backoff logic, etc.)
    """
    # Verify this was actually a task queued by our app
    if "X-AppEngine-QueueName" not in request.headers:
        logging.error("Couldn't validate task was legit, exiting.")
        return ("", HTTPStatus.INTERNAL_SERVER_ERROR)
    queue_name = request.headers.get("X-AppEngine-QueueName")

    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    task = data["task"]
    params = QueueRequest.from_serializable(data["params"])

    if region != data["region"]:
        raise ValueError(
            "Region specified in task {} does not match region from url {}.".
            format(data["region"], region))

    task_tags = {monitoring.TagKey.STATUS: "COMPLETED"}
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags(
        {monitoring.TagKey.REGION:
         region}), monitoring.measurements(task_tags) as measurements:
        measurements.measure_int_put(m_tasks, 1)
        if not sessions.get_current_session(
                ScrapeKey(region, params.scrape_type)):
            task_tags[monitoring.TagKey.STATUS] = "SKIPPED"
            logging.info(
                "Queue [%s], skipping task [%s] for [%s] because it "
                "is not in the current session.",
                queue_name,
                task,
                region,
            )
            return ("", HTTPStatus.OK)
        logging.info("Queue [%s], processing task [%s] for [%s].", queue_name,
                     task, region)

        scraper = regions.get_region(region).get_ingestor()
        scraper_task = getattr(scraper, task)

        try:
            scraper_task(params)
        except Exception as e:
            task_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(
                type(e).__name__)
            raise RequestProcessingError(region, task, params) from e

        # Respond to the task queue to mark this task as done
        return ("", HTTPStatus.OK)