Example #1
import json

# `QueueRequest` is provided by the surrounding project; it is expected to
# offer `to_serializable()` and `from_serializable()` methods.


def add_task(queue, _self, task_name, request):
    """Overwritten version of `add_task` which adds the task to an in-memory
    queue.
    """

    # Serialize and deserialize the request, simply to replicate production
    # behavior and catch any potential serialization issues early.
    serialized = json.dumps(request.to_serializable())
    request = QueueRequest.from_serializable(json.loads(serialized))

    # Add the (task_name, request) pair to the in-memory queue.
    queue.append((task_name, request))
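The extra leading `queue` argument and the unused `_self` suggest this override is bound in place of a method, e.g. via `functools.partial`. Below is a minimal, self-contained sketch of exercising it; the `QueueRequest` stand-in is hypothetical and only mimics the serialization interface the override relies on.

import json
from functools import partial


class QueueRequest:
    """Hypothetical stand-in for the project's QueueRequest; only the
    serialization interface used by `add_task` is reproduced here."""

    def __init__(self, scrape_type):
        self.scrape_type = scrape_type

    def to_serializable(self):
        return {"scrape_type": self.scrape_type}

    @classmethod
    def from_serializable(cls, data):
        return cls(**data)


queue = []
# Bind the in-memory queue; `None` stands in for the instance (`_self`).
in_memory_add_task = partial(add_task, queue, None)
in_memory_add_task("scrape", QueueRequest("background"))

task_name, request = queue[0]
assert task_name == "scrape"
assert request.scrape_type == "background"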
Example #2
import json
import logging
from http import HTTPStatus

# Assumed: the `request` object used below matches Flask's API
# (`request.headers`, `request.get_data(as_text=True)`).
from flask import request

# The remaining names (QueueRequest, RequestProcessingError, ScrapeKey,
# m_tasks, monitoring, regions, sessions) are provided by the surrounding
# project.


def work(region):
    """POST request handler to route a chunk of scraper work.

    Very thin shim to receive a chunk of work from the task queue, and call
    the relevant part of the specified scraper to execute it.

    All scraper work that hits a third-party website goes through this handler
    as small discrete tasks, so that we leverage the taskqueue's throttling and
    retry support for network requests to the sites (and don't DoS them).

    Because scraping varies so significantly by region, this taskqueue handler
    is very lightweight: it really just accepts the POST for the task and
    calls the relevant regional scraper to do whatever was asked. This allows
    it to stay agnostic to regional variation.

    Never called manually, so authentication is enforced in app.yaml.

    Form data must be a bytes-encoded JSON object with the parameters listed
    below; `region` itself comes from the URL path.

    Params:
        region: (string) Region code for the scraper in question.
        task: (string) Name of the function to call in the scraper.
        params: (dict) Parameter payload to give the function being called
            (optional).
    Returns:
        Response code 200 if successful

        Any other response code will make taskqueue consider the task
        failed, and it will retry the task until it expires or succeeds
        (handling backoff logic, etc.)
    """
    # Verify this was actually a task queued by our app
    if "X-AppEngine-QueueName" not in request.headers:
        logging.error("Couldn't validate task was legit, exiting.")
        return ("", HTTPStatus.INTERNAL_SERVER_ERROR)
    queue_name = request.headers.get("X-AppEngine-QueueName")

    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    task = data["task"]
    params = QueueRequest.from_serializable(data["params"])

    if region != data["region"]:
        raise ValueError(
            "Region specified in task {} does not match region from url {}.".
            format(data["region"], region))

    task_tags = {monitoring.TagKey.STATUS: "COMPLETED"}
    # Note: `measurements` must come second so that it picks up the region tag.
    with monitoring.push_tags({monitoring.TagKey.REGION: region}), \
            monitoring.measurements(task_tags) as measurements:
        measurements.measure_int_put(m_tasks, 1)
        if not sessions.get_current_session(
                ScrapeKey(region, params.scrape_type)):
            task_tags[monitoring.TagKey.STATUS] = "SKIPPED"
            logging.info(
                "Queue [%s], skipping task [%s] for [%s] because it "
                "is not in the current session.",
                queue_name,
                task,
                region,
            )
            return ("", HTTPStatus.OK)
        logging.info("Queue [%s], processing task [%s] for [%s].", queue_name,
                     task, region)

        scraper = regions.get_region(region).get_ingestor()
        scraper_task = getattr(scraper, task)

        try:
            scraper_task(params)
        except Exception as e:
            task_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(
                type(e).__name__)
            raise RequestProcessingError(region, task, params) from e

        # Respond to the task queue to mark this task as done
        return ("", HTTPStatus.OK)