Example #1
async def submit(
    request: Request,
    client: PublisherClient,
    q: SQLiteAckQueue,
    topic: str,
    metadata_headers: Dict[str, str],
    **kwargs,
) -> response.HTTPResponse:
    """Deliver request to the pubsub topic.

    Deliver to the local queue to be retried on transient errors.
    """
    data = request.body
    attrs = {
        key: value
        for key, value in dict(
            submission_timestamp=datetime.utcnow().isoformat() + "Z",
            uri=request.path,
            protocol="HTTP/" + request.version,
            method=request.method,
            args=request.query_string,
            remote_addr=request.ip,
            host=request.host,
            **{
                attr: request.headers.get(header)
                for header, attr in metadata_headers.items()
            },
        ).items()
        if value is not None
    }
    # validate attribute values against the Pub/Sub size limits
    for value in attrs.values():
        if len(value.encode("utf8")) > 1024:
            # attribute exceeds value size limit of 1024 bytes
            # https://cloud.google.com/pubsub/quotas#resource_limits
            return response.text(
                "header too large\n", HTTP_STATUS.REQUEST_HEADER_FIELDS_TOO_LARGE
            )
    try:
        await client.publish(topic, data, **attrs)
    except ValueError:
        return response.text("payload too large\n", HTTP_STATUS.PAYLOAD_TOO_LARGE)
    except Exception:
        # api call failure, write to queue
        logger.exception("pubsub unavailable")
        try:
            q.put((topic, data, attrs))
        except DatabaseError:
            logger.exception("queue full")
            # sqlite queue is probably out of space
            return response.text("", HTTP_STATUS.INSUFFICIENT_STORAGE)
    return response.text("")
Example #2
def _pool_tasks(interval, times_to_run):
    """
    A while that runs forever and check for new tasks on the cassandra DB,
    every new Task on DB has a 'created' state, so this method looks for all
    the tasks that have this status.
    Args:
        interval: the interval that we should pool cassandra, on every pool;
        times_to_run: used on tests to determine that the threads will not run
        forever. [ONLY FOR TESTING]
    """
    times_run = 0
    tasks_queue = SQLiteAckQueue(QUEUE_LOCATION)
    # this while condition will only be checked on testing, otherwise this loop
    # should run forever.
    while not times_to_run or times_run < times_to_run:
        _logger.debug("Calling pool tasks")
        all_tasks = Task.objects.filter(status=STATUS_CREATED).all()
        _logger.debug("Found %d tasks to process", len(all_tasks))
        for task in all_tasks:
            try:
                crawler_name = task.kind
                if task.options:
                    options = json.loads(task.options)
                    options = options.copy()
                else:
                    options = {}
                options["crawler"] = crawler_name
                options["task_id"] = task.task_id

                # get the fixed options from the settings; they are merged
                # with the options sent in the task table.
                default_settings = settings.DAVINCI_CONF["crawler-params"].get("default", {})
                crawler_settings = settings.DAVINCI_CONF["crawler-params"].get(crawler_name, {})

                _add_default_options(options, crawler_settings)
                _add_default_options(options, default_settings)

                # fixed options: place any option that must always be set here
                options["current_execution_date"] = datetime.utcnow()

                params = json.loads(task.params)

                tasks_queue.put([params, options])
                update_task_status(task, STATUS_QUEUED)
            except Exception:
                update_task_status(task, STATUS_FAULTY, source="crawl command", more_info=traceback.format_exc())
                _logger.exception("Error while adding params to queue")
        time.sleep(interval)
        if times_to_run:
            times_run += 1
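_add_default_options is a project helper whose body is not shown in this example. A hypothetical sketch consistent with how it is called above (task options first, defaults second) would only fill in missing keys:

def _add_default_options(options, default_options):
    # Hypothetical implementation: the real helper lives in davinci_crawling;
    # this sketch only fills keys the task did not already provide.
    for key, value in default_options.items():
        options.setdefault(key, value)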
Example #3
def test_init_app(app: Sanic, client: PublisherClient, mocker: MockFixture,
                  q: SQLiteAckQueue):
    # don't hit actual pubsub
    publish = mocker.patch.object(client.api, "publish")
    publish.return_value = PublishResponse(message_ids=["1"])

    # listener to create test conditions while sanic is running
    @app.listener("after_server_start")
    async def after_server_start(app, _):
        # stop Sanic
        app.stop()
        # queue message to be delivered on shutdown
        q.put(("topic", b"data", {}))
        q.put(("topic", b"data", {}))

    # set required configuration
    app.config.update(FLUSH_CONCURRENT_BYTES=1,
                      FLUSH_CONCURRENT_MESSAGES=1,
                      FLUSH_SLEEP_SECONDS=0)
    # configure sanic listeners to handle q in the background
    flush.init_app(app, client, q)
    # use a socket to bind to a random port and allow parallel testing
    sock = socket()
    sock.bind(("", 0))
    # start the app
    app.run(sock=sock)
    # make sure everything flushed cleanly
    assert q.size == 0
    assert q.unack_count() == 0
    # make sure publish was called the expected number of times
    assert publish.call_count == 2
Example #4
def get_queue(config: dict) -> SQLiteAckQueue:
    """Create a SQLiteAckQueue.

    Use a SQLiteAckQueue because:
    * we use acks to ensure messages only removed on success
    * persist-queue's SQLite*Queue is faster than its Queue
    * SQLite provides thread-safe and process-safe access
    """
    queue_config = {
        key[6:].lower(): value
        for key, value in config.items()
        if key.startswith("QUEUE_")
    }
    return SQLiteAckQueue(**queue_config)
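Usage sketch: any QUEUE_-prefixed setting is stripped and lower-cased into a SQLiteAckQueue keyword argument (QUEUE_PATH becomes path, and so on); the exact keyword names accepted depend on the installed persist-queue version.

q = get_queue({
    "QUEUE_PATH": "local_queue",   # passed as path="local_queue"
    "QUEUE_MULTITHREADING": True,  # passed as multithreading=True
    "OTHER_SETTING": "ignored",    # no QUEUE_ prefix, so it is dropped
})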
Example #5
def init_app(app: Sanic) -> Tuple[PublisherClient, SQLiteAckQueue]:
    """Initialize Sanic app with url rules."""
    # Initialize PubSub client
    timeout = app.config.get("PUBLISH_TIMEOUT_SECONDS", None)
    client = PublisherClient()
    client.api.publish = partial(
        client.api.publish,
        retry=Retry(TRANSIENT_ERRORS, deadline=timeout),
        timeout=timeout,
    )
    client._batch_class = AsyncioBatch
    # Use a SQLiteAckQueue because:
    # * we use acks to ensure messages only removed on success
    # * persist-queue's SQLite*Queue is faster than its Queue
    # * SQLite provides thread-safe and process-safe access
    queue_config = {
        key[6:].lower(): value
        for key, value in app.config.items()
        if key.startswith("QUEUE_")
    }
    q = SQLiteAckQueue(**queue_config)
    # get metadata_headers config
    metadata_headers = app.config["METADATA_HEADERS"]
    # validate attribute keys
    for attribute in metadata_headers.values():
        if len(attribute.encode("utf8")) > 256:
            # https://cloud.google.com/pubsub/quotas#resource_limits
            raise ValueError("Metadata attribute exceeds key size limit of 256 bytes")
    # generate one view_func per topic
    handlers = {
        route.topic: partial(
            submit,
            client=client,
            q=q,
            topic=route.topic,
            metadata_headers=metadata_headers,
        )
        for route in app.config["ROUTE_TABLE"]
    }
    # add routes for ROUTE_TABLE
    for route in app.config["ROUTE_TABLE"]:
        app.add_route(
            handler=handlers[route.topic],
            uri=route.uri,
            methods=[method.upper() for method in route.methods],
            # required because handler.__name__ does not exist
            # must be a unique name for each handler
            name="submit_" + route.topic,
        )
    return client, q
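The configuration shapes init_app expects can be inferred from the code above; the values below are illustrative only (the route, topic, and header names are made up). Each ROUTE_TABLE entry exposes uri, topic, and methods, and METADATA_HEADERS maps HTTP header names to Pub/Sub attribute keys:

from collections import namedtuple

from sanic import Sanic

Route = namedtuple("Route", ["uri", "topic", "methods"])
app = Sanic("example")
app.config.update(
    ROUTE_TABLE=[Route("/submit/telemetry", "telemetry", ["POST"])],
    METADATA_HEADERS={"User-Agent": "user_agent"},
    PUBLISH_TIMEOUT_SECONDS=1,
    QUEUE_PATH=":memory:",
)
client, q = init_app(app)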
Example #6
    def _crawl_params(self):
        """
        Read the multiprocessing queue and call the crawl method to execute the
        crawling logic.

        Will run forever than we need to Ctrl+C to finish this.
        """
        times_run = 0
        tasks_queue = SQLiteAckQueue(QUEUE_LOCATION)
        while True:
            if self.times_to_run and times_run > self.times_to_run:
                return
            task_id = None
            object_queue = None
            try:
                object_queue = tasks_queue.get(block=False)
                crawl_param, options = object_queue
                crawler_name = options.get("crawler")
                task_id = options.get("task_id")
                update_task_status(task_id, STATUS_IN_PROGRESS)
                _logger.debug("Reading a queue value %s", crawl_param)

                if "current_execution_date" not in options:
                    options["current_execution_date"] = datetime.utcnow()

                self._crawl(crawler_name, task_id, crawl_param, options)
                update_task_status(task_id, STATUS_FINISHED)
                tasks_queue.ack(object_queue)
            except Empty:
                # The queue is empty; wait one second and try again.
                _logger.debug("No objects found on queue, waiting for 1 "
                              "second and trying again")
                time.sleep(1)
            except Exception:
                if task_id and object_queue:
                    update_task_status(task_id,
                                       STATUS_FAULTY,
                                       source="crawl consumer",
                                       more_info=traceback.format_exc())
                    tasks_queue.ack_failed(object_queue)

                _logger.exception("Error while crawling params from queue")
            times_run += 1
Example #7
def q() -> SQLiteAckQueue:
    return SQLiteAckQueue(":memory:")
Example #8
import traceback

import logging
import time
from datetime import datetime
from davinci_crawling.management.commands.utils.utils import update_task_status, get_crawler_by_name
from davinci_crawling.task.models import STATUS_IN_PROGRESS, STATUS_FAULTY, STATUS_FINISHED
from persistqueue import SQLiteAckQueue
from persistqueue.exceptions import Empty
from threading import Thread

_logger = logging.getLogger("davinci_crawling.queue")

QUEUE_LOCATION = "tasks_queue"
# create the queue for the first time
SQLiteAckQueue(QUEUE_LOCATION)


class CrawlConsumer(object):
    """
    Initiates a crawl consumer that reads from the tasks queue.

    It processes the parameters in parallel using a configurable number of
    workers.
    """

    consumers = []

    def __init__(self, qty_workers=2, times_to_run=None):
        """
        Args: