async def submit(
    request: Request,
    client: PublisherClient,
    q: SQLiteAckQueue,
    topic: str,
    metadata_headers: Dict[str, str],
    **kwargs,
) -> response.HTTPResponse:
    """Deliver request to the pubsub topic.

    Deliver to the local queue to be retried on transient errors.
    """
    data = request.body
    attrs = {
        key: value
        for key, value in dict(
            submission_timestamp=datetime.utcnow().isoformat() + "Z",
            uri=request.path,
            protocol="HTTP/" + request.version,
            method=request.method,
            args=request.query_string,
            remote_addr=request.ip,
            host=request.host,
            **{
                attr: request.headers.get(header)
                for header, attr in metadata_headers.items()
            },
        ).items()
        if value is not None
    }
    # assert valid pubsub message
    for value in attrs.values():
        if len(value.encode("utf8")) > 1024:
            # attribute exceeds value size limit of 1024 bytes
            # https://cloud.google.com/pubsub/quotas#resource_limits
            return response.text(
                "header too large\n", HTTP_STATUS.REQUEST_HEADER_FIELDS_TOO_LARGE
            )
    try:
        await client.publish(topic, data, **attrs)
    except ValueError:
        return response.text("payload too large\n", HTTP_STATUS.PAYLOAD_TOO_LARGE)
    except Exception:
        # api call failure, write to queue
        logger.exception("pubsub unavailable")
        try:
            q.put((topic, data, attrs))
        except DatabaseError:
            logger.exception("queue full")
            # sqlite queue is probably out of space
            return response.text("", HTTP_STATUS.INSUFFICIENT_STORAGE)
    return response.text("")
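
# The handler above expects its Pub/Sub client, ack queue, topic, and header
# mapping to be bound before the web framework calls it. Below is a minimal
# wiring sketch, assuming Sanic and persist-queue are in use; the app name,
# queue path, topic, route, and header mapping are illustrative placeholders
# rather than the service's real configuration, and `client` is assumed to be
# an async-capable publisher created elsewhere.
from functools import partial

from persistqueue import SQLiteAckQueue
from sanic import Sanic


def bind_submit(app: Sanic, client: PublisherClient) -> None:
    """Register the submit handler with its dependencies partially applied."""
    queue = SQLiteAckQueue("/tmp/edge-queue")  # hypothetical queue location
    app.add_route(
        partial(
            submit,
            client=client,
            q=queue,
            topic="projects/example-project/topics/example-topic",  # placeholder
            metadata_headers={"User-Agent": "user_agent"},  # placeholder mapping
        ),
        "/submit",
        methods=["POST"],
    )
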
def _pool_tasks(interval, times_to_run):
    """A loop that runs forever and checks for new tasks on the Cassandra DB.

    Every new Task on the DB has a 'created' status, so this method looks for
    all the tasks that have this status.

    Args:
        interval: how long to sleep between each poll of Cassandra.
        times_to_run: limits the number of iterations so the threads will not
            run forever. [ONLY FOR TESTING]
    """
    times_run = 0
    tasks_queue = SQLiteAckQueue(QUEUE_LOCATION)
    # this while condition will only be checked on testing, otherwise this loop
    # should run forever.
    while not times_to_run or times_run < times_to_run:
        _logger.debug("Polling for new tasks")
        all_tasks = Task.objects.filter(status=STATUS_CREATED).all()
        _logger.debug("Found %d tasks to process", len(all_tasks))
        for task in all_tasks:
            try:
                crawler_name = task.kind
                if task.options:
                    options = json.loads(task.options)
                else:
                    options = {}
                options["crawler"] = crawler_name
                options["task_id"] = task.task_id

                # get the fixed options from the settings, to be merged with
                # the options sent on the table.
                default_settings = settings.DAVINCI_CONF["crawler-params"].get(
                    "default", {})
                crawler_settings = settings.DAVINCI_CONF["crawler-params"].get(
                    crawler_name, {})

                _add_default_options(options, crawler_settings)
                _add_default_options(options, default_settings)

                # fixed options that are always set for every execution
                options["current_execution_date"] = datetime.utcnow()

                params = json.loads(task.params)

                tasks_queue.put([params, options])
                update_task_status(task, STATUS_QUEUED)
            except Exception:
                update_task_status(
                    task,
                    STATUS_FAULTY,
                    source="crawl command",
                    more_info=traceback.format_exc(),
                )
                _logger.exception("Error while adding params to queue")
        time.sleep(interval)
        if times_to_run:
            times_run += 1
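
# A minimal sketch of starting the poller in a background thread, assuming the
# module-level names used above (_pool_tasks). The 5-second default interval
# and the daemon-thread choice are illustrative, not the project's actual
# startup code.
import threading


def start_task_poller(interval=5):
    """Launch _pool_tasks in a background thread that polls until the process exits."""
    poller = threading.Thread(
        target=_pool_tasks,
        kwargs={"interval": interval, "times_to_run": None},  # None -> loop forever
        daemon=True,
    )
    poller.start()
    return poller
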