Example #1
0
def extract_recap_documents(
    docs: QuerySet,
    skip_ocr: bool = False,
    order_by: Optional[str] = None,
    queue: Optional[str] = None,
) -> None:
    """Queue an extraction task for every eligible RECAPDocument.

    Documents with an empty ``filepath_local`` are always left out. The
    remaining documents are narrowed by their ``ocr_status`` and one
    ``extract_recap_pdf`` task is dispatched per primary key, pausing on the
    celery throttle before each dispatch.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: When True, only documents whose ``ocr_status`` is unset
    are queued; when False, only documents marked ``OCR_NEEDED`` are queued.
    The flag is also forwarded to each extraction task.
    :type skip_ocr: Bool
    :param order_by: Optional processing order by page count: 'small-first'
    (ascending) or 'big-first' (descending). Any other value leaves the
    queryset ordering untouched.
    :type order_by: str
    :param queue: The celery queue the tasks are sent to; also the queue the
    throttle is created for.
    :type queue: str
    """
    docs = docs.exclude(filepath_local="")
    # With skip_ocr we look at items whose OCR need is still unknown;
    # otherwise we only take the ones already flagged as needing OCR.
    target_status = None if skip_ocr else RECAPDocument.OCR_NEEDED
    docs = docs.filter(ocr_status=target_status)

    if order_by == "small-first":
        docs = docs.order_by("page_count")
    elif order_by == "big-first":
        docs = docs.order_by("-page_count")

    total = docs.count()
    limiter = CeleryThrottle(queue_name=queue)
    doc_pks = docs.values_list("pk", flat=True)
    for sent, doc_pk in enumerate(doc_pks, start=1):
        limiter.maybe_wait()
        task_args = (doc_pk, skip_ocr)
        extract_recap_pdf.apply_async(task_args, priority=5, queue=queue)
        # Report on the 1st, 1001st, 2001st, ... dispatched task only.
        if sent % 1000 != 1:
            continue
        progress = f"Sent {sent}/{total} tasks to celery so far."
        logger.info(progress)
        # Carriage return keeps the console progress on a single line.
        sys.stdout.write(f"\r{progress}")
        sys.stdout.flush()
Example #2
0
def extract_recap_documents(docs, skip_ocr=False, order_by=None, queue=None):
    """Send each eligible RECAPDocument to celery for content extraction.

    Documents with an empty ``filepath_local`` are dropped first, then the
    set is narrowed by ``ocr_status``. One ``extract_recap_pdf`` task is
    dispatched per remaining primary key, waiting on the celery throttle
    before each dispatch.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: When True, only documents whose ``ocr_status`` is unset
    are queued; when False, only documents marked ``OCR_NEEDED`` are queued.
    The flag is also passed through to each extraction task.
    :type skip_ocr: Bool
    :param order_by: Optional processing order by page count: 'small-first'
    (ascending) or 'big-first' (descending). Other values leave the ordering
    unchanged.
    :type order_by: str
    :param queue: The celery queue to send the tasks to; the throttle is
    created for the same queue.
    :type queue: str
    """
    with_files = docs.exclude(filepath_local='')
    # skip_ocr: take items whose OCR need is not yet known. Otherwise take
    # only the items already marked as requiring OCR.
    wanted_status = None if skip_ocr else RECAPDocument.OCR_NEEDED
    docs = with_files.filter(ocr_status=wanted_status)

    if order_by == 'small-first':
        docs = docs.order_by('page_count')
    elif order_by == 'big-first':
        docs = docs.order_by('-page_count')

    total = docs.count()
    limiter = CeleryThrottle(queue_name=queue)
    pending_pks = docs.values_list('pk', flat=True)
    for position, doc_pk in enumerate(pending_pks):
        limiter.maybe_wait()
        extract_recap_pdf.apply_async(
            (doc_pk, skip_ocr),
            priority=5,
            queue=queue,
        )
        # Only report progress on every 1000th task, starting with the first.
        if position % 1000:
            continue
        progress = "Sent %s/%s tasks to celery so far." % (position + 1, total)
        logger.info(progress)
        # Leading carriage return rewrites the same console line.
        sys.stdout.write("\r%s" % progress)
        sys.stdout.flush()