Beispiel #1
0
def run_document_conversion(data, _context=None):
    """Convert a non-PDF upload to PDF and queue it for PDF processing.

    The decoded message must contain ``doc_id``, ``slug`` and ``extension``.
    On success the original upload is deleted and the decoded message is
    re-published unchanged on ``PDF_PROCESS_TOPIC``.

    Raises:
        DocumentExtensionError: the lower-cased, stripped extension is not in
            ``SUPPORTED_DOCUMENT_EXTENSIONS``.
        DocumentSizeError: the original file exceeds ``DOCUMENT_SIZE_LIMIT``;
            ``path.path(doc_id)`` is deleted from storage before raising.
    """
    message = get_pubsub_data(data)
    doc_id, slug, extension = (
        message["doc_id"],
        message["slug"],
        message["extension"],
    )

    logger.info(
        "[DOCUMENT CONVERSION] doc_id %s extension %s", doc_id, extension
    )

    # Only whitelisted extensions may be converted. The check is done on a
    # normalized copy; the path below still uses the extension as received.
    normalized_extension = extension.lower().strip()
    is_supported = normalized_extension in SUPPORTED_DOCUMENT_EXTENSIONS
    if not is_supported:
        raise DocumentExtensionError()

    source_file = path.original_path(doc_id, slug, extension)

    # Oversized originals are rejected and the document's storage is removed.
    is_too_large = storage.size(source_file) > DOCUMENT_SIZE_LIMIT
    if is_too_large:
        storage.delete(path.path(doc_id))
        raise DocumentSizeError()

    # Convert, drop the original, then hand off to the PDF pipeline (the
    # converted file is expected to land at the document's regular path).
    convert(source_file, doc_id, slug)
    storage.delete(source_file)
    publisher.publish(PDF_PROCESS_TOPIC, data=encode_pubsub_data(message))
Beispiel #2
0
def preprocess(data, _context=None):
    """Preprocess the documents in a project for sidekick.

    Decodes the incoming message, reads its ``project_id``, and runs the
    three pipeline steps in order: ``load_documents``, ``process_text`` and
    ``doc_embedding_``. The outcome is reported through
    ``send_sidekick_update`` with status ``"success"`` or ``"error"``.

    Any exception raised by the pipeline is logged with its traceback and
    then reported as an ``"error"`` status; it is not re-raised.
    """

    data = get_pubsub_data(data)
    project_id = data["project_id"]

    logger.info("[SIDEKICK PREPROCESS] project_id: %s", project_id)

    try:
        texts, doc_ids, language = load_documents(project_id)
        tfidf, features, doc_svd = process_text(project_id, texts)
        doc_embedding_(project_id, language, tfidf, features, doc_svd, doc_ids)
    except Exception:  # pylint: disable=broad-except
        # Record the traceback before reporting the failure; otherwise the
        # cause of the error status is lost.
        logger.exception(
            "[SIDEKICK PREPROCESS] project_id: %s failed", project_id
        )
        send_sidekick_update(project_id, {"status": "error"})
    else:
        send_sidekick_update(project_id, {"status": "success"})
Beispiel #3
0
def run_tesseract(data, _context=None):
    """Runs OCR on the images passed in, storing the extracted text.

    The decoded message must contain ``doc_id``, ``slug``,
    ``paths_and_numbers`` (an iterable of ``(page_number, image_path)``
    pairs), ``partial`` and ``force_ocr``. ``access`` and ``ocr_code`` are
    optional and default to ``access_choices.PRIVATE`` and ``"eng"``.

    For every page the OCR text and PDF contents are written to Redis, the
    page is registered as OCR'd, and its page number is queued for text
    position extraction. Queued numbers are published on
    ``TEXT_POSITION_EXTRACT_TOPIC`` in batches of ``TEXT_POSITION_BATCH``.

    Returns:
        ``"Too slow, retrying"`` if CPU profiling is enabled and the measured
        speed exceeds ``SPEED_THRESHOLD`` (the task is re-published on
        ``OCR_TOPIC`` first); ``"Ok"`` if ``paths_and_numbers`` is empty;
        otherwise a JSON string with ``doc_id``, ``elapsed`` (per-page
        seconds), ``status``, ``overall_elapsed`` and, when ``PROFILE_CPU``
        is set, ``speed`` and ``speed_after``.
    """
    # pylint: disable=too-many-locals, too-many-statements
    overall_start = time.time()

    data = get_pubsub_data(data)
    doc_id = data["doc_id"]
    slug = data["slug"]
    access = data.get("access", access_choices.PRIVATE)
    ocr_code = data.get("ocr_code", "eng")
    paths_and_numbers = data["paths_and_numbers"]
    partial = data["partial"]  # Whether it is a partial update (e.g. redaction) or not
    force_ocr = data["force_ocr"]
    # Forced OCR is recorded under a distinct version string
    ocr_version = f"{OCR_VERSION}_force" if force_ocr else OCR_VERSION

    logger.info(
        "[RUN TESSERACT] doc_id %s ocr_code %s ocr_version %s page_numbers %s",
        doc_id,
        ocr_code,
        ocr_version,
        ",".join(str(number[0]) for number in paths_and_numbers),
    )

    result = {}

    if PROFILE_CPU:
        # Perform speed thresholding to prevent running OCR on a slow CPU
        speed = profile_cpu(CPU_DIFFICULTY)
        if speed > SPEED_THRESHOLD:
            # Resubmit the identical task to the queue and bail out
            publisher.publish(
                OCR_TOPIC,
                data=encode_pubsub_data(
                    {
                        "paths_and_numbers": paths_and_numbers,
                        "doc_id": doc_id,
                        "slug": slug,
                        "access": access,
                        "ocr_code": ocr_code,
                        "partial": partial,
                        "force_ocr": force_ocr,
                    }
                ),
            )
            # Use the module logger like every other message in this function
            logger.warning("Too slow (speed: %f)", speed)
            return "Too slow, retrying"

        result["speed"] = speed

    # Keep track of how long OCR takes (useful for profiling)
    elapsed_times = []

    if not paths_and_numbers:
        logger.warning("No paths/numbers")
        return "Ok"

    # Queue up text position extraction tasks
    queue = []

    def flush(queue):
        """Publish all queued page numbers as one task, then empty the queue."""
        if not queue:
            return

        # Trigger text position extraction pipeline
        publisher.publish(
            TEXT_POSITION_EXTRACT_TOPIC,
            encode_pubsub_data(
                {
                    "paths_and_numbers": queue,
                    "doc_id": doc_id,
                    "slug": slug,
                    "access": access,
                    "ocr_code": ocr_code,
                    "partial": partial,
                    "in_memory": True,
                }
            ),
        )

        # The payload was encoded above, so the list can be reused
        queue.clear()

    def check_and_flush(queue):
        """Flush only once a full batch has accumulated."""
        if len(queue) >= TEXT_POSITION_BATCH:
            flush(queue)

    # Loop through all paths and numbers
    for page_number, image_path in paths_and_numbers:

        # NOTE: the OCR'd flag is only logged here; it does not skip the page
        ocrd = utils.page_ocrd(REDIS, doc_id, page_number)
        logger.info(
            "[RUN TESSERACT] doc_id %s page_number %s ocrd %s",
            doc_id,
            page_number,
            ocrd,
        )

        text_path = path.page_text_path(doc_id, slug, page_number)

        # Benchmark OCR speed
        start_time = time.time()
        logger.info(
            "[RUN TESSERACT] doc_id %s page %s start_time %s",
            doc_id,
            page_number,
            start_time,
        )
        text, pdf_contents = ocr_page(doc_id, image_path, text_path, access, ocr_code)

        elapsed_time = time.time() - start_time
        elapsed_times.append(elapsed_time)
        logger.info(
            "[RUN TESSERACT] doc_id %s page %s elapsed_time %s",
            doc_id,
            page_number,
            elapsed_time,
        )

        # Write the output text and pdf to Redis
        utils.write_page_text(REDIS, doc_id, page_number, text, ocr_version, ocr_code)
        utils.write_page_text_pdf(REDIS, doc_id, page_number, pdf_contents)

        # Decrement the texts remaining
        utils.register_page_ocrd(REDIS, doc_id, page_number)

        # Queue text position extraction tasks
        queue.append(page_number)
        check_and_flush(queue)

    # Flush the remaining queue
    flush(queue)

    result["doc_id"] = doc_id
    result["elapsed"] = elapsed_times
    result["status"] = "Ok"
    result["overall_elapsed"] = time.time() - overall_start
    if PROFILE_CPU:
        result["speed_after"] = profile_cpu()
    return json.dumps(result)
Beispiel #4
0
def success_on_third_try(data):
    """Report "Pending", sleep three seconds, then report "Done".

    ``data`` is decoded with ``get_pubsub_data``; the decoded value is what
    gets passed to both ``communicate_data`` calls.
    """
    payload = get_pubsub_data(data)
    communicate_data("Pending", payload)
    # Fixed delay between the two status reports
    time.sleep(3)
    communicate_data("Done", payload)
Beispiel #5
0
def success_on_first_try(data):
    """Report "Pending" and then "Done" back to back, with no delay.

    ``data`` is decoded with ``get_pubsub_data``; the decoded value is what
    gets passed to both ``communicate_data`` calls.
    """
    payload = get_pubsub_data(data)
    for status in ("Pending", "Done"):
        communicate_data(status, payload)
Beispiel #6
0
def timeout_on_second_try(data):
    """Report "Pending", sleep two seconds, then report "Done".

    ``data`` is decoded with ``get_pubsub_data``; the decoded value is what
    gets passed to both ``communicate_data`` calls.
    """
    payload = get_pubsub_data(data)
    communicate_data("Pending", payload)
    # Fixed delay between the two status reports
    time.sleep(2)
    communicate_data("Done", payload)