Code Example #1
File: main.py  Project: MuckRock/documentcloud
def process_doc(request, _context=None):
    """Central command to run processing on a doc"""
    data = get_http_data(request)
    doc_id = data["doc_id"]
    job_type = data["method"]
    extension = data.get("extension", "pdf").lower()

    # Initialize the processing environment
    utils.initialize(REDIS, doc_id)

    # Launch PDF processing via pubsub
    if job_type == "process_pdf":
        if extension == "pdf":
            publisher.publish(PDF_PROCESS_TOPIC, data=encode_pubsub_data(data))
        else:
            # Non-PDF files require conversion first
            publisher.publish(DOCUMENT_CONVERT_TOPIC,
                              data=encode_pubsub_data(data))
    elif job_type == "redact_doc":
        publisher.publish(REDACT_TOPIC, data=encode_pubsub_data(data))
    elif job_type == "modify_doc":
        publisher.publish(MODIFY_TOPIC, data=encode_pubsub_data(data))
    elif job_type == "cancel_doc_processing":
        utils.clean_up(REDIS, doc_id)
    else:
        logger.error("Invalid doc processing type: %s",
                     job_type,
                     exc_info=sys.exc_info())
        return "Error"

    return encode_response("Ok")
Code Example #2
File: main.py  Project: MuckRock/documentcloud
def run_document_conversion(data, _context=None):
    """Converts document passed in to PDF and triggers PDF extraction."""
    data = get_pubsub_data(data)
    doc_id = data["doc_id"]
    slug = data["slug"]
    extension = data["extension"]

    logger.info("[DOCUMENT CONVERSION] doc_id %s extension %s", doc_id,
                extension)

    # Ensure whitelisted file extension
    if extension.lower().strip() not in SUPPORTED_DOCUMENT_EXTENSIONS:
        raise DocumentExtensionError()

    input_file = path.original_path(doc_id, slug, extension)

    # Ensure non-PDF document size is within the limit
    if storage.size(input_file) > DOCUMENT_SIZE_LIMIT:
        # Document is too large: clean up its stored files before raising
        storage.delete(path.path(doc_id))
        raise DocumentSizeError()

    # Run conversion
    convert(input_file, doc_id, slug)

    # Delete the original file
    storage.delete(input_file)

    # Trigger PDF processing (output file should be expected doc path)
    publisher.publish(PDF_PROCESS_TOPIC, data=encode_pubsub_data(data))
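SUPPORTED_DOCUMENT_EXTENSIONS and DOCUMENT_SIZE_LIMIT are module-level configuration not shown in this snippet. Hypothetical values, purely for illustration; the project's real whitelist and limit may differ:

# Assumption: plausible configuration for the guards in run_document_conversion
SUPPORTED_DOCUMENT_EXTENSIONS = {"doc", "docx", "odt", "rtf", "txt"}
DOCUMENT_SIZE_LIMIT = 25 * 1024 * 1024  # 25 MB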
Code Example #3
def trigger_processing():
    """Triggers PDF processing via pubsub."""
    publisher.publish(
        PDF_PROCESS_TOPIC,
        encode_pubsub_data({
            "doc_id": ID,
            "slug": SLUG,
            "access": Access.private
        }),
    )
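Access is imported from elsewhere; example #6 below reads the same field via access_choices.PRIVATE. A plausible sketch, assuming it is a simple enum of document visibility levels (hypothetical, not the project's definition):

from enum import Enum

class Access(Enum):
    # Assumption: visibility levels attached to processing jobs
    private = "private"
    organization = "organization"
    public = "public"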
Code Example #4
def trigger_redacting(page_numbers):
    """Triggers redaction processing via pubsub."""
    publisher.publish(
        REDACT_TOPIC,
        encode_pubsub_data({
            "doc_id": ID,
            "slug": SLUG,
            "access": Access.private,
            "redactions": [
                {"page_number": page_number} for page_number in page_numbers
            ],
        }),
    )
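For instance, calling the helper with a list of pages produces one redaction entry per page in the published payload (illustrative call, assuming ID and SLUG are defined as in example #3):

trigger_redacting([1, 3])
# publishes: {"doc_id": ID, "slug": SLUG, "access": Access.private,
#             "redactions": [{"page_number": 1}, {"page_number": 3}]}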
Code Example #5
    def flush(queue):
        if not queue:
            return

        # Trigger text position extraction pipeline
        publisher.publish(
            TEXT_POSITION_EXTRACT_TOPIC,
            encode_pubsub_data(
                {
                    "paths_and_numbers": queue,
                    "doc_id": doc_id,
                    "slug": slug,
                    "access": access,
                    "ocr_code": ocr_code,
                    "partial": partial,
                    "in_memory": True,
                }
            ),
        )

        queue.clear()
Code Example #6
def run_tesseract(data, _context=None):
    """Runs OCR on the images passed in, storing the extracted text.
    """
    # pylint: disable=too-many-locals, too-many-statements
    overall_start = time.time()

    data = get_pubsub_data(data)
    doc_id = data["doc_id"]
    slug = data["slug"]
    access = data.get("access", access_choices.PRIVATE)
    ocr_code = data.get("ocr_code", "eng")
    paths_and_numbers = data["paths_and_numbers"]
    partial = data["partial"]  # Whether it is a partial update (e.g. redaction) or not
    force_ocr = data["force_ocr"]
    if force_ocr:
        ocr_version = f"{OCR_VERSION}_force"
    else:
        ocr_version = OCR_VERSION

    logger.info(
        "[RUN TESSERACT] doc_id %s ocr_code %s ocr_version %s page_numbers %s",
        doc_id,
        ocr_code,
        ocr_version,
        ",".join([str(number[0]) for number in paths_and_numbers]),
    )

    result = {}

    if PROFILE_CPU:
        # Perform speed thresholding to prevent running OCR on a slow CPU
        speed = profile_cpu(CPU_DIFFICULTY)
        if speed > SPEED_THRESHOLD:
            # Resubmit to queue
            publisher.publish(
                OCR_TOPIC,
                data=encode_pubsub_data(
                    {
                        "paths_and_numbers": paths_and_numbers,
                        "doc_id": doc_id,
                        "slug": slug,
                        "access": access,
                        "ocr_code": ocr_code,
                        "partial": partial,
                        "force_ocr": force_ocr,
                    }
                ),
            )
            logging.warning("Too slow (speed: %f)", speed)
            return "Too slow, retrying"

        result["speed"] = speed

    # Keep track of how long OCR takes (useful for profiling)
    elapsed_times = []

    if not paths_and_numbers:
        logging.warning("No paths/numbers")
        return "Ok"

    # Queue up text position extraction tasks
    queue = []

    def flush(queue):
        if not queue:
            return

        # Trigger text position extraction pipeline
        publisher.publish(
            TEXT_POSITION_EXTRACT_TOPIC,
            encode_pubsub_data(
                {
                    "paths_and_numbers": queue,
                    "doc_id": doc_id,
                    "slug": slug,
                    "access": access,
                    "ocr_code": ocr_code,
                    "partial": partial,
                    "in_memory": True,
                }
            ),
        )

        queue.clear()

    def check_and_flush(queue):
        if len(queue) >= TEXT_POSITION_BATCH:
            flush(queue)

    # Loop through all paths and numbers
    for page_number, image_path in paths_and_numbers:

        ocrd = utils.page_ocrd(REDIS, doc_id, page_number)
        logger.info(
            "[RUN TESSERACT] doc_id %s page_number %s ocrd %s",
            doc_id,
            page_number,
            ocrd,
        )

        text_path = path.page_text_path(doc_id, slug, page_number)

        # Benchmark OCR speed
        start_time = time.time()
        logger.info(
            "[RUN TESSERACT] doc_id %s page %s start_time %s",
            doc_id,
            page_number,
            start_time,
        )
        text, pdf_contents = ocr_page(doc_id, image_path, text_path, access, ocr_code)

        elapsed_time = time.time() - start_time
        elapsed_times.append(elapsed_time)
        logger.info(
            "[RUN TESSERACT] doc_id %s page %s elapsed_time %s",
            doc_id,
            page_number,
            elapsed_time,
        )

        # Write the output text and pdf to Redis
        utils.write_page_text(REDIS, doc_id, page_number, text, ocr_version, ocr_code)
        utils.write_page_text_pdf(REDIS, doc_id, page_number, pdf_contents)

        # Decrement the texts remaining
        utils.register_page_ocrd(REDIS, doc_id, page_number)

        # Queue text position extraction tasks
        queue.append(page_number)
        check_and_flush(queue)

    # Flush the remaining queue
    flush(queue)

    result["doc_id"] = doc_id
    result["elapsed"] = elapsed_times
    result["status"] = "Ok"
    result["overall_elapsed"] = time.time() - overall_start
    if PROFILE_CPU:
        result["speed_after"] = profile_cpu()
    return json.dumps(result)
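profile_cpu, CPU_DIFFICULTY, and SPEED_THRESHOLD come from outside this snippet. One way such a profiler could work is to time a fixed CPU-bound workload, so a larger result means a slower machine; the sketch below is a hypothetical stand-in, not the project's implementation:

import time

def profile_cpu(difficulty=100000):
    # Hypothetical: time a fixed busy loop; run_tesseract compares the
    # result against SPEED_THRESHOLD before committing to OCR work
    start = time.time()
    total = 0
    for i in range(difficulty):
        total += i * i
    return time.time() - start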
Code Example #7
File: main.py  Project: MuckRock/documentcloud
def sidekick(request, _context=None):
    """Kick off sidekick processing lambda"""
    data = get_http_data(request)
    publisher.publish(SIDEKICK_PREPROCESS_TOPIC, data=encode_pubsub_data(data))
    return encode_response("Ok")
Code Example #8
File: main.py  Project: MuckRock/documentcloud
def import_documents(request, _context=None):
    """Command to start the import process on an organization"""
    data = get_http_data(request)
    publisher.publish(START_IMPORT_TOPIC, data=encode_pubsub_data(data))
Code Example #9
def encode(data):
    """Encodes data in a format expected by pubsub functions invoked directly"""
    return encode_published_pubsub_data(encode_pubsub_data(data))
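Per its docstring, this double encoding reproduces the envelope a real pubsub trigger would deliver, which lets the handlers above be invoked directly, e.g. in tests. A hypothetical direct call (the payload values are illustrative):

run_document_conversion(
    encode({"doc_id": 123, "slug": "example-doc", "extension": "docx"})
)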