Example #1
0
def ocr_page_image(doc_path, page_num, lang):
    """
    image = jpg, jpeg, png
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1)
    # resize and eventually convert (png -> jpg)
    resize_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # First quickly generate preview images
    for step in Steps():
        page_url.step = step
        resize_img(page_url, media_root=settings.MEDIA_ROOT)
    # reset page's step
    page_url.step = Step(1)
    # Now OCR each image
    for step in Steps():
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
Example #2
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    doc_path is an mglib.path.DocumentPath instance
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num <= page_count:
        # first quickly generate preview images
        page_url = PagePath(document_path=doc_path,
                            page_num=page_num,
                            step=Step(1),
                            page_count=page_count)

        for step in Steps():
            page_url.step = step
            extract_img(page_url, media_root=settings.MEDIA_ROOT)

    if page_num <= page_count:
        page_url = PagePath(document_path=doc_path,
                            page_num=page_num,
                            step=Step(1),
                            page_count=page_count)
        extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

        for step in Steps():
            page_url.step = step
            if not step.is_thumbnail:
                extract_hocr(page_url,
                             lang=lang,
                             media_root=settings.MEDIA_ROOT)

    return page_url
Example #3
0
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    doc_path is an mglib.path.DocumentPath instance

    On success returns ``mglib.path.PagePath`` instance.
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)

    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num <= page_count:
        # first quickly generate preview images
        page_path = PagePath(document_path=doc_path,
                             page_num=page_num,
                             step=Step(1),
                             page_count=page_count)
        for step in Steps():
            page_path.step = step
            extract_img(page_path, media_root=settings.MEDIA_ROOT)

    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        file_name=doc_path.file_name,
                        **kwargs)

    if page_num <= page_count:
        page_path = PagePath(document_path=doc_path,
                             page_num=page_num,
                             step=Step(1),
                             page_count=page_count)
        extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
        notify_txt_ready(page_path,
                         page_num=page_num,
                         lang=lang,
                         file_name=file_name,
                         **kwargs)

        for step in Steps():
            page_path.step = step
            if not step.is_thumbnail:
                extract_hocr(page_path,
                             lang=lang,
                             media_root=settings.MEDIA_ROOT)
                notify_hocr_ready(
                    page_path,
                    page_num=page_num,
                    lang=lang,
                    # step as integer number
                    step=step.current,
                    file_name=file_name,
                    **kwargs)

    return page_path
Example #4
0
def ocr_page_image(doc_path, page_num, lang, **kwargs):
    """
    image = jpg, jpeg, png

    On success returns ``mglib.path.PagePath`` instance.
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1)
    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        file_name=doc_path.file_name,
                        **kwargs)
    # resize and eventually convert (png -> jpg)
    resize_img(page_path, media_root=settings.MEDIA_ROOT)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(page_path,
                     page_num=page_num,
                     lang=lang,
                     file_name=doc_path.file_name,
                     **kwargs)

    # First quickly generate preview images
    for step in Steps():
        page_path.step = step
        resize_img(page_path, media_root=settings.MEDIA_ROOT)
    # reset page's step
    page_path.step = Step(1)
    # Now OCR each image
    for step in Steps():
        if not step.is_thumbnail:
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=doc_path.file_name,
                **kwargs)

    return page_path
Example #5
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    doc_path is an mglib.path.DocumentPath instance
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    if page_num <= page_count:
        page_url = PagePath(document_path=doc_path,
                            page_num=page_num,
                            step=Step(1),
                            page_count=page_count)
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

        for step in Steps():
            page_url.step = step
            extract_img(page_url, media_root=settings.MEDIA_ROOT)
            # tesseract unterhalt-1.jpg page-1 -l deu hocr
            if not step.is_thumbnail:
                extract_hocr(page_url,
                             lang=lang,
                             media_root=settings.MEDIA_ROOT)

    return page_url