Example #1
0
def do_document_ocr(queue_document):
    """
    Extract the text of every page of the queued document.

    Try first to extract text from each page using the registered
    parser; if the parser fails or if there is no parser registered for
    the document mimetype, do a visual OCR by calling tesseract and store
    the cleaned up result as the page content.

    The intermediate files of the visual OCR are removed whether it
    succeeds or fails at any step.
    """
    for document_page in queue_document.document.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR.  Only the transformations are
            # needed here, the accompanying warnings are not used.
            ocr_transformations, _warnings = (
                queue_document.get_transformation_list())

            document_filepath = document_page.document.get_image_cache_name(
                page=document_page.page_number)
            unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (
                document_page.document.uuid, document_page.page_number,
                os.extsep, UNPAPER_FILE_FORMAT)
            unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY,
                                                   unpaper_output_filename)

            # Each path is registered right before/after the step that may
            # create it, so a failure half way through still cleans up
            # everything produced so far.
            temporary_files = [document_filepath]
            try:
                unpaper_input = convert(document_filepath,
                                        file_format=UNPAPER_FILE_FORMAT,
                                        transformations=ocr_transformations)
                temporary_files.append(unpaper_input)

                # Registered before the call: the output may have been
                # partially written when unpaper fails
                temporary_files.append(unpaper_output_filepath)
                execute_unpaper(input_filepath=unpaper_input,
                                output_filepath=unpaper_output_filepath)

                # Convert to the file format that is handed to tesseract
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output_filepath,
                    file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_files.append(pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                temporary_files.append(pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = run_tesseract(pre_ocr_filepath_w_ext,
                                         TESSERACT_LANGUAGE)

                document_page.content = ocr_cleanup(ocr_text)
                document_page.page_label = _(u'Text from OCR')
                document_page.save()
            finally:
                # Some of these paths may not exist (a step failed before
                # creating the file, or the file was renamed), so only
                # existing files are handed to cleanup()
                for temporary_file in temporary_files:
                    if os.path.exists(temporary_file):
                        cleanup(temporary_file)
Example #2
0
def do_document_ocr(queue_document):
    """
    Extract the text of every page of the queued document.

    Try first to extract text from each page using the registered
    parser; if the parser fails or if there is no parser registered for
    the document mimetype, do a visual OCR by calling tesseract and store
    the cleaned up result as the page content.

    The intermediate files of the visual OCR are removed whether it
    succeeds or fails at any step.
    """
    for document_page in queue_document.document.documentpage_set.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR.  Only the transformations are
            # needed here, the accompanying warnings are not used.
            ocr_transformations, _warnings = queue_document.get_transformation_list()

            document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number)
            unpaper_output_filename = u"%s_unpaper_out_page_%s%s%s" % (
                document_page.document.uuid,
                document_page.page_number,
                os.extsep,
                UNPAPER_FILE_FORMAT,
            )
            unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)

            # Each path is registered right before/after the step that may
            # create it, so a failure half way through still cleans up
            # everything produced so far.
            temporary_files = [document_filepath]
            try:
                unpaper_input = convert(
                    document_filepath, file_format=UNPAPER_FILE_FORMAT, transformations=ocr_transformations
                )
                temporary_files.append(unpaper_input)

                # Registered before the call: the output may have been
                # partially written when unpaper fails
                temporary_files.append(unpaper_output_filepath)
                execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)

                # Convert to the file format that is handed to tesseract
                pre_ocr_filepath = convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_files.append(pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                temporary_files.append(pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)

                document_page.content = ocr_cleanup(ocr_text)
                document_page.page_label = _(u"Text from OCR")
                document_page.save()
            finally:
                # Some of these paths may not exist (a step failed before
                # creating the file, or the file was renamed), so only
                # existing files are handed to cleanup()
                for temporary_file in temporary_files:
                    if os.path.exists(temporary_file):
                        cleanup(temporary_file)
Example #3
0
    def setUp(self):
        """
        Create a document type and a document, store the sample PDF as a
        new version of that document and run text extraction on the first
        page of the latest version.
        """
        # Imported here, as in the original fixture, rather than at module
        # level
        from ocr.parsers import parse_document_page

        self.document_type = DocumentType(name='test doc type')
        self.document_type.save()

        self.document = Document(
            document_type=self.document_type,
            description='description',
        )
        self.document.save()

        sample_path = os.path.join(
            settings.PROJECT_ROOT, 'contrib', 'mayan_11_1.pdf')
        # A PDF is binary data, so it is opened in binary mode; the context
        # manager closes the handle even if creating the version fails
        with open(sample_path, 'rb') as file_object:
            self.document.new_version(
                file=File(file_object, name='mayan_11_1.pdf'))

        # Text extraction on the first page only
        parse_document_page(self.document.latest_version.pages.all()[0])
Example #4
0
    def setUp(self):
        """
        Create a document type and a document, store the test document as
        a new version of that document and run text extraction on the
        first page of the latest version.
        """
        # Imported here, as in the original fixture, rather than at module
        # level
        from ocr.parsers import parse_document_page

        self.document_type = DocumentType(name='test doc type')
        self.document_type.save()

        self.document = Document(
            document_type=self.document_type,
            description='description',
        )
        self.document.save()

        # The test document is stored under a .pdf name, i.e. binary data:
        # open it in binary mode so its bytes reach the storage untouched
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document.new_version(
                file=File(file_object, name='mayan_11_1.pdf'))

        # Text extraction on the first page only
        parse_document_page(self.document.latest_version.pages.all()[0])