Beispiel #1
0
    def get_text(self):
        """
        Return the text of the document, running OCR when necessary.

        The result is cached in ``self._text``. Unless ``settings.OCR_ALWAYS``
        is set, a document for which ``self._is_ocred()`` is true has its text
        read straight from the PDF. Otherwise the page at index
        ``len(pages) // 2`` is OCRed with ``settings.OCR_LANGUAGE`` and the
        result is used to guess the document language. The guessed language
        is only used for the full OCR run if it is known to ``ISO639``,
        differs from the default language and is reported as available by
        the first pyocr tool.

        Raises ParseError if there are no page images or if an OCRError
        occurs.
        """
        if self._text is not None:
            return self._text

        if not settings.OCR_ALWAYS and self._is_ocred():
            self.log("debug", "Skipping OCR, using Text from PDF")
            self._text = get_text_from_pdf(self.document_path)
            return self._text

        pages = self._get_greyscale()
        if not pages:
            raise ParseError("Empty document, nothing to do.")

        try:
            middle = len(pages) // 2
            self.log(
                "debug",
                f"Attempting language detection on page {middle + 1} "
                f"of {len(pages)}...")

            middle_text = self._ocr([pages[middle]],
                                    settings.OCR_LANGUAGE)[0]
            guess = self._guess_language(middle_text)

            # Every branch except the last one keeps the default language;
            # the branches only differ in what they log.
            use_default = True
            if not guess or guess not in ISO639:
                self.log("warning", "Language detection failed.")
            elif ISO639[guess] == settings.OCR_LANGUAGE:
                self.log(
                    "debug",
                    f"Detected language: {guess} (default language)")
            else:
                guessed_code = ISO639[guess]
                tool = pyocr.get_available_tools()[0]
                if guessed_code in tool.get_available_languages():
                    self.log("debug", f"Detected language: {guess}")
                    use_default = False
                else:
                    self.log(
                        "warning",
                        f"Detected language {guess} is not available "
                        f"on this system.")

            if use_default:
                # The sample page was already OCRed with the default
                # language, so it is handed over together with its index.
                ocr_pages = self._complete_ocr_default_language(
                    pages, middle, middle_text)
            else:
                ocr_pages = self._ocr(pages, ISO639[guess])

            self.log("debug", "OCR completed.")
            self._text = strip_excess_whitespace(" ".join(ocr_pages))
            return self._text

        except OCRError as e:
            raise ParseError(e)
Beispiel #2
0
 def call_convert(input_file, output_file, **kwargs):
     """
     Forward to ``run_convert`` unless the input name contains ".pdf".

     Raises ParseError("Does not compute.") when ".pdf" occurs anywhere in
     ``input_file``; otherwise all arguments are passed on as keywords.
     """
     if ".pdf" not in input_file:
         run_convert(input_file=input_file, output_file=output_file, **kwargs)
         return
     raise ParseError("Does not compute.")
Beispiel #3
0
    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.

        The image is written to ``convert.png`` inside ``self.tempdir`` and
        that path is returned. If the first ``run_convert`` call raises
        ParseError, the document is rendered to PNG with Ghostscript and
        ``run_convert`` is applied to that output instead.

        Raises ParseError if Ghostscript exits with a non-zero status.
        """
        thumbnail_path = os.path.join(self.tempdir, "convert.png")
        # Options shared by the direct attempt and the Ghostscript fallback.
        convert_options = ("-scale", "500x5000", "-alpha", "remove",
                           "-strip", "-trim")

        try:
            # "[0]" selects the first page of the document.
            run_convert(self.CONVERT, *convert_options,
                        f"{self.document_path}[0]", thumbnail_path)
        except ParseError:
            self.log(
                "warning", "Thumbnail generation with ImageMagick failed, "
                "falling back to Ghostscript.")
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                self.GHOSTSCRIPT, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path,
                self.document_path
            ]
            exit_code = subprocess.Popen(cmd).wait()
            if exit_code != 0:
                raise ParseError(f"Thumbnail (gs) failed at {cmd}")
            # Scale the Ghostscript output down to thumbnail size.
            run_convert(self.CONVERT, *convert_options, gs_out_path,
                        thumbnail_path)

        return thumbnail_path
Beispiel #4
0
    def parse(self, document_path, mime_type):
        """
        Parse a document with Tika and create a PDF archive version of it.

        On success this sets ``self.text`` (stripped content, or "" if it
        cannot be read), ``self.date`` (parsed "Creation-Date" metadata, or
        None if it cannot be read) and ``self.archive_path`` (``convert.pdf``
        inside ``self.tempdir``, produced by ``convert_to_pdf``).
        ``mime_type`` is not used.

        Raises ParseError if the Tika request fails with an HTTP error.
        """
        self.log("info",
                 f"[TIKA_PARSE] Sending {document_path} to Tika server")

        try:
            parsed = parser.from_file(document_path)
        except requests.exceptions.HTTPError as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server: {err}"
            ) from err

        # Content and creation date are best-effort: any problem reading them
        # falls back to a default. ``except Exception`` is used instead of a
        # bare ``except`` so that KeyboardInterrupt and SystemExit are not
        # swallowed.
        try:
            content = parsed["content"].strip()
        except Exception:
            content = ""

        try:
            creation_date = dateutil.parser.isoparse(
                parsed["metadata"]["Creation-Date"])
        except Exception:
            creation_date = None

        archive_path = os.path.join(self.tempdir, "convert.pdf")
        convert_to_pdf(self, document_path, archive_path)

        self.archive_path = archive_path
        self.date = creation_date
        self.text = content
Beispiel #5
0
    def get_text(self):
        """
        OCR the greyscale images of the document and return the result.

        Raises ParseError if ``self._get_ocr`` fails with an OCRError.
        """
        greyscale_pages = self._get_greyscale()

        try:
            text = self._get_ocr(greyscale_pages)
        except OCRError as ocr_failure:
            raise ParseError(ocr_failure)
        return text
Beispiel #6
0
def run_command(*args):
    """
    Run a command through the shell and fail if it exits non-zero.

    The positional arguments are joined with single spaces into one command
    line. The child process gets a copy of the current environment, with
    ``MAGICK_MEMORY_LIMIT`` and ``MAGICK_TMPDIR`` set from
    ``settings.CONVERT_MEMORY_LIMIT`` and ``settings.CONVERT_TMPDIR`` when
    those settings are truthy.

    Raises ParseError if the exit status is not 0.
    """
    environment = os.environ.copy()
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            environment[variable] = value

    # NOTE(review): the arguments are not quoted before being handed to the
    # shell, so callers have to quote anything containing spaces themselves.
    command_line = ' '.join(args)
    exit_code = subprocess.Popen(command_line, env=environment,
                                 shell=True).wait()
    if exit_code != 0:
        raise ParseError(f"Convert failed at {args}")
Beispiel #7
0
    def get_thumbnail(self, document_path, mime_type):
        """
        Create a thumbnail from the first page of the archived PDF.

        The thumbnail is generated from ``self.archive_path`` and written to
        ``convert.png`` inside ``self.tempdir``; that path is returned.
        ``document_path`` is only used for logging and ``mime_type`` is not
        used. If ``run_convert`` raises ParseError, the archive is rendered
        to PNG with Ghostscript first and ``run_convert`` is applied to that
        output.

        Raises ParseError if Ghostscript exits with a non-zero status, or if
        the second ``run_convert`` call fails.
        """
        self.log("info",
                 f"[TIKA_THUMB] Generating thumbnail for {document_path}")
        archive_path = self.archive_path

        out_path = os.path.join(self.tempdir, "convert.png")

        # Options shared by the direct attempt and the Ghostscript fallback.
        convert_options = {
            "density": 300,
            "scale": "500x5000>",
            "alpha": "remove",
            "strip": True,
            "trim": False,
            "output_file": out_path,
            "logging_group": self.logging_group,
        }

        # Run convert to get a decent thumbnail
        try:
            # "[0]" selects the first page of the archive.
            run_convert(input_file=f"{archive_path}[0]", **convert_options)
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
            self.log(
                "warning",
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            )
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY,
                "-q",
                "-sDEVICE=pngalpha",
                "-o",
                gs_out_path,
                archive_path,
            ]
            if subprocess.Popen(cmd).wait() != 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            run_convert(input_file=gs_out_path, **convert_options)

        return out_path
Beispiel #8
0
    def get_text(self):
        """
        Return the document text, caching it in ``self.TEXT_CACHE``.

        Unless ``self.OCR_ALWAYS`` is set, a document for which
        ``self._is_ocred()`` is true has its text read from the PDF directly;
        otherwise the greyscale images are passed to ``self._get_ocr``.

        Raises ParseError if ``self._get_ocr`` fails with an OCRError.
        """
        if self.TEXT_CACHE is not None:
            return self.TEXT_CACHE

        if not self.OCR_ALWAYS and self._is_ocred():
            self.log("info", "Skipping OCR, using Text from PDF")
            self.TEXT_CACHE = get_text_from_pdf(self.document_path)
            return self.TEXT_CACHE

        greyscale_pages = self._get_greyscale()

        try:
            self.TEXT_CACHE = self._get_ocr(greyscale_pages)
        except OCRError as ocr_failure:
            raise ParseError(ocr_failure)
        return self.TEXT_CACHE
    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.

        The image is written to ``convert.png`` inside ``self.tempdir`` and
        that path is returned. ``mime_type`` is not used. If ``run_convert``
        raises ParseError, the document is rendered to PNG with Ghostscript
        and ``run_convert`` is applied to that output instead.

        Raises ParseError if Ghostscript exits with a non-zero status.
        """
        thumbnail_path = os.path.join(self.tempdir, "convert.png")

        # Keyword arguments shared by the direct attempt and the fallback.
        shared_options = dict(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            output_file=thumbnail_path,
            logging_group=self.logging_group,
        )

        try:
            # "[0]" selects the first page of the document.
            run_convert(input_file=f"{document_path}[0]", **shared_options)
        except ParseError:
            self.log(
                'warning',
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o",
                gs_out_path, document_path
            ]
            exit_code = subprocess.Popen(cmd).wait()
            if exit_code != 0:
                raise ParseError(f"Thumbnail (gs) failed at {cmd}")
            # Scale the Ghostscript output down to thumbnail size.
            run_convert(input_file=gs_out_path, **shared_options)

        return thumbnail_path
Beispiel #10
0
def convert_to_pdf(self, document_path, pdf_path):
    """
    Convert a document to PDF by posting it to a Gotenberg server.

    The server base URL is read from the ``PAPERLESS_GOTENBERG`` environment
    variable (default ``http://localhost:3000``) and the document is posted
    to its ``/convert/office`` route. The response body is written to
    ``convert.pdf`` inside ``self.tempdir``.

    Raises ParseError if the request fails or the server answers with an
    error status.
    """
    # NOTE(review): the ``pdf_path`` argument is overwritten here, so the
    # output always lands in <tempdir>/convert.pdf regardless of what the
    # caller passed. Kept as-is for backward compatibility.
    pdf_path = os.path.join(self.tempdir, "convert.pdf")
    gotenberg_server = os.getenv("PAPERLESS_GOTENBERG",
                                 "http://localhost:3000")
    url = gotenberg_server + "/convert/office"

    self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}")

    # The upload handle is closed as soon as the request is done.
    with open(document_path, "rb") as document:
        try:
            response = requests.post(url, files={"files": document},
                                     headers={})
            response.raise_for_status()  # ensure we notice bad responses
        except requests.exceptions.RequestException as err:
            # RequestException also covers connection failures and timeouts,
            # not only the HTTPError raised by raise_for_status().
            raise ParseError(
                f"Could not contact gotenberg server at {gotenberg_server}: "
                f"{err}") from err

    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(response.content)
Beispiel #11
0
    def test_parser_error(self, m):
        """
        When the mocked parser always raises ParseError, the migration
        retries, gives up, and clears the archive fields of both documents.
        """
        m.side_effect = ParseError()
        Document = self.apps.get_model("documents", "Document")

        doc1 = make_test_document(Document, "document", "image/png",
                                  simple_png, "document.png", simple_pdf)
        doc2 = make_test_document(Document, "document", "application/pdf",
                                  simple_jpg, "document.jpg", simple_pdf)

        for created in (doc1, doc2):
            self.assertIsNotNone(created.archive_checksum)

        with self.assertLogs() as capture:
            self.performMigration()

        def count_log_lines(fragment):
            # Number of captured log lines containing the given text.
            return sum(1 for line in capture.output if fragment in line)

        self.assertEqual(m.call_count, 6)
        self.assertEqual(
            count_log_lines("Parse error, will try again in 5 seconds"), 4)
        self.assertEqual(
            count_log_lines("Unable to regenerate archive document for ID:"),
            2)

        # Look the model up again after the migration has run.
        Document = self.apps.get_model("documents", "Document")

        doc1 = Document.objects.get(id=doc1.id)
        doc2 = Document.objects.get(id=doc2.id)

        self.assertIsNone(doc1.archive_checksum)
        self.assertIsNone(doc2.archive_checksum)
        self.assertIsNone(doc1.archive_filename)
        self.assertIsNone(doc2.archive_filename)
Beispiel #12
0
    def parse(self, document_path, mime_type, file_name=None):
        """
        Parse a document with the configured Tika server.

        Sets ``self.text`` to the stripped content ("" if Tika reports no
        content), ``self.date`` to the parsed "Creation-Date" metadata when
        it can be read (otherwise a warning is logged and ``self.date`` is
        left untouched), and ``self.archive_path`` to the PDF returned by
        ``self.convert_to_pdf``. ``mime_type`` is not used.

        Raises ParseError if the Tika call fails for any reason.
        """
        self.log("info", f"Sending {document_path} to Tika server")
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT

        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{tika_server}: {err}") from err

        # A None content value would raise AttributeError on .strip()
        # (presumably what Tika returns for documents without extractable
        # text -- confirm). Treat it as empty text instead.
        content = parsed["content"]
        self.text = content.strip() if content else ""

        try:
            self.date = dateutil.parser.isoparse(
                parsed["metadata"]["Creation-Date"])
        except Exception as e:
            self.log(
                "warning", f"Unable to extract date for document "
                f"{document_path}: {e}")

        self.archive_path = self.convert_to_pdf(document_path, file_name)
Beispiel #13
0
    def convert_to_pdf(self, document_path, file_name):
        """
        Convert a document to PDF via the configured Gotenberg server.

        The document is uploaded to the ``/convert/office`` route of
        ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT`` under the name
        ``file_name`` (or the base name of ``document_path`` if that is
        falsy). The response body is written to ``convert.pdf`` inside
        ``self.tempdir`` and that path is returned.

        Raises ParseError if the request fails or returns an error status.
        """
        pdf_path = os.path.join(self.tempdir, "convert.pdf")
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/convert/office"

        self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
        upload_name = file_name or os.path.basename(document_path)

        # The upload handle is closed as soon as the request is done.
        with open(document_path, "rb") as document:
            try:
                response = requests.post(
                    url, files={"files": (upload_name, document)}, headers={})
                response.raise_for_status()  # ensure we notice bad responses
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}"
                ) from err

        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(response.content)

        return pdf_path
Beispiel #14
0
 def get_text(self):
     """
     OCR the images of the document and return the result.

     Any exception raised while fetching the images or running OCR is
     re-raised as ParseError.
     """
     try:
         return self._ocr(self._get_images())
     except Exception as failure:
         raise ParseError(failure)
Beispiel #15
0
    def parse(self, document_path, mime_type, file_name=None):
        """
        Run OCRmyPDF on a document and store the resulting text.

        For PDFs the text of the original file is extracted first; it counts
        as "has text" when it is longer than 50 characters. In
        ``skip_noarchive`` mode such a document is not passed to OCRmyPDF at
        all. Otherwise OCRmyPDF is called with the parameters built by
        ``construct_ocrmypdf_parameters``; on success ``self.archive_path``
        and ``self.text`` are set. If no text is found or OCRmyPDF raises
        InputFileError, a second run with ``safe_fallback=True`` is attempted
        and only its text is used. ``file_name`` is not used.

        Raises ParseError if OCRmyPDF fails with any other exception, or if
        the fallback run fails.
        """
        # This forces tesseract to use one core per page.
        os.environ['OMP_THREAD_LIMIT'] = "1"

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
            original_has_text = text_original and len(text_original) > 50
        else:
            text_original = None
            original_has_text = False

        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return

        # Imported here rather than at module level.
        import ocrmypdf
        from ocrmypdf import InputFileError, EncryptedPdfError

        archive_path = os.path.join(self.tempdir, "archive.pdf")
        sidecar_file = os.path.join(self.tempdir, "sidecar.txt")

        args = self.construct_ocrmypdf_parameters(document_path, mime_type,
                                                  archive_path, sidecar_file)

        try:
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

            self.archive_path = archive_path
            self.text = self.extract_text(sidecar_file, archive_path)

            # Raised on purpose so that the handler below retries the
            # document with the fallback settings.
            if not self.text:
                raise NoTextFoundException(
                    "No text was found in the original document")
        except EncryptedPdfError:
            self.log(
                "warning", "This file is encrypted, OCR is impossible. Using "
                "any text present in the original file.")
            if original_has_text:
                self.text = text_original
        except (NoTextFoundException, InputFileError) as e:
            self.log(
                "warning",
                f"Encountered an error while running OCR: {str(e)}. "
                f"Attempting force OCR to get the text.")

            archive_path_fallback = os.path.join(self.tempdir,
                                                 "archive-fallback.pdf")
            sidecar_file_fallback = os.path.join(self.tempdir,
                                                 "sidecar-fallback.txt")

            # Attempt to run OCR with safe settings.

            args = self.construct_ocrmypdf_parameters(document_path,
                                                      mime_type,
                                                      archive_path_fallback,
                                                      sidecar_file_fallback,
                                                      safe_fallback=True)

            try:
                self.log("debug",
                         f"Fallback: Calling OCRmyPDF with args: {args}")
                ocrmypdf.ocr(**args)

                # Don't return the archived file here, since this file
                # is bigger and blurry due to --force-ocr.

                self.text = self.extract_text(sidecar_file_fallback,
                                              archive_path_fallback)

            except Exception as e:
                # If this fails, we have a serious issue at hand.
                raise ParseError(f"{e.__class__.__name__}: {str(e)}")

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(f"{e.__class__.__name__}: {str(e)}")

        # As a last resort, if we still don't have any text for any reason,
        # try to extract the text from the original document.
        if not self.text:
            if original_has_text:
                self.text = text_original
            else:
                self.log(
                    "warning",
                    f"No text was found in {document_path}, the content will "
                    f"be empty.")
                self.text = ""
Beispiel #16
0
    def construct_ocrmypdf_parameters(self,
                                      input_file,
                                      mime_type,
                                      output_file,
                                      sidecar_file,
                                      safe_fallback=False):
        """
        Build the keyword arguments for ``ocrmypdf.ocr`` from the settings.

        ``safe_fallback`` forces ``force_ocr`` regardless of the configured
        OCR mode and disables the user-supplied extra arguments. For images
        a DPI value is required; it is taken from the image itself, from
        ``settings.OCR_IMAGE_DPI`` or from ``self.calculate_a4_dpi``, in that
        order of preference.

        Returns the argument dictionary. Raises ParseError for an unknown
        OCR mode or for an image without any usable DPI value.
        """
        params = dict(
            input_file=input_file,
            output_file=output_file,
            # need to use threads, since this will be run in daemonized
            # processes by django-q.
            use_threads=True,
            jobs=settings.THREADS_PER_WORKER,
            language=settings.OCR_LANGUAGE,
            output_type=settings.OCR_OUTPUT_TYPE,
            progress_bar=False,
        )

        mode = settings.OCR_MODE
        redo = mode == 'redo'

        if safe_fallback or mode == 'force':
            params['force_ocr'] = True
        elif mode in ('skip', 'skip_noarchive'):
            params['skip_text'] = True
        elif redo:
            params['redo_ocr'] = True
        else:
            raise ParseError(f"Invalid ocr mode: {mode}")

        clean_setting = settings.OCR_CLEAN
        if clean_setting == 'clean':
            params['clean'] = True
        elif clean_setting == 'clean-final':
            # In redo mode 'clean' is used in place of 'clean_final'.
            params['clean' if redo else 'clean_final'] = True

        if settings.OCR_DESKEW and not redo:
            params['deskew'] = True

        if settings.OCR_ROTATE_PAGES:
            params['rotate_pages'] = True
            params['rotate_pages_threshold'] = (
                settings.OCR_ROTATE_PAGES_THRESHOLD)

        page_limit = settings.OCR_PAGES
        if page_limit > 0:
            params['pages'] = f"1-{page_limit}"
        else:
            # sidecar is incompatible with pages
            params['sidecar'] = sidecar_file

        if self.is_image(mime_type):
            dpi = self.get_dpi(input_file)
            a4_dpi = self.calculate_a4_dpi(input_file)
            if dpi:
                self.log("debug",
                         f"Detected DPI for image {input_file}: {dpi}")
            # First usable value wins: image metadata, setting, A4 estimate.
            image_dpi = dpi or settings.OCR_IMAGE_DPI or a4_dpi
            if not image_dpi:
                raise ParseError(
                    f"Cannot produce archive PDF for image {input_file}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")
            params['image_dpi'] = image_dpi

        if settings.OCR_USER_ARGS and not safe_fallback:
            try:
                # User arguments override the values computed above.
                params = {**params, **json.loads(settings.OCR_USER_ARGS)}
            except Exception as e:
                self.log(
                    "warning",
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used. Error: {e}")

        return params
Beispiel #17
0
def run_unpaper(args):
    """
    Run unpaper on a single PNM file.

    ``args`` is a pair ``(unpaper, pnm)``: the executable and the input file.
    The output file name is the input name with ".pnm" replaced by
    ".unpaper.pnm".

    Raises ParseError if the process exits with a non-zero status.
    """
    binary, source = args
    target = source.replace(".pnm", ".unpaper.pnm")
    command_args = (binary, source, target)
    exit_code = subprocess.Popen(command_args).wait()
    if exit_code != 0:
        raise ParseError(f"Unpaper failed at {command_args}")
Beispiel #18
0
    def parse(self, document_path, mime_type):
        """
        Run OCRmyPDF on a document and store the resulting text.

        The text of the original file is extracted first; it counts as "has
        text" when it is longer than 50 characters. In ``skip_noarchive``
        mode such a document is not passed to OCRmyPDF at all. If a skip
        mode is configured but the document has no text, the mode is
        upgraded to ``redo`` for this call. On success ``self.archive_path``
        and ``self.text`` are set. If OCRmyPDF raises InputFileError or
        EncryptedPdfError, the original text is used and no archive path is
        set.

        Raises ParseError for an unknown OCR mode, for an image without any
        usable DPI value, and for OCRmyPDF failures that leave no text.
        """
        mode = settings.OCR_MODE

        text_original = get_text_from_pdf(document_path)
        has_text = text_original and len(text_original) > 50

        if mode == "skip_noarchive" and has_text:
            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return

        if mode in ['skip', 'skip_noarchive'] and not has_text:
            # upgrade to redo, since there appears to be no text in the
            # document. This happens to some weird encrypted documents or
            # documents with failed OCR attempts for which OCRmyPDF will
            # still report that there actually is text in them.
            self.log(
                "debug", "No text was found in the document and skip is "
                "specified. Upgrading OCR mode to redo.")
            mode = "redo"

        archive_path = os.path.join(self.tempdir, "archive.pdf")

        ocr_args = {
            'input_file': document_path,
            'output_file': archive_path,
            'use_threads': True,
            'jobs': settings.THREADS_PER_WORKER,
            'language': settings.OCR_LANGUAGE,
            'output_type': settings.OCR_OUTPUT_TYPE,
            'progress_bar': False,
            'clean': True
        }

        if settings.OCR_PAGES > 0:
            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"

        # Mode selection.

        if mode in ['skip', 'skip_noarchive']:
            ocr_args['skip_text'] = True
        elif mode == 'redo':
            ocr_args['redo_ocr'] = True
        elif mode == 'force':
            ocr_args['force_ocr'] = True
        else:
            raise ParseError(f"Invalid ocr mode: {mode}")

        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
            a4_dpi = self.calculate_a4_dpi(document_path)
            # First usable value wins: image metadata, setting, A4 estimate.
            if dpi:
                self.log("debug",
                         f"Detected DPI for image {document_path}: {dpi}")
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
            elif a4_dpi:
                ocr_args['image_dpi'] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")

        if settings.OCR_USER_ARGS:
            try:
                # User arguments override the values computed above.
                user_args = json.loads(settings.OCR_USER_ARGS)
                ocr_args = {**ocr_args, **user_args}
            except Exception as e:
                self.log(
                    "warning",
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used: {e}")

        # This forces tesseract to use one core per page.
        os.environ['OMP_THREAD_LIMIT'] = "1"

        try:
            self.log("debug", f"Calling OCRmyPDF with {str(ocr_args)}")
            ocrmypdf.ocr(**ocr_args)
            # success! announce results
            self.archive_path = archive_path
            self.text = get_text_from_pdf(archive_path)

        except (InputFileError, EncryptedPdfError) as e:

            self.log(
                "debug", f"Encountered an error: {e}. Trying to use text from "
                f"original.")
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
            self.text = text_original
            # Also, no archived file.
            if not self.text:
                # However, if we don't have anything, fail:
                raise ParseError(e) from e

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(e) from e

        if not self.text:
            # This may happen for files that don't have any text.
            self.log(
                'warning', f"Document {document_path} does not have any text. "
                "This is probably an error or you tried to add an image "
                "without text, or something is wrong with this document.")
            self.text = ""