コード例 #1
0
 def call_convert(input_file, output_file, **kwargs):
     """Forward to ``run_convert`` unless ``input_file`` contains ".pdf".

     Raises ``ParseError`` when the substring ".pdf" occurs anywhere in
     ``input_file``. Otherwise ``run_convert`` is called with the same
     input/output files and any extra keyword arguments; its result is
     discarded.
     """
     # Guard clause: inputs mentioning ".pdf" are rejected up front.
     if ".pdf" in input_file:
         raise ParseError("Does not compute.")

     run_convert(input_file=input_file, output_file=output_file, **kwargs)
コード例 #2
0
    def get_thumbnail(self, document_path, mime_type):
        """
        Build a PNG thumbnail from ``self.archive_path`` and return its path.

        ``run_convert`` is first called on the archive file with ``[0]``
        appended to its name. If that raises ``ParseError``, Ghostscript
        (``settings.GS_BINARY``) renders the archive file to an intermediate
        PNG in ``self.tempdir`` and ``run_convert`` is run again on that PNG
        with the same options.

        :param document_path: only used in the log message; the image itself
            is produced from ``self.archive_path``.
        :param mime_type: not used by this method.
        :return: path of ``convert.png`` inside ``self.tempdir``.
        :raises ParseError: if Ghostscript exits with a non-zero status, or
            if the second ``run_convert`` call raises it.
        """
        # The message previously read "for{document_path}" (missing space).
        self.log("info",
                 f"[TIKA_THUMB] Generating thumbnail for {document_path}")
        archive_path = self.archive_path

        out_path = os.path.join(self.tempdir, "convert.png")

        def convert(input_file):
            # Single place for the ImageMagick options so the direct run and
            # the Ghostscript fallback cannot drift apart.
            run_convert(
                density=300,
                scale="500x5000>",
                alpha="remove",
                strip=True,
                trim=False,
                input_file=input_file,
                output_file=out_path,
                logging_group=self.logging_group,
            )

        # Run convert to get a decent thumbnail
        try:
            convert("{}[0]".format(archive_path))
        except ParseError:
            # If convert fails, fall back to rendering the archive file to a
            # PNG with Ghostscript.
            # NOTE(review): no page range is passed to gs here - confirm
            # which page ends up in gs_out.png for multi-page input.
            self.log(
                "warning",
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            )
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY,
                "-q",
                "-sDEVICE=pngalpha",
                "-o",
                gs_out_path,
                archive_path,
            ]
            if subprocess.Popen(cmd).wait() != 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            convert(gs_out_path)

        return out_path
コード例 #3
0
    def get_thumbnail(self, document_path, mime_type):
        """
        Produce a PNG thumbnail for ``document_path`` and return its path.

        The thumbnail of a PDF is just a 500px wide image of the first page.
        ``run_convert`` is tried directly on the document (with ``[0]``
        appended to its name). If that raises ``ParseError``, Ghostscript
        renders the document to an intermediate PNG in ``self.tempdir`` and
        ``run_convert`` is applied to that PNG with the same options.

        ``mime_type`` is not used. A ``ParseError`` is raised when
        Ghostscript exits with a non-zero status.
        """
        thumbnail_path = os.path.join(self.tempdir, "convert.png")

        # Options shared by both ImageMagick invocations below.
        convert_options = {
            "density": 300,
            "scale": "500x5000>",
            "alpha": "remove",
            "strip": True,
            "trim": False,
            "auto_orient": True,
        }

        try:
            # First attempt: let convert read the document directly.
            run_convert(**convert_options,
                        input_file="{}[0]".format(document_path),
                        output_file=thumbnail_path,
                        logging_group=self.logging_group)
        except ParseError:
            # convert failed, so render the document to a PNG with
            # Ghostscript and feed that PNG to convert instead.
            self.log(
                'warning',
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
            ghostscript_png = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY,
                "-q",
                "-sDEVICE=pngalpha",
                "-o",
                ghostscript_png,
                document_path,
            ]
            exit_status = subprocess.Popen(cmd).wait()
            if exit_status != 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))

            run_convert(**convert_options,
                        input_file=ghostscript_png,
                        output_file=thumbnail_path,
                        logging_group=self.logging_group)

        return thumbnail_path
コード例 #4
0
ファイル: parsers.py プロジェクト: samotelf/paperless-ng
    def _get_greyscale(self):
        """
        Convert the document to greyscale PNM images and run unpaper on them.

        Greyscale images are easier for Tesseract to OCR. When
        ``settings.OCR_PAGES`` is 1 or more, a page selector is appended to
        the input name so only that many leading pages are converted.

        Returns a sorted list of the ``run_unpaper`` results that refer to
        existing files.
        """
        input_file = self.document_path

        # Restrict the page range only for a positive OCR_PAGES setting.
        if settings.OCR_PAGES == 1:
            input_file += "[0]"
        elif settings.OCR_PAGES > 1:
            input_file += f"[0-{settings.OCR_PAGES - 1}]"

        self.log("debug",
                 f"Converting document {input_file} into greyscale images")

        # One numbered PNM per converted page.
        pnm_pattern = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(density=settings.CONVERT_DENSITY,
                    depth="8",
                    type="grayscale",
                    input_file=input_file,
                    output_file=pnm_pattern,
                    logging_group=self.logging_group)

        # Collect every PNM file now present in the temp directory.
        pnms = [
            os.path.join(self.tempdir, entry)
            for entry in os.listdir(self.tempdir)
            if entry.endswith(".pnm")
        ]

        self.log("debug", f"Running unpaper on {len(pnms)} pages...")

        # unpaper runs on the images in parallel, one task per image.
        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
            cleaned = pool.map(run_unpaper, pnms)

        return sorted(path for path in cleaned if os.path.isfile(path))