def call_convert(input_file, output_file, **kwargs):
    """
    Delegate to ``run_convert`` unless the input name contains ".pdf".

    Any ``input_file`` containing the substring ".pdf" raises ``ParseError``
    without attempting a conversion. Every other input is handed to
    ``run_convert`` together with ``output_file`` and the remaining keyword
    arguments.
    """
    refuses_input = ".pdf" in input_file
    if refuses_input:
        raise ParseError("Does not compute.")

    run_convert(
        input_file=input_file,
        output_file=output_file,
        **kwargs,
    )
def get_thumbnail(self, document_path, mime_type):
    """
    Create a PNG thumbnail from the first page of ``self.archive_path``.

    ImageMagick (via ``run_convert``) is tried first. If that raises
    ``ParseError``, Ghostscript renders the first page to a PNG and
    ``run_convert`` is applied to that image instead.

    ``document_path`` is only used in the log message; the thumbnail itself
    is generated from ``self.archive_path``. ``mime_type`` is not used.

    Returns the path of the generated thumbnail inside ``self.tempdir``.
    Raises ``ParseError`` if Ghostscript exits with a non-zero status.
    """
    self.log("info", f"[TIKA_THUMB] Generating thumbnail for {document_path}")
    archive_path = self.archive_path

    out_path = os.path.join(self.tempdir, "convert.png")

    # Options shared by the direct attempt and the Ghostscript fallback.
    render_options = {
        "density": 300,
        "scale": "500x5000>",
        "alpha": "remove",
        "strip": True,
        "trim": False,
    }

    # Run convert to get a decent thumbnail
    try:
        run_convert(
            **render_options,
            input_file=f"{archive_path}[0]",
            output_file=out_path,
            logging_group=self.logging_group,
        )
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        self.log(
            "warning",
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
        )
        gs_out_path = os.path.join(self.tempdir, "gs_out.png")
        cmd = [
            settings.GS_BINARY,
            "-q",
            "-sDEVICE=pngalpha",
            # Only page 1 is needed for the thumbnail, so do not ask
            # Ghostscript to process the rest of the document.
            "-dFirstPage=1",
            "-dLastPage=1",
            "-o",
            gs_out_path,
            archive_path,
        ]
        if subprocess.run(cmd).returncode != 0:
            raise ParseError(f"Thumbnail (gs) failed at {cmd}")

        # then run convert on the output from gs
        run_convert(
            **render_options,
            input_file=gs_out_path,
            output_file=out_path,
            logging_group=self.logging_group,
        )

    return out_path
def get_thumbnail(self, document_path, mime_type):
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.

    ImageMagick (via ``run_convert``) is tried first on page 0 of
    ``document_path``. If that raises ``ParseError``, Ghostscript renders
    the first page to a PNG and ``run_convert`` is applied to that image
    instead. ``mime_type`` is not used.

    Returns the path of the generated thumbnail inside ``self.tempdir``.
    Raises ``ParseError`` if Ghostscript exits with a non-zero status.
    """
    out_path = os.path.join(self.tempdir, "convert.png")

    # Options shared by the direct attempt and the Ghostscript fallback.
    render_options = {
        "density": 300,
        "scale": "500x5000>",
        "alpha": "remove",
        "strip": True,
        "trim": False,
        "auto_orient": True,
    }

    # Run convert to get a decent thumbnail
    try:
        run_convert(
            **render_options,
            input_file=f"{document_path}[0]",
            output_file=out_path,
            logging_group=self.logging_group,
        )
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        self.log(
            'warning',
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
        )
        gs_out_path = os.path.join(self.tempdir, "gs_out.png")
        cmd = [
            settings.GS_BINARY,
            "-q",
            "-sDEVICE=pngalpha",
            # Only page 1 is needed for the thumbnail, so do not ask
            # Ghostscript to process the rest of the document.
            "-dFirstPage=1",
            "-dLastPage=1",
            "-o",
            gs_out_path,
            document_path,
        ]
        if subprocess.run(cmd).returncode != 0:
            raise ParseError(f"Thumbnail (gs) failed at {cmd}")

        # then run convert on the output from gs
        run_convert(
            **render_options,
            input_file=gs_out_path,
            output_file=out_path,
            logging_group=self.logging_group,
        )

    return out_path
def _get_greyscale(self):
    """
    Produce greyscale page images of the document, cleaned with unpaper.

    Greyscale images are easier for Tesseract to OCR. The document is
    converted into 8-bit greyscale PNM files inside ``self.tempdir``
    (limited to the first ``settings.OCR_PAGES`` pages when that setting
    is 1 or more), then ``run_unpaper`` is mapped over every ``.pnm`` file
    found there using a thread pool.

    Returns the sorted list of ``run_unpaper`` results that exist as files.
    """
    # Convert PDF to multiple PNMs
    source = self.document_path
    if settings.OCR_PAGES == 1:
        source += "[0]"
    elif settings.OCR_PAGES > 1:
        source += f"[0-{settings.OCR_PAGES - 1}]"

    self.log("debug", f"Converting document {source} into greyscale images")

    page_template = os.path.join(self.tempdir, "convert-%04d.pnm")
    run_convert(
        density=settings.CONVERT_DENSITY,
        depth="8",
        type="grayscale",
        input_file=source,
        output_file=page_template,
        logging_group=self.logging_group,
    )

    # Get a list of converted images
    converted = [
        os.path.join(self.tempdir, entry)
        for entry in os.listdir(self.tempdir)
        if entry.endswith(".pnm")
    ]

    self.log("debug", f"Running unpaper on {len(converted)} pages...")

    # Run unpaper in parallel on converted images
    with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
        cleaned = pool.map(run_unpaper, converted)

    # Keep only results that actually exist on disk, in sorted order.
    existing = [path for path in cleaned if os.path.isfile(path)]
    existing.sort()
    return existing