Exemple #1
0
def quantize(input_file, output_file, quality_min, quality_max):
    input_file = fspath(input_file)
    output_file = fspath(output_file)
    if input_file.endswith('.jpg'):
        with Image.open(input_file) as im, NamedTemporaryFile(
                suffix='.png') as tmp:
            im.save(tmp)
            args = [
                'pngquant',
                '--force',
                '--skip-if-larger',
                '--output',
                output_file,
                '--quality',
                f'{quality_min}-{quality_max}',
                '--',
                tmp.name,
            ]
            run(args)
    else:
        args = [
            'pngquant',
            '--force',
            '--skip-if-larger',
            '--output',
            output_file,
            '--quality',
            f'{quality_min}-{quality_max}',
            '--',
            input_file,
        ]
        run(args)
Exemple #2
0
def get_orientation(input_file: Path, engine_mode: int, timeout: float):
    args_tesseract = tess_base_args(['osd'], engine_mode) + [
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        if (
            b'Too few characters. Skipping this page' in e.output
            or b'Image too large' in e.output
        ):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e
    else:
        osd = {}
        for line in stdout.decode().splitlines():
            line = line.strip()
            parts = line.split(':', maxsplit=2)
            if len(parts) == 2:
                osd[parts[0].strip()] = parts[1].strip()

        angle = int(osd.get('Orientation in degrees', 0))
        oc = OrientationConfidence(
            angle=angle, confidence=float(osd.get('Orientation confidence', 0))
        )
        return oc
Exemple #3
0
def get_languages():
    def lang_error(output):
        msg = ("Tesseract failed to report available languages.\n"
               "Output from Tesseract:\n"
               "-----------\n")
        msg += output
        return msg

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
        output = proc.stdout
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    for line in output.splitlines():
        if line.startswith('Error'):
            raise MissingDependencyError(lang_error(output))
    _header, *rest = output.splitlines()
    return {lang.strip() for lang in rest}
Exemple #4
0
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to render a PDF.

    input_file -- image to analyze
    output_pdf -- file to generate
    output_text -- OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    """

    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    args_tesseract.extend(['-c', 'textonly_pdf=1'])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    prefix = os.path.splitext(output_pdf)[0]  # Tesseract appends suffixes

    # Reminder: test suite tesseract test plugins might break after any changes
    # to the number of order parameters here

    args_tesseract.extend([os.fspath(input_file), os.fspath(prefix), 'pdf', 'txt'])
    args_tesseract.extend(tessconfig)
    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
        if os.path.exists(prefix + '.txt'):
            shutil.move(prefix + '.txt', output_text)
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            use_skip_page(output_pdf, output_text)
            return
        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
Exemple #5
0
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units."""
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = ([
        GS,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        f'-sDEVICE={raster_device}',
        f'-dFirstPage={pageno}',
        f'-dLastPage={pageno}',
        f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
    ] + (['-dFILTERVECTOR'] if filter_vector else []) + [
        '-o',
        '-',
        '-sstdout=%stderr',
        '-dAutoRotatePages=/None',  # Probably has no effect on raster
        '-f',
        fspath(input_file),
    ])

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        raise SubprocessOutputError('Ghostscript rasterizing failed')
    else:
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
def run_rig_args(args, **kwargs):
    # Remove the two arguments that tell ghostscript to create a PDF/A
    # Does not remove the Postscript definition file - not necessary
    # to cause PDF/A creation failure
    new_args = [
        arg for arg in args if not arg.startswith('-dPDFA') and not arg.endswith('.ps')
    ]
    proc = run(new_args, **kwargs)
    return proc
Exemple #7
0
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    prefix = output_hocr.with_suffix('')

    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number of order parameters here
    args_tesseract.extend([fspath(input_file), fspath(prefix), 'hocr', 'txt'])
    args_tesseract.extend(tessconfig)
    try:
        p = run(args_tesseract,
                stdout=PIPE,
                stderr=STDOUT,
                timeout=timeout,
                check=True)
        stdout = p.stdout
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_text, input_file)
            return

        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if prefix.with_suffix('.txt').exists():
            prefix.with_suffix('.txt').replace(output_text)
Exemple #8
0
def convert_group(*, cwd, infiles, out_prefix):
    args = [
        'jbig2',
        '-b',
        out_prefix,
        '-s',  # symbol mode (lossy)
        # '-r', # refinement mode (lossless symbol mode, currently disabled in
        # jbig2)
        '-p',
    ]
    args.extend(infiles)
    proc = run(args, cwd=cwd, stdout=PIPE, stderr=PIPE)
    proc.check_returncode()
    return proc
Exemple #9
0
def has_textonly_pdf(langs=None):
    """Does Tesseract have textonly_pdf capability?

    Available in v4.00.00alpha since January 2017. Best to
    parse the parameter list.
    """
    args_tess = tess_base_args(langs, engine_mode=None) + ['--print-parameters', 'pdf']
    params = ''
    try:
        proc = run(args_tess, check=True, stdout=PIPE, stderr=STDOUT)
        params = proc.stdout
    except CalledProcessError as e:
        raise MissingDependencyError(
            "Could not --print-parameters from tesseract. This can happen if the "
            "TESSDATA_PREFIX environment is not set to a valid tessdata folder. "
        ) from e
    if b'textonly_pdf' in params:
        return True
    return False
Exemple #10
0
def quantize(input_file: Path, output_file: Path, quality_min: int,
             quality_max: int):
    with input_as_png(input_file) as input_stream:
        args = [
            'pngquant',
            '--force',
            '--skip-if-larger',
            '--quality',
            f'{quality_min}-{quality_max}',
            '--',  # pngquant: stop processing arguments
            '-',  # pngquant: stream input and output
        ]
        result = run(args,
                     stdin=input_stream,
                     stdout=PIPE,
                     stderr=PIPE,
                     check=False)

    if result.returncode == 0:
        # input_file could be the same as output_file, so we defer the write
        output_file.write_bytes(result.stdout)
Exemple #11
0
def cached_run(options, run_args, **run_kwargs):
    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])

    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)

    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)

    log.debug(f"Using Tesseract cache {cache_folder}")

    if (cache_folder / 'stderr.bin').exists():
        log.debug("Cache HIT")

        # Replicate stdout/err
        if args.outputbase != 'stdout':
            if not args.configfiles:
                args.configfiles.append('txt')
            for configfile in args.configfiles:
                # cp cache -> output
                tessfile = args.outputbase + '.' + configfile
                shutil.copy(str(cache_folder / configfile) + '.bin', tessfile)
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=(cache_folder / 'stdout.bin').read_bytes(),
            stderr=(cache_folder / 'stderr.bin').read_bytes(),
        )

    log.debug("Cache MISS")

    cache_kwargs = {
        k: v
        for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    assert cache_kwargs['check']
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward

    # Update cache
    (cache_folder / 'stdout.bin').write_bytes(p.stdout)
    (cache_folder / 'stderr.bin').write_bytes(p.stderr)

    if args.outputbase != 'stdout':
        if not args.configfiles:
            args.configfiles.append('txt')

        for configfile in args.configfiles:
            if configfile not in ('hocr', 'pdf', 'txt'):
                continue
            # cp pwd/{outputbase}.{configfile} -> {cache}/{configfile}
            tessfile = args.outputbase + '.' + configfile
            shutil.copy(tessfile, str(cache_folder / configfile) + '.bin')

    manifest = {}
    manifest['tesseract_version'] = TesseractOcrEngine.version().replace(
        '\n', ' ')
    manifest['platform'] = platform.platform()
    manifest['python'] = platform.python_version()
    manifest['argv_slug'] = cache_folder.name
    manifest['sourcefile'] = str(Path(source_file).relative_to(TESTS_ROOT))

    def clean_sys_argv():
        for arg in run_args[1:]:
            yield re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1',
                         arg)

    manifest['args'] = list(clean_sys_argv())
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
        f.flush()
    return p
Exemple #12
0
def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
):
    compression_args = []
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    strategy = 'RGB' if version() >= '9.19' else '/RGB'

    if version() == '9.23':
        # 9.23: new feature JPEG passthrough is broken in some cases, best to
        # disable it always
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = ([
        GS,
        "-dQUIET",
        "-dBATCH",
        "-dNOPAUSE",
        "-dSAFER",
        "-dCompatibilityLevel=" + str(pdf_version),
        "-sDEVICE=pdfwrite",
        "-dAutoRotatePages=/None",
        "-sColorConversionStrategy=" + strategy,
    ] + compression_args + [
        "-dJPEGQ=95",
        "-dPDFA=" + pdfa_part,
        "-dPDFACompatibilityPolicy=1",
        "-o",
        "-",
        "-sstdout=%stderr",
    ])
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs
    try:
        with Path(output_file).open('wb') as output:
            p = run(args_gs, stdout=output, stderr=PIPE, check=True)
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr.decode(errors='replace'))
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed')
    else:
        stderr = p.stderr.decode('utf-8', errors='replace')
        if _gs_error_reported(stderr):
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part != last_part:
                    if repcount > 1:
                        log.error(
                            f"(previous error message repeated {repcount} times)"
                        )
                        repcount = 0
                    log.error(part)
                else:
                    repcount += 1
                last_part = part
        elif 'overprint mode not set' in stderr:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug("Ghostscript had to remove PDF 'overprinting' from the "
                      "input file to complete PDF/A conversion. ")
Exemple #13
0
def run_append_stderr(*args, **kwargs):
    proc = run(*args, **kwargs)
    proc.stderr = b'\n'.join([proc.stderr, elision_warning.encode('utf-8')])
    return proc
Exemple #14
0
def convert_single(*, cwd, infile, outfile):
    args = ['jbig2', '-p', infile]
    with open(outfile, 'wb') as fstdout:
        proc = run(args, cwd=cwd, stdout=fstdout, stderr=PIPE)
    proc.check_returncode()
    return proc