コード例 #1
0
ファイル: tesseract.py プロジェクト: ajaykumarbharaj/OCRmyPDF
def get_orientation(input_file: Path, engine_mode: Optional[int], timeout: float):
    args_tesseract = tess_base_args(['osd'], engine_mode) + [
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        if (
            b'Too few characters. Skipping this page' in e.output
            or b'Image too large' in e.output
        ):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e
    else:
        osd = {}
        for line in stdout.decode().splitlines():
            line = line.strip()
            parts = line.split(':', maxsplit=2)
            if len(parts) == 2:
                osd[parts[0].strip()] = parts[1].strip()

        angle = int(osd.get('Orientation in degrees', 0))
        oc = OrientationConfidence(
            angle=angle, confidence=float(osd.get('Orientation confidence', 0))
        )
        return oc
コード例 #2
0
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to render a PDF.

    input_file -- image to analyze
    output_pdf -- file to generate
    output_text -- OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    """

    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    args_tesseract.extend(['-c', 'textonly_pdf=1'])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    prefix = os.path.splitext(output_pdf)[0]  # Tesseract appends suffixes

    # Reminder: test suite tesseract test plugins might break after any changes
    # to the number of order parameters here

    args_tesseract.extend([os.fspath(input_file), os.fspath(prefix), 'pdf', 'txt'])
    args_tesseract.extend(tessconfig)
    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
        if os.path.exists(prefix + '.txt'):
            shutil.move(prefix + '.txt', output_text)
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            use_skip_page(output_pdf, output_text)
            return
        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
コード例 #3
0
ファイル: unpaper.py プロジェクト: ttyhu/OCRmyPDF
def run(input_file, output_file, dpi, mode_args):
    args_unpaper = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args

    with TemporaryDirectory() as tmpdir:
        input_pnm, output_pnm = _setup_unpaper_io(Path(tmpdir), input_file)

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.extend([os.fspath(input_pnm), os.fspath(output_pnm)])
        external_run(
            args_unpaper,
            close_fds=True,
            check=True,
            stderr=STDOUT,  # unpaper writes logging output to stdout and stderr
            stdout=PIPE,  # and cannot send file output to stdout
            cwd=tmpdir,
            logs_errors_to_stdout=True,
        )
        try:
            with Image.open(output_pnm) as imout:
                imout.save(output_file, dpi=(dpi, dpi))
        except (FileNotFoundError, OSError):
            raise SubprocessOutputError(
                "unpaper: failed to produce the expected output file. " +
                " Called with: " + str(args_unpaper)) from None
コード例 #4
0
ファイル: ghostscript.py プロジェクト: ttyhu/OCRmyPDF
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units."""
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = ([
        GS,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        f'-sDEVICE={raster_device}',
        f'-dFirstPage={pageno}',
        f'-dLastPage={pageno}',
        f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
    ] + (['-dFILTERVECTOR'] if filter_vector else []) + [
        '-o',
        '-',
        '-sstdout=%stderr',
        '-dAutoRotatePages=/None',  # Probably has no effect on raster
        '-f',
        fspath(input_file),
    ])

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        raise SubprocessOutputError('Ghostscript rasterizing failed')
    else:
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
コード例 #5
0
ファイル: unpaper.py プロジェクト: riku7032/OCRmyPDF
def run(input_file, output_file, dpi, mode_args):
    args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + mode_args

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    with TemporaryDirectory() as tmpdir, Image.open(input_file) as im:
        if im.mode not in SUFFIXES.keys():
            log.info("Converting image to other colorspace")
            try:
                if im.mode == 'P' and len(im.getcolors()) == 2:
                    im = im.convert(mode='1')
                else:
                    im = im.convert(mode='RGB')
            except IOError as e:
                im.close()
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode) from e

        try:
            suffix = SUFFIXES[im.mode]
        except KeyError:
            raise MissingDependencyError(
                "Failed to convert image to a supported format.") from e

        input_pnm = Path(tmpdir) / f'input{suffix}'
        output_pnm = Path(tmpdir) / f'output{suffix}'
        im.save(input_pnm, format='PPM')

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.extend([os.fspath(input_pnm), os.fspath(output_pnm)])
        try:
            proc = external_run(
                args_unpaper,
                check=True,
                close_fds=True,
                universal_newlines=True,
                stderr=STDOUT,
                cwd=tmpdir,
                stdout=PIPE,
            )
        except CalledProcessError as e:
            log.debug(e.output)
            raise e from e
        else:
            log.debug(proc.stdout)
            # unpaper sets dpi to 72; fix this
            try:
                with Image.open(output_pnm) as imout:
                    imout.save(output_file, dpi=(dpi, dpi))
            except (FileNotFoundError, OSError):
                raise SubprocessOutputError(
                    "unpaper: failed to produce the expected output file. " +
                    " Called with: " + str(args_unpaper)) from None
コード例 #6
0
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    prefix = output_hocr.with_suffix('')

    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number of order parameters here
    args_tesseract.extend([fspath(input_file), fspath(prefix), 'hocr', 'txt'])
    args_tesseract.extend(tessconfig)
    try:
        p = run(args_tesseract,
                stdout=PIPE,
                stderr=STDOUT,
                timeout=timeout,
                check=True)
        stdout = p.stdout
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_text, input_file)
            return

        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if prefix.with_suffix('.txt').exists():
            prefix.with_suffix('.txt').replace(output_text)
コード例 #7
0
ファイル: ghostscript.py プロジェクト: ttyhu/OCRmyPDF
def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
):
    compression_args = []
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    strategy = 'RGB' if version() >= '9.19' else '/RGB'

    if version() == '9.23':
        # 9.23: new feature JPEG passthrough is broken in some cases, best to
        # disable it always
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = ([
        GS,
        "-dQUIET",
        "-dBATCH",
        "-dNOPAUSE",
        "-dSAFER",
        "-dCompatibilityLevel=" + str(pdf_version),
        "-sDEVICE=pdfwrite",
        "-dAutoRotatePages=/None",
        "-sColorConversionStrategy=" + strategy,
    ] + compression_args + [
        "-dJPEGQ=95",
        "-dPDFA=" + pdfa_part,
        "-dPDFACompatibilityPolicy=1",
        "-o",
        "-",
        "-sstdout=%stderr",
    ])
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs
    try:
        with Path(output_file).open('wb') as output:
            p = run(args_gs, stdout=output, stderr=PIPE, check=True)
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr.decode(errors='replace'))
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed')
    else:
        stderr = p.stderr.decode('utf-8', errors='replace')
        if _gs_error_reported(stderr):
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part != last_part:
                    if repcount > 1:
                        log.error(
                            f"(previous error message repeated {repcount} times)"
                        )
                        repcount = 0
                    log.error(part)
                else:
                    repcount += 1
                last_part = part
        elif 'overprint mode not set' in stderr:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug("Ghostscript had to remove PDF 'overprinting' from the "
                      "input file to complete PDF/A conversion. ")
コード例 #8
0
def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    *,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
    progressbar_class=None,
):
    # Ghostscript's compression is all or nothing. We can either force all images
    # to JPEG, force all to Flate/PNG, or let it decide how to encode the images.
    # In most case it's best to let it decide.
    compression_args = []
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    strategy = 'LeaveColorUnchanged'
    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    strategy = ('/' + strategy) if version() < '9.19' else strategy

    if version() == '9.23':
        # 9.23: added JPEG passthrough as a new feature, but with a bug that
        # incorrectly formats some images. Fixed as of 9.24. So we disable this
        # feature for 9.23.
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = ([
        GS,
        "-dBATCH",
        "-dNOPAUSE",
        "-dSAFER",
        "-dCompatibilityLevel=" + str(pdf_version),
        "-sDEVICE=pdfwrite",
        "-dAutoRotatePages=/None",
        "-sColorConversionStrategy=" + strategy,
    ] + compression_args + [
        "-dJPEGQ=95",
        "-dPDFA=" + pdfa_part,
        "-dPDFACompatibilityPolicy=1",
        "-o",
        "-",
        "-sstdout=%stderr",
    ])
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs

    try:
        with Path(output_file).open('wb') as output:
            p = run_polling_stderr(
                args_gs,
                stdout=output,
                stderr=PIPE,
                check=True,
                text=True,
                encoding='utf-8',
                errors='replace',
                callback=GhostscriptFollower(progressbar_class),
            )
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr)
        raise SubprocessOutputError(
            'Ghostscript PDF/A rendering failed') from e
    else:
        stderr = p.stderr
        # If there is an error we log the whole stderr, except for filtering
        # duplicates.
        if _gs_error_reported(stderr):
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part != last_part:
                    if repcount > 1:
                        log.error(
                            f"(previous error message repeated {repcount} times)"
                        )
                        repcount = 0
                    log.error(part)
                else:
                    repcount += 1
                last_part = part