Beispiel #1
0
    def __init__(
        self,
        infile,
        detailed_analysis: bool = False,
        progbar: bool = False,
        max_workers: int = None,
        check_pages=None,
    ):
        """Open *infile* and record page-level and document-level facts.

        Args:
            infile: Passed to ``pikepdf.open`` and on to the page scanner.
            detailed_analysis: Forwarded unchanged to the page scanner.
            progbar: Forwarded unchanged to the page scanner.
            max_workers: Forwarded unchanged to the page scanner.
            check_pages: Forwarded to the page scanner. When ``None``, a
                range over the first billion page indices is used instead.

        Raises:
            EncryptedPdfError: If pikepdf reports the opened file as
                encrypted.
        """
        self._infile = infile
        pages_to_check = (
            range(0, 1_000_000_000) if check_pages is None else check_pages
        )

        with pikepdf.open(infile) as pdf:
            if pdf.is_encrypted:
                # Triggered by encryption with empty passwd
                raise EncryptedPdfError()

            self._pages = _pdf_pageinfo_concurrent(
                pdf,
                infile,
                progbar,
                max_workers,
                check_pages=pages_to_check,
                detailed_analysis=detailed_analysis,
            )

            root = pdf.root
            self._needs_rendering = root.get('/NeedsRendering', False)

            # A document counts as having an AcroForm when the form
            # dictionary either lists at least one field or carries an
            # /XFA entry.
            self._has_acroform = False
            if '/AcroForm' in root:
                acroform = root.AcroForm
                has_fields = len(acroform.get('/Fields', [])) > 0
                self._has_acroform = has_fields or '/XFA' in acroform
Beispiel #2
0
def _pdf_get_all_pageinfo(infile,
                          detailed_analysis=False,
                          log=None,
                          progbar=False):
    """Build a ``PageInfo`` for every page of *infile*.

    The pikepdf handle is intentionally left open on success and handed
    back to the caller, who becomes responsible for closing it. On any
    failure the handle is closed here before the exception propagates,
    so it cannot leak.

    Args:
        infile: Path passed to ``pikepdf.open``, the text extractor and
            each ``PageInfo``.
        detailed_analysis: When true, the ghosttext XML extraction is
            skipped and every ``PageInfo`` receives ``None`` as page XML.
        log: Forwarded to ``ghosttext.extract_text_xml``.
        progbar: When false, the tqdm progress bar is disabled.

    Returns:
        A ``(pages, pdf)`` tuple: the list of ``PageInfo`` objects and the
        still-open pikepdf object.

    Raises:
        EncryptedPdfError: If pikepdf reports the file as encrypted.
    """
    pdf = pikepdf.open(infile)  # Closed here only if something goes wrong
    try:
        if pdf.is_encrypted:
            # Triggered by encryption with empty passwd
            raise EncryptedPdfError()

        if detailed_analysis:
            pages_xml = None
        else:
            pages_xml = ghosttext.extract_text_xml(infile,
                                                   pdf,
                                                   pageno=None,
                                                   log=log)

        pages = []
        page_count = len(pdf.pages)
        for n in tqdm(
                range(page_count),
                total=page_count,
                desc="Scan",
                unit='page',
                disable=not progbar,
        ):
            page_xml = pages_xml[n] if pages_xml else None
            pages.append(PageInfo(pdf, n, infile, page_xml, detailed_analysis))
    except BaseException:
        # Cleanup-and-reraise: the caller never receives the handle on
        # failure, so this is the only place that can release it.
        pdf.close()
        raise

    return pages, pdf
Beispiel #3
0
def get_page_analysis(infile, pageno, pscript5_mode):
    """Run pdfminer's page interpreter over one page and return the result.

    Args:
        infile: Path of the PDF; opened in binary mode for pdfminer.
        pageno: Page number handed to ``PDFPage.get_pages`` via
            ``pagenos`` (counting from 0).
        pscript5_mode: When truthy, ``PDFType3Font``'s ``get_ascent``,
            ``get_descent`` and ``get_height`` are patched with the
            PScript5 variants for the duration of the call.

    Returns:
        Whatever ``TextPositionTracker.get_result()`` produces after the
        page has been processed.

    Raises:
        InputFileError: If pdfminer yields no page for *pageno*.
        EncryptedPdfError: If pdfminer raises
            ``PDFTextExtractionNotAllowed``.
    """
    rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)
    dev = TextPositionTracker(rman,
                              laparams=LAParams(all_texts=True,
                                                detect_vertical=True))
    interp = pdfminer.pdfinterp.PDFPageInterpreter(rman, dev)

    # Sentinel so the cleanup below does not depend on re-evaluating
    # pscript5_mode.
    patcher = None
    if pscript5_mode:
        patcher = patch.multiple(
            'pdfminer.pdffont.PDFType3Font',
            spec=True,
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        patcher.start()

    try:
        with Path(infile).open('rb') as f:
            page_iter = PDFPage.get_pages(f, pagenos=[pageno], maxpages=0)
            # Use a default so an empty iterator becomes a meaningful
            # error instead of a bare StopIteration escaping the function.
            page = next(page_iter, None)
            if page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interp.process_page(page)
    except PDFTextExtractionNotAllowed as e:
        raise EncryptedPdfError() from e
    finally:
        if patcher is not None:
            patcher.stop()

    return dev.get_result()
Beispiel #4
0
def get_pdfinfo(
    input_file,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
):
    """Construct a ``PdfInfo`` for *input_file*, translating pikepdf errors.

    All arguments are forwarded unchanged to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo``.

    Raises:
        EncryptedPdfError: If pikepdf raises ``PasswordError`` (checked
            first).
        InputFileError: If pikepdf raises any other ``PdfError``.
    """
    try:
        return PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
        )
    except pikepdf.PasswordError as e:
        # Chain explicitly so the pikepdf error is recorded as the cause.
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
Beispiel #5
0
def get_page_analysis(infile, pageno, pscript5_mode):
    """Interpret a single page with pdfminer and return the tracker result.

    Args:
        infile: Path of the PDF; opened in binary mode for pdfminer.
        pageno: Page number handed to ``PDFPage.get_pages`` via
            ``pagenos`` (counting from 0).
        pscript5_mode: When truthy, ``PDFType3Font``'s ``get_ascent``,
            ``get_descent`` and ``get_height`` are patched with the
            PScript5 variants while the page is processed.

    Returns:
        The value of ``TextPositionTracker.get_result()`` after processing.

    Raises:
        InputFileError: If pdfminer yields no page for *pageno*.
        EncryptedPdfError: If pdfminer raises
            ``PDFTextExtractionNotAllowed``.
    """
    resource_manager = pdfminer.pdfinterp.PDFResourceManager(caching=True)

    # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395:
    # releases before 20200402 get boxes_flow=2, later ones get None.
    disable_boxes_flow = 2 if pdfminer.__version__ < '20200402' else None
    layout_params = LAParams(
        all_texts=True,
        detect_vertical=True,
        boxes_flow=disable_boxes_flow,
    )
    tracker = TextPositionTracker(resource_manager, laparams=layout_params)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
        resource_manager, tracker
    )

    font_patch = None
    if pscript5_mode:
        font_patch = patch.multiple(
            'pdfminer.pdffont.PDFType3Font',
            spec=True,
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        font_patch.start()

    try:
        with Path(infile).open('rb') as stream:
            selected = PDFPage.get_pages(
                stream, pagenos=[pageno], maxpages=0
            )
            target_page = next(selected, None)
            if target_page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interpreter.process_page(target_page)
    except PDFTextExtractionNotAllowed as err:
        raise EncryptedPdfError() from err
    finally:
        # Undo the font patch whether or not processing succeeded.
        if font_patch is not None:
            font_patch.stop()

    return tracker.get_result()
Beispiel #6
0
def get_pdfinfo(
    input_file,
    *,
    executor: Executor,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
) -> PdfInfo:
    """Build a ``PdfInfo`` for *input_file*, translating pikepdf errors.

    Every argument is forwarded unchanged to the ``PdfInfo`` constructor.

    Returns:
        The constructed ``PdfInfo``.

    Raises:
        EncryptedPdfError: If pikepdf raises ``PasswordError`` (checked
            first); the original error is chained as the cause.
        InputFileError: If pikepdf raises any other ``PdfError``; the
            original error is chained as the cause.
    """
    try:
        info = PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
            executor=executor,
        )
    except pikepdf.PasswordError as err:
        raise EncryptedPdfError() from err
    except pikepdf.PdfError as err:
        raise InputFileError() from err
    return info