Example #1
0
def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
    """Collect size, text, vector and image information about one page.

    Args:
        pdf: PDF object providing ``pages`` and ``docinfo``.
        pageno: Index into ``pdf.pages`` of the page to inspect.
        infile: Path of the PDF file; forwarded to the text analysers.
        xmltext: Text XML for the file, or None. When it is not None the
            text blocks come from ``ghosttext.page_get_textblocks``;
            otherwise ``get_page_analysis`` is run on the page.

    Returns:
        A dict with the keys ``pageno``, ``images``, ``has_text``,
        ``userunit``, ``width_inches``, ``height_inches``, ``rotate`` and
        ``has_vector``; plus ``bboxes`` (xmltext given) or ``textboxes``
        (xmltext is None); plus ``xres``, ``yres``, ``width_pixels`` and
        ``height_pixels`` when at least one image was found.
    """
    # 'images' is seeded here so it keeps its position in the dict's key
    # order; the real list is filled in further down.
    info = {'pageno': pageno, 'images': []}

    page = pdf.pages[pageno]
    box = list(map(Decimal, page.MediaBox.as_list()))
    width_pt = box[2] - box[0]
    height_pt = box[3] - box[1]

    if xmltext is None:
        creator = str(pdf.docinfo.get('/Creator'))
        miner = get_page_analysis(
            infile, pageno, creator.startswith('PScript5'))
        info['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
        bboxes = (textbox.bbox for textbox in info['textboxes'])
    else:
        bboxes = ghosttext.page_get_textblocks(
            fspath(infile), pageno, xmltext=xmltext, height=height_pt)
        info['bboxes'] = bboxes

    info['has_text'] = _page_has_text(bboxes, width_pt, height_pt)

    # /UserUnit scales the page's point measurements; coerce whatever the
    # page returns to Decimal so the arithmetic below stays in Decimal.
    unit = page.get('/UserUnit', Decimal(1.0))
    if not isinstance(unit, Decimal):
        unit = Decimal(unit)
    info['userunit'] = unit
    points_per_inch = Decimal(72.0)
    info['width_inches'] = width_pt * unit / points_per_inch
    info['height_inches'] = height_pt * unit / points_per_inch

    # A page without a /Rotate entry is reported as rotation 0.
    try:
        rotation = page['/Rotate']
    except KeyError:
        rotation = 0
    info['rotate'] = int(rotation)

    found = list(_process_content_streams(
        pdf=pdf, container=page, shorthand=(unit, 0, 0, unit, 0, 0)))

    info['has_vector'] = any(isinstance(item, VectorInfo) for item in found)

    images = [item for item in found if isinstance(item, ImageInfo)]
    info['images'] = images
    if images:
        # The page's resolution is the highest resolution of any image on it.
        xres = Decimal(max(img.xres for img in images))
        yres = Decimal(max(img.yres for img in images))
        info['xres'] = xres
        info['yres'] = yres
        info['width_pixels'] = int(round(xres * info['width_inches']))
        info['height_pixels'] = int(round(yres * info['height_inches']))

    return info
Example #2
0
def _pdf_get_pageinfo(
    pdf, pageno: int, infile: PathLike, check_pages, detailed_analysis: bool
):
    """Collect size, text, vector and image information about one page.

    Args:
        pdf: PDF object providing ``pages`` and ``docinfo``.
        pageno: Index into ``pdf.pages`` of the page to inspect.
        infile: Path of the PDF file; forwarded to ``get_page_analysis``.
        check_pages: Container of page numbers whose content should be
            examined. For any other page the content-derived entries are
            None, meaning "no information".
        detailed_analysis: When true (and the page is in ``check_pages``),
            also run ``get_page_analysis`` to collect text boxes.

    Returns:
        A dict with the keys ``pageno``, ``images``, ``textboxes``,
        ``has_text``, ``userunit``, ``width_inches``, ``height_inches``,
        ``rotate`` and ``has_vector``; plus ``dpi``, ``width_pixels`` and
        ``height_pixels`` when at least one image was found.

    Raises:
        NotImplementedError: If the content stream scan yields an item that
            is not a VectorMarker, TextMarker or ImageInfo.
    """
    # 'images' is seeded here so it keeps its position in the dict's key
    # order; it is reassigned once the page content has been examined.
    info: Dict[str, Any] = {'pageno': pageno, 'images': []}

    page = pdf.pages[pageno]
    box = list(map(Decimal, page.MediaBox.as_list()))
    width_pt, height_pt = box[2] - box[0], box[3] - box[1]

    inspect_page = pageno in check_pages

    if not (inspect_page and detailed_analysis):
        info['textboxes'] = []
        info['has_text'] = None  # i.e. "no information"
    else:
        creator = str(pdf.docinfo.get('/Creator'))
        miner = get_page_analysis(infile, pageno, creator.startswith('PScript5'))
        info['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
        # NOTE(review): this result never reaches the caller -- whenever this
        # branch runs, the content stream scan below resets 'has_text'.
        # Confirm which of the two detections is meant to win.
        info['has_text'] = _page_has_text(
            (textbox.bbox for textbox in info['textboxes']), width_pt, height_pt
        )

    # /UserUnit scales the page's point measurements; coerce whatever the
    # page returns to Decimal so the arithmetic below stays in Decimal.
    unit = page.get('/UserUnit', Decimal(1.0))
    if not isinstance(unit, Decimal):
        unit = Decimal(unit)
    info['userunit'] = unit
    points_per_inch = Decimal(72.0)
    info['width_inches'] = width_pt * unit / points_per_inch
    info['height_inches'] = height_pt * unit / points_per_inch

    # A page without a /Rotate entry is reported as rotation 0.
    try:
        rotation = page['/Rotate']
    except KeyError:
        rotation = 0
    info['rotate'] = int(rotation)

    if not inspect_page:
        has_vector = has_text = images = None  # i.e. "no information"
    else:
        has_vector = False
        has_text = False
        images = []
        for item in _process_content_streams(
            pdf=pdf, container=page, shorthand=(unit, 0, 0, unit, 0, 0)
        ):
            if isinstance(item, VectorMarker):
                has_vector = True
            elif isinstance(item, TextMarker):
                has_text = True
            elif isinstance(item, ImageInfo):
                images.append(item)
            else:
                raise NotImplementedError()
    info['has_vector'] = has_vector
    info['has_text'] = has_text
    info['images'] = images

    if images:
        # The page's resolution is the maximum over all images on it.
        dpi = Resolution(0.0, 0.0).take_max(img.dpi for img in images)
        info['dpi'] = dpi
        info['width_pixels'] = int(round(dpi.x * float(info['width_inches'])))
        info['height_pixels'] = int(round(dpi.y * float(info['height_inches'])))

    return info
Example #3
0
    def _gather_pageinfo(
        self,
        pdf: Pdf,
        pageno: int,
        infile: PathLike,
        check_pages: Container[int],
        detailed_analysis: bool,
    ):
        """Fill in this object's page attributes from one page of ``pdf``.

        Args:
            pdf: PDF object providing ``pages`` and ``docinfo``.
            pageno: Index into ``pdf.pages`` of the page to inspect.
            infile: Path of the PDF file; forwarded to ``get_page_analysis``.
            check_pages: Page numbers whose content should be examined. For
                any other page ``_has_vector``, ``_has_text`` and ``_images``
                are set to None, meaning "no information".
            detailed_analysis: When true (and the page is in ``check_pages``),
                also run ``get_page_analysis`` to collect text boxes.

        Sets ``_textboxes``, ``_has_text``, ``_userunit``, ``_width_inches``,
        ``_height_inches``, ``_rotate``, ``_has_vector``, ``_images`` and
        ``_dpi``. ``_width_pixels`` and ``_height_pixels`` are assigned only
        when at least one image was found.

        Raises:
            NotImplementedError: If the content stream scan yields an item
                that is not a VectorMarker, TextMarker or ImageInfo.
        """
        page = pdf.pages[pageno]
        box = list(map(Decimal, page.MediaBox.as_list()))
        width_pt, height_pt = box[2] - box[0], box[3] - box[1]

        inspect_page = pageno in check_pages

        if not (inspect_page and detailed_analysis):
            self._textboxes = []
            self._has_text = None  # i.e. "no information"
        else:
            creator = str(pdf.docinfo.get('/Creator'))
            miner = get_page_analysis(infile, pageno, creator.startswith('PScript5'))
            self._textboxes = list(simplify_textboxes(miner, get_text_boxes))
            # NOTE(review): this result never survives -- whenever this branch
            # runs, the content stream scan below resets _has_text. Confirm
            # which of the two detections is meant to win.
            self._has_text = _page_has_text(
                (textbox.bbox for textbox in self._textboxes), width_pt, height_pt
            )

        # /UserUnit scales the page's point measurements; coerce whatever the
        # page returns to Decimal so the arithmetic below stays in Decimal.
        unit = page.get('/UserUnit', Decimal(1.0))
        if not isinstance(unit, Decimal):
            unit = Decimal(unit)
        self._userunit = unit
        points_per_inch = Decimal(72.0)
        self._width_inches = width_pt * unit / points_per_inch
        self._height_inches = height_pt * unit / points_per_inch

        # A page without a /Rotate entry is reported as rotation 0.
        try:
            rotation = page['/Rotate']
        except KeyError:
            rotation = 0
        self._rotate = int(rotation)

        if not inspect_page:
            self._has_vector = None  # i.e. "no information"
            self._has_text = None
            self._images = None
        else:
            self._has_vector = False
            self._has_text = False
            self._images = []
            for item in _process_content_streams(
                pdf=pdf, container=page, shorthand=(unit, 0, 0, unit, 0, 0)
            ):
                if isinstance(item, VectorMarker):
                    self._has_vector = True
                elif isinstance(item, TextMarker):
                    self._has_text = True
                elif isinstance(item, ImageInfo):
                    self._images.append(item)
                else:
                    raise NotImplementedError()

        self._dpi = None
        if self._images:
            # The page's resolution is the maximum over all images on it.
            best = Resolution(0.0, 0.0).take_max(img.dpi for img in self._images)
            self._dpi = best
            self._width_pixels = int(round(best.x * float(self._width_inches)))
            self._height_pixels = int(round(best.y * float(self._height_inches)))