Beispiel #1
0
class TextPositionTracker(PDFLayoutAnalyzer):
    """Page layout analyzer that pays attention to text visibility.

    Every rendered character is wrapped in an ``LTStateAwareChar`` that
    carries the copy of the text state captured by the most recent
    ``render_string`` call. The finished page layout is kept in
    ``self.result`` and can be fetched with ``get_result``.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Initialise the base analyzer and clear all per-page state."""
        super().__init__(rsrcmgr, pageno, laparams)
        # The pdfminer base class does not define cur_item up front, although
        # it should, so it is declared here.
        self.cur_item = None
        self.result = None
        self.textstate = None

    def begin_page(self, page, ctm):
        """Start a page and make ``cur_item`` an LTPage for ``page.mediabox``.

        The base implementation runs first; ``cur_item`` is then (re)assigned
        to a fresh ``LTPage`` built from the page's own mediabox.
        """
        super().begin_page(page, ctm)
        self.cur_item = LTPage(self.pageno, page.mediabox)

    def end_page(self, page):
        """Finish the page: optionally analyze it, then publish the layout.

        Asserts that the container stack is empty and that the current item
        is an ``LTPage``. Layout analysis only runs when ``laparams`` was
        supplied. The page number is advanced before the layout is handed to
        ``receive_layout``.
        """
        assert not self._stack, str(len(self._stack))
        finished_page = self.cur_item
        assert isinstance(finished_page, LTPage), str(type(finished_page))
        if self.laparams is not None:
            finished_page.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(finished_page)

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Snapshot the text state, then render the string with that copy.

        The snapshot is stored on ``self.textstate`` so that ``render_char``
        can attach it to every character it creates.
        """
        snapshot = textstate.copy()
        self.textstate = snapshot
        super().render_string(snapshot, seq, ncs, graphicstate)

    def render_char(
        self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate
    ):
        """Create a state-aware character item, add it to the page, return its advance.

        When the font has no Unicode mapping for ``cid``
        (``PDFUnicodeNotDefined``), the text falls back to whatever
        ``handle_undefined_char`` returns.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        # Arguments are evaluated left to right, so char_width is still
        # queried before char_disp, as in a step-by-step formulation.
        char_item = LTStateAwareChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
            self.textstate,
        )
        self.cur_item.add(char_item)
        return char_item.adv

    def handle_undefined_char(self, font, cid):
        """Return a ``(font name, cid)`` tuple for a character without Unicode mapping."""
        return font.fontname, cid

    def receive_layout(self, ltpage):
        """Remember the finished page layout as the current result."""
        self.result = ltpage

    def get_result(self):
        """Return the most recently received page layout (``None`` if there is none)."""
        return self.result
 def begin_page(self, page, ctm):
     """Start a page whose bounding box is the CTM-transformed mediabox.

     Both corners of ``page.mediabox`` are mapped through ``ctm`` with
     ``apply_matrix_pt``; the resulting box is anchored at the origin and
     its width and height are the absolute corner differences.
     """
     left, bottom, right, top = page.mediabox
     left, bottom = apply_matrix_pt(ctm, (left, bottom))
     right, top = apply_matrix_pt(ctm, (right, top))
     width = abs(left - right)
     height = abs(bottom - top)
     self.cur_item = LTPage(self.pageno, (0, 0, width, height))
Beispiel #3
0
 def begin_page(self, page, ctm):
     """Run the base ``begin_page``, then point ``cur_item`` at a new LTPage.

     The new ``LTPage`` is built from the current page number and the
     page's own ``mediabox``.
     """
     super().begin_page(page, ctm)
     page_number, bounds = self.pageno, page.mediabox
     self.cur_item = LTPage(page_number, bounds)
Beispiel #4
0
 def begin_page(self, page):
   """Start a page: set ``cur_item`` to an LTPage with the page's box and rotation."""
   page_number = self.pageno
   bounds = page.mediabox
   rotation = page.rotate
   self.cur_item = LTPage(page_number, bounds, rotation)
 def __init__(self, pageid, bbox, rotate=0):
     """Initialise the page through ``LTPage.__init__`` with its id and bounding box.

     NOTE(review): ``rotate`` is accepted but is not forwarded to
     ``LTPage.__init__`` - confirm whether dropping it is intentional.
     """
     LTPage.__init__(self, pageid, bbox)