class TextPositionTracker(PDFLayoutAnalyzer):
    """A page layout analyzer that pays attention to text visibility.

    Each rendered character is stored as an ``LTStateAwareChar`` carrying a
    copy of the text state that was active when its string was rendered.
    The finished page layout is kept in ``self.result`` and can be fetched
    with :meth:`get_result`.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Create the tracker.

        Args:
            rsrcmgr: Resource manager, forwarded to ``PDFLayoutAnalyzer``.
            pageno: Number assigned to the first page that is processed.
            laparams: Layout analysis parameters; when ``None`` no layout
                analysis is run in :meth:`end_page`.
        """
        super().__init__(rsrcmgr, pageno, laparams)
        self.textstate = None
        self.result = None
        self.cur_item = None  # not defined in pdfminer code as it should be

    def begin_page(self, page, ctm):
        """Start a page and make ``cur_item`` an ``LTPage`` for it."""
        super().begin_page(page, ctm)
        # Replace whatever the base class stored: this tracker builds its
        # page container from ``page.mediabox`` exactly as given.
        self.cur_item = LTPage(self.pageno, page.mediabox)

    def end_page(self, page):
        """Finish the page: analyze it if requested and publish the layout."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Remember a copy of the text state, then render the string with it."""
        # Copy so that later changes to the caller's text state do not alter
        # the state attached to characters that were already rendered.
        self.textstate = textstate.copy()
        super().render_string(self.textstate, seq, ncs, graphicstate)

    def render_char(
        self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate
    ):
        """Add one character to the current page and return its advance.

        The character's text comes from ``font.to_unichr(cid)``; when the
        font has no Unicode mapping for ``cid`` the placeholder produced by
        :meth:`handle_undefined_char` is used instead.

        Returns:
            The ``adv`` attribute of the created ``LTStateAwareChar``.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTStateAwareChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            self.textstate,
        )
        self.cur_item.add(item)
        return item.adv

    def handle_undefined_char(self, font, cid):
        """Return placeholder text for a character with no Unicode mapping.

        The placeholder is a ``str`` so that the text stored on every
        character has the same type that :meth:`render_char` asserts for
        successfully mapped characters. A non-string value here would break
        any later code that joins character texts together.

        Args:
            font: Font the character belongs to (unused).
            cid: Character id that could not be mapped.

        Returns:
            A string of the form ``"(cid:<number>)"``.
        """
        return "(cid:%d)" % cid

    def receive_layout(self, ltpage):
        """Store the finished page layout so :meth:`get_result` can return it."""
        self.result = ltpage

    def get_result(self):
        """Return the most recently received page layout (``None`` if none yet)."""
        return self.result
def begin_page(self, page, ctm):
    """Start a page whose box is the CTM-transformed mediabox at the origin.

    Two opposite corners of ``page.mediabox`` are mapped through ``ctm``;
    the resulting extent becomes a ``(0, 0, width, height)`` box for a new
    ``LTPage`` stored in ``self.cur_item``.
    """
    left, bottom, right, top = page.mediabox
    left, bottom = apply_matrix_pt(ctm, (left, bottom))
    right, top = apply_matrix_pt(ctm, (right, top))
    # abs() keeps the extent positive whichever way the CTM flips the axes.
    width = abs(left - right)
    height = abs(bottom - top)
    self.cur_item = LTPage(self.pageno, (0, 0, width, height))
def begin_page(self, page, ctm):
    """Run the parent's page start, then set ``cur_item`` to a fresh ``LTPage``.

    The ``LTPage`` is built from the current page number and
    ``page.mediabox`` as given, replacing anything the parent call stored
    in ``self.cur_item``.
    """
    super().begin_page(page, ctm)
    layout_page = LTPage(self.pageno, page.mediabox)
    self.cur_item = layout_page
def begin_page(self, page):
    """Start a page by storing a new ``LTPage`` in ``self.cur_item``.

    The ``LTPage`` receives the current page number together with the
    page's ``mediabox`` and ``rotate`` attributes.
    """
    layout_page = LTPage(self.pageno, page.mediabox, page.rotate)
    self.cur_item = layout_page
def __init__(self, pageid, bbox, rotate=0):
    """Initialise the page through ``LTPage.__init__``.

    Args:
        pageid: Page identifier passed on to ``LTPage.__init__``.
        bbox: Bounding box passed on to ``LTPage.__init__``.
        rotate: Accepted for call compatibility; not used here.
    """
    # NOTE(review): ``rotate`` is deliberately or accidentally dropped --
    # confirm whether LTPage.__init__ should receive it.
    LTPage.__init__(self, pageid, bbox)