class PDFTextDevice(PDFDevice): def handle_undefined_char(self, cidcoding, cid): if self.debug: print >> sys.stderr, 'undefined: %r, %r' % (cidcoding, cid) return '?' def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): return (0, 0) def render_string(self, textstate, seq): matrix = mult_matrix(textstate.matrix, self.ctm) font = textstate.font fontsize = textstate.fontsize scaling = textstate.scaling * .01 charspace = textstate.charspace * scaling wordspace = textstate.wordspace * scaling if font.is_multibyte(): wordspace = 0 dxscale = .001 * fontsize * scaling if font.is_vertical(): textstate.linematrix = self.render_string_vertical( seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) else: textstate.linematrix = self.render_string_horizontal( seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) return def render_string_horizontal(self, seq, matrix, (x, y), font, fontsize, scaling, charspace, wordspace, dxscale): chars = [] needspace = False for obj in seq: if isinstance(obj, int) or isinstance(obj, float): (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font, fontsize, charspace, scaling, chars) x += dx - obj * dxscale y += dy chars = [] needspace = False else: for cid in font.decode(obj): try: char = font.to_unichr(cid) except PDFUnicodeNotDefined, e: (cidcoding, cid) = e.args char = self.handle_undefined_char(cidcoding, cid) chars.append((char, cid)) if cid == 32 and wordspace: if needspace: x += charspace (dx, dy) = self.render_chars( translate_matrix(matrix, (x, y)), font, fontsize, charspace, scaling, chars) needspace = True x += dx + wordspace y += dy chars = []
def render_string(self, textstate, textmatrix, seq): font = textstate.font text = [] textmatrix = mult_matrix(textmatrix, self.ctm) for x in seq: if isinstance(x, int) or isinstance(x, float): text.append((None, None, x)) else: chars = font.decode(x) for cid in chars: try: char = font.to_unicode(cid) except PDFUnicodeNotDefined, e: (cidcoding, cid) = e.args char = self.handle_undefined_char(cidcoding, cid) text.append((char, cid, font.char_disp(cid))) if cid == 32 and not font.is_multibyte(): if text: item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text) self.cur_item.add(item) (dx, dy) = item.adv dx += textstate.wordspace * textstate.scaling * .01 textmatrix = translate_matrix(textmatrix, (dx, dy)) text = []
def render_string_horizontal(self, seq, matrix, pos, font, fontsize,
                             scaling, charspace, wordspace, rise, dxscale):
    """Render the items of *seq* left to right and return the end position.

    Each item of *seq* is either a number, which moves the cursor back by
    ``item * dxscale`` along x, or an encoded string, which is decoded to
    character ids with ``font.decode`` and rendered one id at a time via
    ``self.render_char``.

    :param seq: iterable mixing numeric adjustments and encoded strings.
    :param matrix: base matrix; translated by the current cursor position
        before each character is rendered.
    :param pos: ``(x, y)`` starting position.
    :param charspace: extra advance inserted before every character except
        the very first thing rendered by this call.
    :param wordspace: extra advance added after character id 32 when
        non-zero.
    :param dxscale: factor applied to numeric adjustments.
    :return: ``(x, y)`` position after the last item; y is never changed.
    """
    cursor_x, baseline_y = pos
    # Becomes true once anything (number or character) has been processed,
    # so character spacing is only skipped at the very start.
    pending_charspace = False
    for item in seq:
        if isnumber(item):
            cursor_x -= item * dxscale
            pending_charspace = True
            continue
        for cid in font.decode(item):
            if pending_charspace:
                cursor_x += charspace
            glyph_matrix = translate_matrix(matrix, (cursor_x, baseline_y))
            cursor_x += self.render_char(glyph_matrix, font, fontsize,
                                         scaling, rise, cid)
            if cid == 32 and wordspace:
                cursor_x += wordspace
            pending_charspace = True
    return (cursor_x, baseline_y)
class PDFTextDevice(PDFDevice):
    """Device that walks text-showing sequences character by character.

    ``render_string`` derives the per-string parameters from the text state
    and delegates to a direction-specific renderer, which in turn calls
    ``self.render_char`` for every character id.
    """

    def handle_undefined_char(self, cidcoding, cid):
        """Return the placeholder ``'?'`` for a character with no mapping.

        When ``self.debug`` is truthy, the offending coding and character id
        are also reported on standard error.
        """
        if self.debug:
            # sys.stderr.write works on Python 2 and 3 alike; the former
            # ``print >> sys.stderr`` statement fails at runtime on Python 3.
            sys.stderr.write('undefined: %r, %r\n' % (cidcoding, cid))
        return '?'

    def render_string(self, textstate, seq):
        """Render *seq* using *textstate* and update ``textstate.linematrix``.

        :param textstate: object providing ``matrix``, ``font``,
            ``fontsize``, ``scaling``, ``charspace``, ``wordspace``,
            ``rise`` and ``linematrix``.
        :param seq: iterable mixing numeric adjustments and encoded strings.
        """
        matrix = mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # ``scaling`` is stored as a percentage.
        scaling = textstate.scaling * .01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        if font.is_multibyte():
            # Word spacing is not applied for multi-byte fonts.
            wordspace = 0
        dxscale = .001 * fontsize * scaling
        if font.is_vertical():
            textstate.linematrix = self.render_string_vertical(
                seq, matrix, textstate.linematrix, font, fontsize,
                scaling, charspace, wordspace, rise, dxscale)
        else:
            textstate.linematrix = self.render_string_horizontal(
                seq, matrix, textstate.linematrix, font, fontsize,
                scaling, charspace, wordspace, rise, dxscale)
        return

    def render_string_horizontal(self, seq, matrix, pos, font, fontsize,
                                 scaling, charspace, wordspace, rise,
                                 dxscale):
        """Render *seq* left to right and return the final ``(x, y)``.

        :param pos: ``(x, y)`` starting position. It is taken as a single
            positional argument and unpacked here, because tuple parameters
            in a ``def`` signature are a syntax error on Python 3.
        :param charspace: extra advance inserted before every character
            except the very first thing rendered by this call.
        :param wordspace: extra advance added after character id 32 when
            non-zero.
        :param dxscale: factor applied to numeric adjustments in *seq*.
        :return: ``(x, y)`` after the last item; y is never changed.
        """
        (x, y) = pos
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                # Numeric items shift the cursor backwards along x.
                x -= obj * dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        x += charspace
                    x += self.render_char(translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        x += wordspace
                    needcharspace = True
        return (x, y)
def do_TJ(self, seq):
    """Render *seq* on the device, then advance the line position.

    *seq* mixes ``str`` pieces with numeric adjustments (presumably the
    operand array of the PDF ``TJ`` operator -- confirm against the
    interpreter's dispatch). The device is asked to render the sequence at
    the text matrix translated by the current line position, after which
    ``textstate.linematrix`` is moved by the computed advance: along y for
    vertical fonts, along x otherwise.
    """
    state = self.textstate
    origin = translate_matrix(state.matrix, state.linematrix)
    self.device.render_string(state, origin, seq)

    font = state.font
    shown = ''.join(piece for piece in seq if isinstance(piece, str))
    glyph_width = font.string_width(shown)
    adjustments = sum(piece for piece in seq if not isinstance(piece, str))
    # Numeric adjustments are in thousandths and reduce the advance.
    advance = ((glyph_width - adjustments * .001) * state.fontsize
               + len(shown) * state.charspace)

    (lx, ly) = state.linematrix
    vertical = font.is_vertical()
    if not vertical and not font.is_multibyte():
        # Word spacing only counts for horizontal, single-byte fonts.
        advance += shown.count(' ') * state.wordspace
    scaled = advance * (state.scaling * .01)
    if vertical:
        ly += scaled
    else:
        lx += scaled
    state.linematrix = (lx, ly)
def do_TJ(self, seq):
    """Show *seq* through the device and move ``textstate.linematrix``.

    The sequence holds ``str`` pieces and numeric adjustments (presumably
    the operand of the PDF ``TJ`` operator -- confirm with the caller). The
    advance is the font's width of the joined strings, less the adjustments
    taken as thousandths, times the font size, plus character spacing per
    character; word spacing is added per space for horizontal, non
    multi-byte fonts. The result is scaled by ``scaling`` percent.
    """
    ts = self.textstate
    start_matrix = translate_matrix(ts.matrix, ts.linematrix)
    self.device.render_string(ts, start_matrix, seq)

    font = ts.font
    s = "".join(x for x in seq if isinstance(x, str))
    width_units = font.string_width(s)
    offset_units = sum(x for x in seq if not isinstance(x, str))
    w = (width_units - offset_units * 0.001) * ts.fontsize + len(s) * ts.charspace

    lx, ly = ts.linematrix
    if font.is_vertical():
        # Vertical fonts advance along y only.
        ts.linematrix = (lx, ly + w * (ts.scaling * 0.01))
        return
    if not font.is_multibyte():
        w += s.count(" ") * ts.wordspace
    ts.linematrix = (lx + w * (ts.scaling * 0.01), ly)
def render_string(self, textstate, textmatrix, seq): font = textstate.font text = [] textmatrix = mult_matrix(textmatrix, self.ctm) for x in seq: if isinstance(x, int) or isinstance(x, float): text.append((None, None, x)) else: chars = font.decode(x) for cid in chars: try: char = font.to_unicode(cid) except PDFUnicodeNotDefined, e: (cidcoding, cid) = e.args char = self.handle_undefined_char(cidcoding, cid) text.append((char, cid, font.char_disp(cid))) if cid == 32 and not font.is_multibyte(): if text: item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text) self.cur_item.add(item) (dx,dy) = item.adv dx += textstate.wordspace * textstate.scaling * .01 textmatrix = translate_matrix(textmatrix, (dx, dy)) text = []
x += wordspace needcharspace = True return (x, y) def render_string_vertical(self, seq, matrix, (x, y), font, fontsize, scaling, charspace, wordspace, rise, dxscale): needcharspace = False for obj in seq: if isinstance(obj, int) or isinstance(obj, float): y -= obj * dxscale needcharspace = True else: for cid in font.decode(obj): if needcharspace: y += charspace y += self.render_char(translate_matrix(matrix, (x, y)), font, fontsize, scaling, rise, cid) if cid == 32 and wordspace: y += wordspace needcharspace = True return (x, y) def render_char(self, matrix, font, fontsize, scaling, rise, cid): return 0 ## TagExtractor ## class TagExtractor(PDFDevice): def __init__(self, rsrcmgr, outfp, codec='utf-8', debug=0): PDFDevice.__init__(self, rsrcmgr)
char = self.handle_undefined_char(cidcoding, cid) chars.append((char, cid)) if cid == 32 and wordspace: if needspace: x += charspace (dx, dy) = self.render_chars( translate_matrix(matrix, (x, y)), font, fontsize, charspace, scaling, chars) needspace = True x += dx + wordspace y += dy chars = [] if chars: if needspace: x += charspace (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font, fontsize, charspace, scaling, chars) x += dx y += dy return (x, y) def render_string_vertical(self, seq, matrix, (x, y), font, fontsize, scaling, charspace, wordspace, dxscale): chars = [] needspace = False for obj in seq: if isinstance(obj, int) or isinstance(obj, float): (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font, fontsize, charspace, scaling, chars) x += dx
if cid == 32 and wordspace: x += wordspace needcharspace = True return x, y def render_string_vertical(self, seq, matrix, (x, y), font, fontsize, scaling, charspace, wordspace, rise, dxscale): needcharspace = False for obj in seq: if isinstance(obj, (int, float)): y -= obj * dxscale needcharspace = True else: for cid in font.decode(obj): if needcharspace: y += charspace y += self.render_char(translate_matrix(matrix, (x, y)), font, fontsize, scaling, rise, cid) if cid == 32 and wordspace: y += wordspace needcharspace = True return x, y def render_char(self, matrix, font, fontsize, scaling, rise, cid): return 0 class TagExtractor(PDFDevice): def __init__(self, rsrcmgr, outfp, codec='utf-8'): PDFDevice.__init__(self, rsrcmgr) self.outfp = outfp self.codec = codec
(cidcoding, cid) = e.args char = self.handle_undefined_char(cidcoding, cid) chars.append((char, cid)) if cid == 32 and wordspace: if needspace: x += charspace (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, fontsize, charspace, scaling, chars) needspace = True x += dx + wordspace y += dy chars = [] if chars: if needspace: x += charspace (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, fontsize, charspace, scaling, chars) x += dx y += dy return (x, y) def render_string_vertical(self, seq, matrix, (x,y), font, fontsize, scaling, charspace, wordspace, dxscale): chars = [] needspace = False for obj in seq: if isinstance(obj, int) or isinstance(obj, float): (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, fontsize, charspace, scaling, chars) x += dx y += dy - obj*dxscale