Example #1
0
 def do_Do(self, xobjid):
   """Invoke the named XObject (the ``Do`` operator).

   ``xobjid`` is resolved through ``literal_name`` and looked up in
   ``self.xobjmap``.  Form XObjects that carry a ``BBox`` are rendered by a
   duplicated interpreter between ``begin_figure``/``end_figure`` calls on
   the device; image XObjects that carry ``Width`` and ``Height`` are handed
   to ``device.render_image``.  Any other XObject is ignored.

   Raises:
     PDFInterpreterError: if the id is not in ``self.xobjmap`` and ``STRICT``
       is set.  When ``STRICT`` is not set the operator is silently skipped.
   """
   xobjid = literal_name(xobjid)
   try:
     xobj = stream_value(self.xobjmap[xobjid])
   except KeyError:
     if STRICT:
       raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
     return
   if 1 <= self.debug:
     # Written through the stream's write() so the same line is emitted on
     # both Python 2 and Python 3 (the `print >>stream` form is Python 2 only).
     stderr.write('Processing xobj: %r\n' % xobj)
   subtype = xobj.dic.get('Subtype')
   if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
     # Render the form's content stream with a separate interpreter.
     interpreter = self.dup()
     bbox = list_value(xobj.dic['BBox'])
     matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY))
     self.device.begin_figure(xobjid, bbox, matrix)
     resources = dict_value(xobj.dic.get('Resources'))
     interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
     self.device.end_figure(xobjid)
   elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
     # Images are reported inside a unit-square figure with an identity matrix.
     self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
     size = (xobj.dic['Width'], xobj.dic['Height'])
     self.device.render_image(xobj, size)
     self.device.end_figure(xobjid)
   # Any other xobject type is unsupported and ignored.
   return
    def do_Do(self, xobjid):
        # the base of this function is basically copy-pasted from ancestor; unfortunately, I found no better solution
        xobjid = literal_name(xobjid)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        if self.debug:
            logging.info("Processing xobj: %r" % xobj)
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            interpreter.is_first_level_call = None
            bbox = list_value(xobj["BBox"])
            matrix = list_value(xobj.get("Matrix", MATRIX_IDENTITY))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            resources = dict_value(xobj.get("Resources")) or self.resources.copy()

            self.device.begin_figure(xobjid, bbox, matrix)
            interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
            self.device.end_figure(xobjid)

            # for (k,v) in interpreter.text_lines.iteritems():
            #     self.text_sequences[k + self.keyword_count] = v
            self.keyword_count += interpreter.keyword_count
            print "Included %i keywords" % interpreter.keyword_count
        else:
            # ignored xobject type.
            pass
        return
Example #3
0
 def draw_cid(self, ts, cid, force_space=False):
     """Append the character for ``cid`` to the current text block.

     The text rendering matrix is built from the text state ``ts``; if either
     of its off-diagonal components is non-zero the character is skipped.
     Otherwise the character is appended to ``self.current_block`` when the
     font and horizontal scale match, or a new block is started (the previous
     one is moved to ``self.blocks``).  Unless ``force_space`` is set,
     ``ts.Tm`` is then advanced by the character's displacement.

     Args:
         ts: text state; ``Tfs``, ``Th``, ``Trise``, ``Tm``, ``Tf``, ``Tc``
             and ``Tw`` are read, ``Tm`` is updated.
         cid: character id to draw.
         force_space: when true, a single space is emitted instead of the
             character for ``cid`` and ``ts.Tm`` is left untouched.
     """
     verbose("drawing cid: ", cid)
     Trm = utils.mult_matrix((ts.Tfs * ts.Th, 0, 0, ts.Tfs, 0, ts.Trise),
                             ts.Tm)
     # Only handle matrices whose off-diagonal components are zero.
     if Trm[1] != 0 or Trm[2] != 0:
         return
     verbose('Trm', Trm)
     if force_space:
         unichar = ' '
     else:
         try:
             unichar = ts.Tf.to_unichr(cid)
         except PDFUnicodeNotDefined as e:
             # Must be caught before the generic handler below, otherwise the
             # MISSING_CHAR substitution can never be reached.
             verbose(f"Failed to process {cid = }: {e}")
             # Stay best-effort: fall back to a space when no replacement
             # character is configured instead of aborting the page.
             unichar = MISSING_CHAR if MISSING_CHAR else ' '
         except Exception as e:
             # Any other decoding failure degrades to a space.
             verbose(f"Failed to process {cid = }: {e}")
             unichar = ' '
     (gx, gy) = utils.apply_matrix_pt(Trm, (0, 0))
     verbose("drawing unichar: '", unichar, "' @", gx, ",", gy)
     tfs = Trm[0]
     block = self.current_block
     if block is not None and block[0] == ts.Tf and block[1] == tfs:
         # Same font and scale: keep extending the current block.
         block[4].append(unichar)
     else:
         if block is not None:
             self.blocks.append(block)
         self.current_block = (ts.Tf, tfs, gx, gy, [unichar])
     verbose('current block: ', self.current_block)
     verbose('blocks: ', self.blocks)
     if not force_space:
         # Word spacing is applied for character code 32 only.
         Tw = ts.Tw if cid == 32 else 0
         w = ts.Tf.char_width(cid)
         if ts.Tf.is_vertical():
             tx = 0
             ty = self.new_ty(w, 0, ts.Tfs, ts.Tc, Tw)
         else:
             tx = self.new_tx(w, 0, ts.Tfs, ts.Tc, Tw, ts.Th)
             ty = 0
         ts.Tm = utils.translate_matrix(ts.Tm, (tx, ty))
 def render_string(self, textstate, seq):
     """Render ``seq`` with the current text state and update its line matrix.

     Derives the rendering parameters from ``textstate`` (scaling is
     multiplied by .01, character and word spacing are multiplied by the
     resulting scale, word spacing is zeroed for multibyte fonts) and passes
     them to ``render_string_vertical`` or ``render_string_horizontal``
     depending on ``font.is_vertical()``.  The value returned by that method
     is stored back into ``textstate.linematrix``.
     """
     combined = mult_matrix(textstate.matrix, self.ctm)
     fnt = textstate.font
     size = textstate.fontsize
     hscale = textstate.scaling * .01
     char_gap = textstate.charspace * hscale
     word_gap = textstate.wordspace * hscale
     text_rise = textstate.rise
     if fnt.is_multibyte():
         # Word spacing is not applied to multibyte fonts.
         word_gap = 0
     dxscale = .001 * size * hscale
     # Both renderers take the same arguments; only the direction differs.
     if fnt.is_vertical():
         renderer = self.render_string_vertical
     else:
         renderer = self.render_string_horizontal
     textstate.linematrix = renderer(
         seq, combined, textstate.linematrix, fnt, size,
         hscale, char_gap, word_gap, text_rise, dxscale)
Example #5
0
 def do_cm(self, a1, b1, c1, d1, e1, f1):
   """Concatenate the six-number operand matrix onto ``self.ctm``.

   The operands are multiplied with the current ``self.ctm`` via
   ``mult_matrix`` (operand matrix first), the result replaces ``self.ctm``
   and is passed to ``self.device.set_ctm``.
   """
   operand = (a1, b1, c1, d1, e1, f1)
   self.ctm = mult_matrix(operand, self.ctm)
   # Keep the output device in sync with the interpreter's matrix.
   self.device.set_ctm(self.ctm)
 def begin_figure(self, name, bbox, matrix):
     """Open a new ``LTFigure`` and make it the current item.

     The item that was current is pushed onto ``self._stack``; the new figure
     is built from ``name``, ``bbox`` and ``matrix`` multiplied with
     ``self.ctm``.
     """
     # Remember the enclosing item before replacing it.
     self._stack.append(self.cur_item)
     figure_matrix = mult_matrix(matrix, self.ctm)
     self.cur_item = LTFigure(name, bbox, figure_matrix)
Example #7
0
 def show_string(self, ts, array):
     """Collect the text of one text-showing operation into paragraphs.

     The horizontal scale of the text rendering matrix (its first component)
     decides grouping: while it equals the one seen on the previous call the
     text keeps accumulating in ``self.paragraph``; when it changes, the
     accumulated paragraph is joined with spaces and appended to
     ``self.paragraph_map`` under the previous scale.

     Each element of ``array`` is either a number (a positioning adjustment;
     values below ``WITHIN_WORD_MOVE_LIMIT`` end the current word) or a
     string that is decoded into character ids with ``ts.Tf.decode``.
     Character id 32 ends the current word.  ``ts.Tm`` is advanced after
     every element/character.

     Raises:
         PDFUnicodeNotDefined: if a character has no unicode mapping and
             ``MISSING_CHAR`` is not set.
     """
     verbose(ts)
     sentence = []
     word = []
     m = (ts.Tfs * ts.Th, 0, 0, ts.Tfs, 0, ts.Trise)
     (sx, _, _, sy, tx, ty) = utils.mult_matrix(m, ts.Tm)
     current_state = (sx, sy, tx, ty)
     if self.last_state is None:
         self.paragraph = []
         verbose('current paragraph becomes=', self.paragraph)
     elif current_state[0] == self.last_state[0]:
         verbose('DECISION: grouping the text object to last')
     else:
         verbose('DECISION: finalizing the paragraph')
         key = self.last_state[0]
         item = self.paragraph_map.get(key, '')
         if item:
             # Separate from the text already stored under this key; the
             # stored text must be kept, not replaced by the separator.
             item = item + ' '
         new_item = ' '.join(self.paragraph)
         self.paragraph_map[key] = item + new_item
         self.paragraph = []
         verbose('current paragraph becomes=', self.paragraph)
     self.last_state = current_state
     for obj in array:
         verbose("processing obj=", obj)
         if utils.isnumber(obj):
             Tj = obj
             if Tj < WITHIN_WORD_MOVE_LIMIT:
                 verbose("DECISION: new word")
                 sentence.append(''.join(word))
                 verbose('current sentence becomes=', sentence)
                 word = []
                 verbose('current word becomes=', word)
             else:
                 verbose("DECISION: move inside the current word")
             # NOTE(review): the PDF reference describes TJ adjustments as
             # subtracted from the current position; here Tj is added.
             # Confirm this sign is intended.
             if ts.Tf.is_vertical():
                 tx = 0
                 ty = ((Tj / 1000) * ts.Tfs)
             else:
                 tx = ((Tj / 1000) * ts.Tfs) * ts.Th
                 ty = 0
             ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)
         else:
             for cid in ts.Tf.decode(obj):
                 verbose("processing cid=", cid)
                 if cid == 32:
                     # Character id 32 ends the word and uses word spacing.
                     applicable_Tw = ts.Tw
                     sentence.append(''.join(word))
                     verbose('current sentence becomes=', sentence)
                     word = []
                 else:
                     try:
                         text = ts.Tf.to_unichr(cid)
                     except PDFUnicodeNotDefined:
                         if MISSING_CHAR:
                             text = MISSING_CHAR
                         else:
                             raise
                     word.append(text)
                     verbose('current word becomes=', word)
                     applicable_Tw = 0
                 w = ts.Tf.char_width(cid)
                 if ts.Tf.is_vertical():
                     tx = 0
                     ty = ((w - 0) * ts.Tfs + ts.Tc + applicable_Tw)
                 else:
                     tx = ((w - 0) * ts.Tfs + ts.Tc + applicable_Tw) * ts.Th
                     ty = 0
                 ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)
     if word:
         # Flush the trailing word that was not terminated by a separator.
         sentence.append(''.join(word))
         verbose('current sentence becomes=', sentence)
         word = []
         verbose('current word becomes=', word)
     self.paragraph.append(' '.join(sentence))
     verbose('current paragraph becomes=', self.paragraph)
     return
Example #8
0
 def do_Td(self, tx, ty):
     """Handle the ``Td`` operator: offset the text line matrix by (tx, ty).

     A translation matrix for ``(tx, ty)`` is multiplied with
     ``self.mpts.Tlm``; the result is stored as the new ``Tlm`` and also
     assigned to ``self.mpts.Tm``.
     """
     verbose_operator("PDF OPERATOR Td: tx=", tx, ", ty=", ty)
     state = self.mpts
     translation = (1, 0, 0, 1, tx, ty)
     state.Tlm = utils.mult_matrix(translation, state.Tlm)
     # The text matrix restarts from the new line matrix.
     state.Tm = state.Tlm
 def begin_figure(self, name, bbox, matrix):
     """Start a figure and make a ``PDFLocFigure`` the current item.

     Delegates to the parent class's ``begin_figure`` first, then assigns
     ``self.cur_item`` a ``PDFLocFigure`` built from ``name``, ``bbox`` and
     ``matrix`` multiplied with ``self.ctm``.
     """
     # Presumably the parent pushes the previous item and installs its own
     # figure object - verify against the parent class.
     super(PDFLocPageAnalyzer, self).begin_figure(name, bbox, matrix)
     # Replace whatever the parent assigned with the project-specific figure.
     self.cur_item = PDFLocFigure(name, bbox, mult_matrix(matrix, self.ctm))