Example #1
0
 def process_string(self, ts, array):
     """Walk the elements of ``array`` and apply each one to text state ``ts``.

     Numeric elements translate ``ts.Tm`` (along y for a vertical font,
     along x otherwise).  Every other element is decoded with ``ts.Tf``
     and each resulting cid is handed to ``self.draw_cid``.
     """
     verbose('SHOW STRING ts: ', ts)
     verbose('SHOW STRING array: ', array)
     for element in array:
         verbose("processing obj: ", element)
         if not utils.isnumber(element):
             verbose("processing string")
             for glyph_id in ts.Tf.decode(element):
                 self.draw_cid(ts, glyph_id)
             continue

         # A number inside a TJ array translates Tm.
         verbose("processing translation: ", element)
         # Only one axis moves, depending on the writing direction.
         if ts.Tf.is_vertical():
             offset = (0, self.new_ty(0, element, ts.Tfs, 0, ts.Tw))
         else:
             offset = (self.new_tx(0, element, ts.Tfs, 0, ts.Tw, ts.Th), 0)
         ts.Tm = utils.translate_matrix(ts.Tm, offset)
         # NOTE: a heuristic for emitting a forced space on large negative
         # adjustments was sketched here (comparing against the width of
         # 'o') but was never enabled.
Example #2
0
def filterObjs(obj, x, codec=None):
    """Recursively scan ``obj`` and collect the values stored under "URI" keys.

    For every dict entry whose key equals ``"URI"``, ``e(value)`` is
    appended to ``x``.  Dict values and list items are scanned
    recursively.  ``codec`` is accepted but not used.

    Raises:
        TypeError: if ``obj`` is none of the handled kinds.
    """
    if obj is None:
        return
    elif isinstance(obj, dict):
        for key, value in six.iteritems(obj):
            if key == "URI":
                x.append(e(value))
            filterObjs(value, x)
    elif isinstance(obj, list):
        for item in obj:
            filterObjs(item, x)
    else:
        # Everything below is a leaf with nothing to collect.
        is_leaf = (
            isinstance(obj, (six.string_types, six.binary_type))
            or isinstance(obj, PDFStream)
            or isinstance(obj, PDFObjRef)
            or isinstance(obj, PSKeyword)
            or isinstance(obj, PSLiteral)
            or isnumber(obj)
        )
        if not is_leaf:
            raise TypeError(obj)
 def render_string_horizontal(self, seq, matrix, pos,
                              font, fontsize, scaling, charspace, wordspace,
                              rise, dxscale, ncs, graphicstate):
     """Render the items of ``seq`` left to right, starting at ``pos``.

     Numeric items shift x by ``-item * dxscale``.  Every other item is
     decoded with ``font``; each glyph is rendered via ``self.render_char``
     and reported through ``self.push_char`` together with its source byte.
     A final ``push_char(None, ...)`` call marks the end of rendering.

     Returns:
         The ``(x, y)`` position after the last item.
     """
     x, y = pos
     pending_charspace = False
     # Font height estimate: the font size multiplied by the height
     # scaling component of the text matrix.
     h_est = fontsize * matrix[3]
     for item in seq:
         if utils.isnumber(item):
             x -= item * dxscale
             pending_charspace = True
             continue
         for raw_byte, cid in zip(item, font.decode(item)):
             if pending_charspace:
                 x += charspace
             glyph_matrix = utils.translate_matrix(matrix, (x, y))
             advance = self.render_char(glyph_matrix, font, fontsize,
                                        scaling, rise, cid, ncs, graphicstate)
             self.push_char(bytes([raw_byte]), advance, (x, y),
                            h_est, font, matrix)
             x += advance
             # cid 32 gets the extra word spacing when one is set.
             if cid == 32 and wordspace:
                 x += wordspace
             pending_charspace = True
     # Push None to signal the end of rendering.
     self.push_char(None, 0, (x, y), h_est, font, matrix)
     return (x, y)
Example #4
0
    def dumpobj(self, out, obj):
        """Write ``obj`` to the binary stream ``out`` in PDF-style object syntax.

        Handled kinds, in order: ``None``, dict, list, bytes, str, bool,
        numbers, ``PDFObjRef``, ``PSKeyword`` and ``PSLiteral``.  Dicts and
        lists are written recursively.

        Args:
            out: object with a ``write`` method accepting bytes.
            obj: the value to serialize.

        Raises:
            TypeError: if ``obj`` is none of the handled kinds (this
                includes ``PDFStream``, which is not supported here).
        """
        if obj is None:
            out.write(b'null ')
            return

        if isinstance(obj, dict):
            out.write(b'<<')
            for (k, v) in obj.items():
                out.write(b'/%s ' % bytes(k, 'utf-8'))
                self.dumpobj(out, v)
            out.write(b'>>')
            return

        if isinstance(obj, list):
            out.write(b'[')
            for v in obj:
                self.dumpobj(out, v)
            out.write(b']')
            return

        if isinstance(obj, bytes):
            # NOTE(review): the bytes are written verbatim, with no escaping
            # of parentheses or backslashes - confirm callers never pass any.
            out.write(b'(')
            out.write(obj)
            out.write(b')')
            return

        if isinstance(obj, str):
            out.write(b'(')
            out.write(bytes(e(obj), 'utf-8'))
            out.write(b')')
            return

        # bool must be tested before the number check: bool is a subclass
        # of int and would otherwise be written as 1 / 0.
        if isinstance(obj, bool):
            if obj:
                out.write(b'true ')
            else:
                out.write(b'false ')
            return
        if isnumber(obj):
            if isinstance(obj, float):
                # Strip the trailing zeros from the digits first and only
                # then add the separating space; stripping after the space
                # is appended removes nothing.
                s = (b'%.5f' % obj).rstrip(b'0') + b' '
            else:
                s = b'%d ' % obj
            out.write(s)
            return

        if isinstance(obj, PDFObjRef):
            out.write(b'%d 0 R ' % (obj.objid))
            return

        if isinstance(obj, PSKeyword):
            out.write(b'/%s ' % bytes(obj.name, 'utf-8'))
            return

        if isinstance(obj, PSLiteral):
            out.write(b'/%s ' % bytes(obj.name, 'utf-8'))
            return

        # PDFStream objects are not handled and fall through to the error.
        raise TypeError(obj)
def dumpxml(out, obj, mode=None):
    """Write an XML-style rendering of ``obj`` to the text stream ``out``.

    Dicts and lists are rendered recursively (the recursive calls do not
    forward ``mode``).  For a ``PDFStream``, ``mode`` selects the output:
    ``'raw'`` and ``'binary'`` send the stream's bytes to ``out.buffer``;
    any other value writes a ``<stream>`` element, which gets a ``<data>``
    child only when ``mode`` is ``'text'``.

    Raises:
        TypeError: if ``obj`` is none of the handled kinds.
    """
    if obj is None:
        out.write('<null />')

    elif isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for key, value in obj.items():
            out.write('<key>%s</key>\n' % key)
            out.write('<value>')
            dumpxml(out, value)
            out.write('</value>\n')
        out.write('</dict>')

    elif isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for item in obj:
            dumpxml(out, item)
            out.write('\n')
        out.write('</list>')

    elif isinstance(obj, bytes):
        out.write('<string size="%d">%s</string>' % (len(obj), encode(obj)))

    elif isinstance(obj, PDFStream):
        if mode == 'raw':
            # Bytes bypass the text layer and go to the underlying buffer.
            out.buffer.write(obj.get_rawdata())
        elif mode == 'binary':
            out.buffer.write(obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if mode == 'text':
                payload = obj.get_data()
                out.write('<data size="%d">%s</data>\n' %
                          (len(payload), encode(payload)))
            out.write('</stream>')

    elif isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)

    elif isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)

    elif isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)

    elif isnumber(obj):
        out.write('<number>%s</number>' % obj)

    else:
        raise TypeError(obj)
Example #6
0
def dumpxml(out, obj, codec=None):
    """Write an XML-style rendering of ``obj`` to ``out``.

    Dicts and lists are rendered recursively (the recursive calls do not
    forward ``codec``).  For a ``PDFStream``, ``codec`` selects the output:
    ``'raw'`` writes ``obj.get_rawdata()``, ``'binary'`` writes
    ``obj.get_data()``; any other value writes a ``<stream>`` element,
    which gets a ``<data>`` child only when ``codec`` is ``'text'``.

    Args:
        out: object with a ``write`` method.
        obj: the value to render.
        codec: optional stream output selector, see above.

    Raises:
        TypeError: if ``obj`` is none of the handled kinds.
    """
    if obj is None:
        out.write('<null />')
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        # items() is available on both Python 2 and 3; iteritems() only
        # exists on Python 2 dicts.
        for (k, v) in obj.items():
            out.write('<key>%s</key>\n' % k)
            out.write('<value>')
            dumpxml(out, v)
            out.write('</value>\n')
        out.write('</dict>')
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write('\n')
        out.write('</list>')
        return

    if isinstance(obj, str):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == 'raw':
            out.write(obj.get_rawdata())
        elif codec == 'binary':
            out.write(obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                data = obj.get_data()
                out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
            out.write('</stream>')
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)
        return

    if isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)
        return

    if isnumber(obj):
        out.write('<number>%s</number>' % obj)
        return

    raise TypeError(obj)
Example #7
0
def dumpxml(out, obj, codec=None):
    """Write an XML-style rendering of ``obj`` to ``out``.

    Containers are rendered recursively without forwarding ``codec``.
    For a ``PDFStream``, ``codec`` of ``"raw"`` or ``"binary"`` writes the
    stream's data directly; any other value writes a ``<stream>`` element,
    which gets a ``<data>`` child only when ``codec`` is ``"text"``.

    Raises:
        TypeError: if ``obj`` is none of the handled kinds.
    """
    if obj is None:
        out.write("<null />")

    elif isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for name, member in obj.items():
            out.write("<key>%s</key>\n" % name)
            out.write("<value>")
            dumpxml(out, member)
            out.write("</value>\n")
        out.write("</dict>")

    elif isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for member in obj:
            dumpxml(out, member)
            out.write("\n")
        out.write("</list>")

    elif isinstance(obj, (str, bytes)):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))

    elif isinstance(obj, PDFStream):
        if codec == "raw":
            out.write(obj.get_rawdata())
        elif codec == "binary":
            out.write(obj.get_data())
        else:
            out.write("<stream>\n<props>\n")
            dumpxml(out, obj.attrs)
            out.write("\n</props>\n")
            if codec == "text":
                content = obj.get_data()
                out.write(
                    '<data size="%d">%s</data>\n' % (len(content), e(content))
                )
            out.write("</stream>")

    elif isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)

    elif isinstance(obj, PSKeyword):
        out.write("<keyword>%s</keyword>" % obj.name)

    elif isinstance(obj, PSLiteral):
        out.write("<literal>%s</literal>" % obj.name)

    elif isnumber(obj):
        out.write("<number>%s</number>" % obj)

    else:
        raise TypeError(obj)
Example #8
0
def dumpxml(out, obj, codec=None):
    """Append an XML-style rendering of ``obj`` to ``out`` and return the result.

    ``out`` is extended with ``+=``.  Because that rebinds the local name
    when ``out`` is immutable (for example a ``str``), callers must use the
    return value, and the recursive calls below capture theirs.

    Containers are rendered recursively without forwarding ``codec``.
    For a ``PDFStream``, ``codec`` of ``'raw'`` or ``'binary'`` appends the
    stream's data directly; any other value appends a ``<stream>`` element,
    which gets a ``<data>`` child only when ``codec`` is ``'text'``.

    Args:
        out: accumulator the rendering is appended to.
        obj: the value to render.
        codec: optional stream output selector, see above.

    Returns:
        ``out`` with the rendering of ``obj`` appended.

    Raises:
        TypeError: if ``obj`` is none of the handled kinds.
    """
    if obj is None:
        out += '<null />'
        return out

    if isinstance(obj, dict):
        out += '<dict size="{}">\n'.format(len(obj))
        for (k, v) in obj.items():
            out += '<key>{}</key>\n'.format(k)
            out += '<value>'
            # Keep the recursive result; discarding it loses nested content.
            out = dumpxml(out, v)
            out += '</value>\n'
        out += '</dict>'
        return out

    if isinstance(obj, list):
        out += '<list size="{}">\n'.format(len(obj))
        for v in obj:
            out = dumpxml(out, v)
            out += '\n'
        out += '</list>'
        return out

    if isinstance(obj, (str, bytes)):
        out += '<string size="{}">{}</string>'.format(len(obj), e(obj))
        return out

    if isinstance(obj, PDFStream):
        if codec == 'raw':
            out += obj.get_rawdata()
        elif codec == 'binary':
            out += obj.get_data()
        else:
            out += '<stream>\n<props>\n'
            out = dumpxml(out, obj.attrs)
            out += '\n</props>\n'
            if codec == 'text':
                data = obj.get_data()
                out += '<data size="{}">{}</data>\n'.format(len(data), e(data))
            out += '</stream>'
        return out

    if isinstance(obj, PDFObjRef):
        out += '<ref id="{}" />'.format(obj.objid)
        return out

    if isinstance(obj, PSKeyword):
        out += '<keyword>{}</keyword>'.format(obj.name)
        return out

    if isinstance(obj, PSLiteral):
        out += '<literal>{}</literal>'.format(obj.name)
        return out

    if isnumber(obj):
        out += '<number>{}</number>'.format(obj)
        return out

    raise TypeError(obj)
def get_obj_type(obj):
    """Return a short label naming the kind of ``obj``.

    Returns ``None`` for ``None``; otherwise one of ``'dict'``, ``'list'``,
    ``'str'``, ``'PDFStream'``, ``'PDFObjRef'``, ``'PSKeyword'``,
    ``'PSLiteral'`` or ``'number'``.  Unrecognised objects yield the string
    ``'TypeError'`` (no exception is raised).
    """
    if obj is None:
        return None

    # Built-in containers and strings are tested first.
    for builtin_kind, label in ((dict, 'dict'), (list, 'list'), (str, 'str')):
        if isinstance(obj, builtin_kind):
            return label

    if isinstance(obj, PDFStream):
        label = 'PDFStream'
    elif isinstance(obj, PDFObjRef):
        label = 'PDFObjRef'
    elif isinstance(obj, PSKeyword):
        label = 'PSKeyword'
    elif isinstance(obj, PSLiteral):
        label = 'PSLiteral'
    elif isnumber(obj):
        label = 'number'
    else:
        label = 'TypeError'
    return label
Example #10
0
def dumpxml(obj, codec=None):
    """Return an XML-style rendering of ``obj`` as a string.

    Containers are rendered recursively without forwarding ``codec``.
    For a ``PDFStream``, ``codec`` of ``'raw'`` or ``'binary'`` returns the
    stream's data appended to an empty string; any other value returns a
    ``<stream>`` element, which gets a ``<data>`` child only when ``codec``
    is ``'text'``.

    Args:
        obj: the value to render.
        codec: optional stream output selector, see above.

    Returns:
        The rendered text.

    Raises:
        TypeError: if ``obj`` is none of the handled kinds.
    """
    res = ""
    if obj is None:
        res += '<null />'
        return res

    if isinstance(obj, dict):
        # The size attribute carries the bare entry count.
        res += '<dict size="' + str(len(obj)) + '">\n'
        # items() is available on both Python 2 and 3.
        for (k, v) in obj.items():
            res += '<key>' + k + '</key>\n'
            res += '<value>'
            res += dumpxml(v)
            res += '</value>\n'
        res += '</dict>'
        return res

    if isinstance(obj, list):
        res += '<list size="' + str(len(obj)) + '">\n'
        for v in obj:
            res += dumpxml(v)
            res += '\n'
        res += '</list>'
        return res

    if isinstance(obj, str):
        res += '<string size="' + str(len(obj)) + '">' + e(obj) + '</string>'
        return res

    if isinstance(obj, PDFStream):
        if codec == 'raw':
            res += obj.get_rawdata()
        elif codec == 'binary':
            res += obj.get_data()
        else:
            res += '<stream>\n<props>\n'
            res += dumpxml(obj.attrs)
            res += '\n</props>\n'
            if codec == 'text':
                data = obj.get_data()
                res += '<data size="' + str(len(data)) + '">' + e(data) + '</data>\n'
            res += '</stream>'
        return res

    if isinstance(obj, PDFObjRef):
        res += '<ref id="' + str(obj.objid) + '" />'
        return res

    if isinstance(obj, PSKeyword):
        res += '<keyword>' + obj.name + '</keyword>'
        return res

    if isinstance(obj, PSLiteral):
        res += '<literal>' + obj.name + '</literal>'
        return res

    if isnumber(obj):
        res += '<number>' + str(obj) + '</number>'
        return res

    raise TypeError(obj)
Example #11
0
    def dump(self, obj):
        """Return an XML-style rendering of ``obj`` as a string.

        Dict keys become element names after non-word characters are
        removed; keys that end up empty or all digits get an
        ``xml_creator_`` prefix.  Strings, stream data, keyword/literal
        names and numbers are passed to ``self.check_js`` (stream data also
        to ``self.check_swf``) while they are rendered.

        Args:
            obj: the value to render.

        Returns:
            The rendered text.

        Raises:
            TypeError: if ``obj`` is none of the handled kinds.
        """
        res = ""
        if obj is None:
            res += '<null />'
            return res

        if isinstance(obj, dict):
            # The size attribute carries the bare entry count.
            res += '<dict size="' + str(len(obj)) + '">\n'
            # items() is available on both Python 2 and 3.
            for (k, v) in obj.items():
                k = re.sub(r'\W+', '', k)
                if k.isdigit() or not k:
                    k = 'xml_creator_' + k
                res += '<' + k + '>'
                res += self.dump(v)
                res += '</' + k + '>\n'
            res += '</dict>'
            return res

        if isinstance(obj, list):
            res += '<list size="' + str(len(obj)) + '">\n'
            for v in obj:
                res += self.dump(v)
                res += '\n'
            res += '</list>'
            return res

        if isinstance(obj, str):
            self.check_js(obj)
            # Base64-encode to avoid illegal XML characters.
            # NOTE(review): the 'base64' string codec is only available on
            # Python 2; this call needs porting before running on Python 3.
            res += '<string>' + self.e(obj).encode('base64') + '</string>'
            return res

        if isinstance(obj, PDFStream):
            res += '<stream>\n'
            try:
                res += '<props>\n'
                res += self.dump(obj.attrs)
                res += '\n</props>\n'
                data = obj.get_data()
                self.check_js(str(data))
                self.check_swf(str(data))
                res += '<data size="' + str(len(data)) + '">' + self.e(
                    data).encode('base64') + '</data>\n'
            # Raised, for example, when the stream filter is unsupported;
            # the failure is recorded in the output instead of propagating.
            except Exception as exc:
                res += '<StreamException>%s</StreamException>' % str(exc)
            # Always close the tag, even after a failure.
            res += '</stream>'
            return res

        if isinstance(obj, PDFObjRef):
            res += '<ref id="' + str(obj.objid) + '" />'
            return res

        if isinstance(obj, PSKeyword):
            self.check_js(obj.name)
            res += '<keyword>' + obj.name + '</keyword>'
            return res

        if isinstance(obj, PSLiteral):
            self.check_js(obj.name)
            res += '<literal>' + obj.name + '</literal>'
            return res

        if isnumber(obj):
            self.check_js(str(obj))
            res += '<number>' + str(obj) + '</number>'
            return res

        raise TypeError(obj)
Example #12
0
def dumpxml(out, obj, codec=None):
    """Write an XML-style rendering of ``obj`` to ``out``.

    Containers are rendered recursively without forwarding ``codec``.
    ``bytes`` values are converted to text one byte per character before
    being written.  For a ``PDFStream``, ``codec`` of ``'raw'`` or
    ``'binary'`` writes the stream's data directly; any other value writes
    a ``<stream>`` element, which gets a ``<data>`` child only when
    ``codec`` is ``'text'``.

    Unrecognised objects do not raise: a diagnostic is printed instead.
    """
    if obj is None:
        out.write('<null />')

    elif isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for name, member in obj.items():
            out.write('<key>%s</key>\n' % name)
            out.write('<value>')
            dumpxml(out, member)
            out.write('</value>\n')
        out.write('</dict>')

    elif isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for member in obj:
            dumpxml(out, member)
            out.write('\n')
        out.write('</list>')

    elif isinstance(obj, str):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))

    elif isinstance(obj, bytes):
        # Added to avoid encoding errors: latin-1 maps every byte value to
        # the code point of the same number.
        text = obj.decode('latin-1')
        out.write('<string size="%d">%s</string>' % (len(text), e(text)))

    elif isinstance(obj, PDFStream):
        if codec == 'raw':
            out.write(obj.get_rawdata())
        elif codec == 'binary':
            out.write(obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                content = obj.get_data()
                out.write('<data size="%d">%s</data>\n'
                          % (len(content), e(content)))
            out.write('</stream>')

    elif isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)

    elif isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)

    elif isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)

    elif isnumber(obj):
        out.write('<number>%s</number>' % obj)

    else:
        # Deliberately best-effort: report the unknown object and carry on
        # rather than raising TypeError.
        print('Exception')
        print(obj)
        print(type(obj))
Example #13
0
def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
    """Write an XML-style rendering of ``obj`` to the text stream ``out``.

    Containers are rendered recursively without forwarding ``codec``.
    For a ``PDFStream``, ``codec`` of ``'raw'`` or ``'binary'`` emits the
    stream's bytes; any other value writes a ``<stream>`` element, which
    gets a ``<data>`` child only when ``codec`` is ``'text'``.

    Args:
        out: destination stream.
        obj: the value to render.
        codec: optional stream output selector, see above.

    Raises:
        TypeError: if ``obj`` is none of the handled kinds.
    """
    if obj is None:
        out.write('<null />')
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (k, v) in obj.items():
            out.write('<key>%s</key>\n' % k)
            out.write('<value>')
            dumpxml(out, v)
            out.write('</value>\n')
        out.write('</dict>')
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write('\n')
        out.write('</list>')
        return

    if isinstance(obj, (str, bytes)):
        out.write('<string size="%d">%s</string>' % (len(obj), escape(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec in ('raw', 'binary'):
            payload = obj.get_rawdata() if codec == 'raw' else obj.get_data()
            # Bytes cannot go through a text stream's write(); send them to
            # the underlying binary buffer when the stream exposes one.
            binary_out = getattr(out, 'buffer', None)
            if binary_out is None:
                # No binary layer available: keep the previous direct write,
                # which still works when ``out`` itself accepts bytes.
                out.write(payload)  # type: ignore [arg-type]
            else:
                # Flush pending text first so the output order is preserved.
                out.flush()
                binary_out.write(payload)
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                data = obj.get_data()
                out.write('<data size="%d">%s</data>\n' %
                          (len(data), escape(data)))
            out.write('</stream>')
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        # Likely bug: obj.name is bytes, not str
        out.write('<keyword>%s</keyword>' %
                  obj.name)  # type: ignore [str-bytes-safe]
        return

    if isinstance(obj, PSLiteral):
        # Likely bug: obj.name may be bytes, not str
        out.write('<literal>%s</literal>' %
                  obj.name)  # type: ignore [str-bytes-safe]
        return

    if isnumber(obj):
        out.write('<number>%s</number>' % obj)
        return

    raise TypeError(obj)
    def dump(self, obj):
        """Return an XML-style rendering of ``obj`` as a string.

        Dict keys become element names after non-word characters are
        removed; keys that end up empty or all digits get an
        ``xml_creator_`` prefix.  Strings, stream data, keyword/literal
        names and numbers are passed to ``self.check_js`` (stream data also
        to ``self.check_swf``) while they are rendered.

        Args:
            obj: the value to render.

        Returns:
            The rendered text.

        Raises:
            TypeError: if ``obj`` is none of the handled kinds.
        """
        res = ""
        if obj is None:
            res += '<null />'
            return res

        if isinstance(obj, dict):
            # The size attribute carries the bare entry count.
            res += '<dict size="' + str(len(obj)) + '">\n'
            # items() is available on both Python 2 and 3.
            for (k, v) in obj.items():
                k = re.sub(r'\W+', '', k)
                if k.isdigit() or not k:
                    k = 'xml_creator_' + k
                res += '<' + k + '>'
                res += self.dump(v)
                res += '</' + k + '>\n'
            res += '</dict>'
            return res

        if isinstance(obj, list):
            res += '<list size="' + str(len(obj)) + '">\n'
            for v in obj:
                res += self.dump(v)
                res += '\n'
            res += '</list>'
            return res

        if isinstance(obj, str):
            self.check_js(obj)
            # Base64-encode to avoid illegal XML characters.
            # NOTE(review): the 'base64' string codec is only available on
            # Python 2; this call needs porting before running on Python 3.
            res += '<string>' + self.e(obj).encode('base64') + '</string>'
            return res

        if isinstance(obj, PDFStream):
            res += '<stream>\n'
            try:
                res += '<props>\n'
                res += self.dump(obj.attrs)
                res += '\n</props>\n'
                data = obj.get_data()
                self.check_js(str(data))
                self.check_swf(str(data))
                res += '<data size="' + str(len(data)) + '">' + self.e(data).encode('base64') + '</data>\n'
            # Raised, for example, when the stream filter is unsupported;
            # the failure is recorded in the output instead of propagating.
            except Exception as exc:
                res += '<StreamException>%s</StreamException>' % str(exc)
            # Always close the tag, even after a failure.
            res += '</stream>'
            return res

        if isinstance(obj, PDFObjRef):
            res += '<ref id="' + str(obj.objid) + '" />'
            return res

        if isinstance(obj, PSKeyword):
            self.check_js(obj.name)
            res += '<keyword>' + obj.name + '</keyword>'
            return res

        if isinstance(obj, PSLiteral):
            self.check_js(obj.name)
            res += '<literal>' + obj.name + '</literal>'
            return res

        if isnumber(obj):
            self.check_js(str(obj))
            res += '<number>' + str(obj) + '</number>'
            return res

        raise TypeError(obj)
Example #15
0
 def show_string(self, ts, array):
     """Collect the text of one show-string array into ``self.paragraph``.

     The first component of the effective text matrix (``sx``) is compared
     with the one stored in ``self.last_state``.  When it differs, the
     paragraph gathered so far is joined with spaces and appended to
     ``self.paragraph_map`` under the previous ``sx``, and a new paragraph
     is started.

     Elements of ``array`` are then processed in order: numbers below
     ``WITHIN_WORD_MOVE_LIMIT`` end the current word, and every number
     translates ``ts.Tm``; other elements are decoded with ``ts.Tf`` into
     cids, where cid 32 ends the current word and any other cid contributes
     its unicode text.  ``ts.Tm`` is advanced after every glyph.

     Raises:
         PDFUnicodeNotDefined: when a cid has no unicode mapping and
             ``MISSING_CHAR`` is falsy.
     """
     verbose(ts)
     sentence = []
     word = []
     m = (ts.Tfs * ts.Th, 0, 0, ts.Tfs, 0, ts.Trise)
     applicable_Tm = utils.mult_matrix(m, ts.Tm)
     (sx, _, _, sy, tx, ty) = applicable_Tm
     current_state = (sx, sy, tx, ty)
     if self.last_state is None:
         self.paragraph = []
         verbose('current paragraph becomes=', self.paragraph)
     elif current_state[0] == self.last_state[0]:
         verbose('DECISION: grouping the text object to last')
     else:
         verbose('DECISION: finalizing the paragraph')
         key = self.last_state[0]
         item = self.paragraph_map.get(key, '')
         if len(item) > 0:
             # Separate the new text from what is already stored under
             # this key, keeping the stored text.
             item = item + ' '
         new_item = ' '.join(self.paragraph)
         self.paragraph_map[key] = item + new_item
         self.paragraph = []
         verbose('current paragraph becomes=', self.paragraph)
     self.last_state = current_state
     for obj in array:
         verbose("processing obj=", obj)
         if utils.isnumber(obj):
             Tj = obj
             if Tj < WITHIN_WORD_MOVE_LIMIT:
                 verbose("DECISION: new word")
                 sentence.append(''.join(word))
                 verbose('current sentence becomes=', sentence)
                 word = []
                 verbose('current word becomes=', word)
             else:
                 verbose("DECISION: move inside the current word")
             # Translate Tm along the writing direction of the font.
             if ts.Tf.is_vertical():
                 tx = 0
                 ty = ((Tj / 1000) * ts.Tfs)
             else:
                 tx = ((Tj / 1000) * ts.Tfs) * ts.Th
                 ty = 0
             ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)
         else:
             for cid in ts.Tf.decode(obj):
                 verbose("processing cid=", cid)
                 if cid == 32:
                     # cid 32 ends the word and is the only glyph that
                     # receives the word spacing.
                     applicable_Tw = ts.Tw
                     sentence.append(''.join(word))
                     verbose('current sentence becomes=', sentence)
                     word = []
                 else:
                     try:
                         text = ts.Tf.to_unichr(cid)
                     except PDFUnicodeNotDefined:
                         if MISSING_CHAR:
                             text = MISSING_CHAR
                         else:
                             raise
                     word.append(text)
                     verbose('current word becomes=', word)
                     applicable_Tw = 0
                 # Advance Tm by the glyph width plus character/word spacing.
                 w = ts.Tf.char_width(cid)
                 if ts.Tf.is_vertical():
                     tx = 0
                     ty = (w * ts.Tfs + ts.Tc + applicable_Tw)
                 else:
                     tx = (w * ts.Tfs + ts.Tc + applicable_Tw) * ts.Th
                     ty = 0
                 ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)
     # Flush the word still being built when the array ends.
     if len(word) > 0:
         sentence.append(''.join(word))
         verbose('current sentence becomes=', sentence)
         word = []
         verbose('current word becomes=', word)
     self.paragraph.append(' '.join(sentence))
     verbose('current paragraph becomes=', self.paragraph)
     return