Exemple #1
0
class PDFExploreCmd(cmd.Cmd):
    prompt = '>>> '

    def __init__(self, pdf_path):
        cmd.Cmd.__init__(self)
        self.debug = False
        self.current_obj = None
        self.pdf_path = pdf_path
        self.fp = open(pdf_path, 'rb')
        self.parser = PDFParser(self.fp)
        self.doc = PDFDocument()
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        self.doc.initialize()

    def _cached_objects(self):
        return sorted(
            list(self.doc._cached_objs.items()) +
            list(self.doc._parsed_objs.items()))

    def _get_refs(self):
        result = []

        def search(obj, objid):
            if isinstance(obj, PDFObjRef):
                result.append((objid, obj))
            elif isinstance(obj, dict):
                for value in obj.values():
                    search(value, objid)
            elif isinstance(obj, list):
                for value in obj:
                    search(value, objid)

        objs = self._cached_objects()
        for objid, obj in objs:
            search(obj, objid)
        return result

    def precmd(self, line):
        if self.debug and line.strip() not in {'debug', 'quit', 'q'}:
            import pdb
            pdb.set_trace()
        return line

    def do_debug(self, arg):
        "Toggles debug mode (perform actions in the debugger)."
        self.debug = not self.debug
        fmt = "Debug mode %s"
        mode = "ON" if self.debug else "OFF"
        print(fmt % mode)

    def do_status(self, arg):
        "Print current status, positions, etc.."
        print("Lexer pos: %d" % self.parser.lex.lexpos)
        print("File Length: %d" % self.parser.lex.lexlen)
        if self.current_obj:
            (objid, _, obj) = self.current_obj
        else:
            objid, obj = -1, "None"
        print("Current Object: %d %s" % (objid, obj))

    do_st = do_status

    def do_xref(self, arg):
        "Print out the PDF's xrefs."
        for index, xref in enumerate(self.doc.xrefs, start=1):
            print("Xref #%d (%s)" % (index, xref.__class__.__name__))
            errors = []
            for objid in xref.get_objids():
                try:
                    _, pos = xref.get_pos(objid)
                    fmt = "%5d: %8d"
                    if pos >= self.parser.lex.lexlen:
                        fmt = "%5d: " + bcolors.WARNING + "%8d" + bcolors.ENDC
                    print(fmt % (objid, pos))
                except KeyError:
                    errors.append(objid)
            if errors:
                print("Errors on %s" % ', '.join(map(str, errors)))

    @intarg()
    def do_setpos(self, arg):
        "Set the current position of the parser to the offset supplied as an argument."
        self.parser.setpos(arg)
        self.parser.reset()

    @intarg(1)
    def do_rtok(self, arg):
        "Read the next X tokens, X being the supplied argument."
        tokens = []
        try:
            for _ in range(arg):
                pos, token = self.parser.nexttoken()
                token = str(token)
                if len(token) > 20:
                    token = token[:20] + "[...(%d)]" % (len(token) - 20)
                tokens.append(token)
        except PSEOF:
            pass
        print(' '.join(tokens))
        if len(tokens) != arg:
            print("End of file reached")

    @intarg(1)
    def do_ptok(self, arg):
        "Peek the next X tokens, X being the supplied argument. Your current position will not change."
        pos = self.parser.lex.lexpos
        self.do_rtok(arg)
        self.do_setpos(pos)

    def do_robj(self, arg):
        "Read the next object and sets it as the 'current' object."
        objid, genno, obj = self.doc.readobj()
        self.current_obj = (objid, genno, obj)
        self.do_st('')

    @intarg()
    def do_sobj(self, arg):
        "Select object with ID X. The object has to have been read already."
        obj = None
        if arg in self.doc._cached_objs:
            obj = self.doc._cached_objs[arg]
        elif arg in self.doc._parsed_objs:
            obj = self.doc._parsed_objs[arg]
        else:
            print("Object hasn't been read yet.")
            strmid, index = self.doc.find_obj_ref(arg)
            if index is not None:
                print("However, our object id is in a xref")
                if strmid:
                    print("Stream ID: %d" % strmid)
                print("Position: %d" % index)
        if obj is not None:
            self.current_obj = (arg, 0, obj)
            self.do_st('')

    def do_dbgobj(self, arg):
        "Enter in debug mode with current obj as 'obj' in the local scope."
        if not self.current_obj:
            print("No current obj.")
            return
        objid, genno, obj = self.current_obj
        import pdb
        pdb.set_trace()

    def do_readall(self, arg):
        "Read all objects in the document."
        self.doc._parse_everything()
        print("Read %d objects:" % len(self.doc._cached_objs))
        self.do_whatisread('')

    def do_dumpdata(self, arg):
        "For each read stream, print out the decoded data it contains."
        objs = self._cached_objects()
        for objid, obj in objs:
            print("Dumping obj id: %d" % objid)
            print(repr(obj))
            if hasattr(obj, 'get_data'):
                print(repr(obj.get_data()))

    def do_whatisread(self, arg):
        "Prints a list of all read object ids."
        objs = self._cached_objects()
        print(repr([objid for objid, obj in objs]))

    def do_refs(self, arg):
        "Look in all read objects and find all objects that reference to our current object."
        if not self.current_obj:
            print("No current obj.")
            return

        target_id, _, _ = self.current_obj
        result = [
            parent_id for parent_id, ref in self._get_refs()
            if ref.objid == target_id
        ]
        print(repr(result))

    def do_deadrefs(self, arg):
        "Print (dead_id, host_id) for all dead references in the document."
        objs = self._cached_objects()
        objids = {objid for objid, obj in objs}
        result = [(ref.objid, parent_id)
                  for parent_id, ref in self._get_refs()
                  if ref.objid not in objids]
        print(repr(result))

    def do_quit(self, arg):
        "Quit PDFExplore"
        self.fp.close()
        sys.exit(0)

    do_q = do_quit
Exemple #2
0
class PDFExploreCmd(cmd.Cmd):
    prompt = '>>> '
    
    def __init__(self, pdf_path):
        cmd.Cmd.__init__(self)
        self.debug = False
        self.current_obj = None
        self.pdf_path = pdf_path
        self.fp = open(pdf_path, 'rb')
        self.parser = PDFParser(self.fp)
        self.doc = PDFDocument()
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        self.doc.initialize()
    
    def _cached_objects(self):
        return sorted(list(self.doc._cached_objs.items()) + list(self.doc._parsed_objs.items()))
    
    def _get_refs(self):
        result = []
        def search(obj, objid):
            if isinstance(obj, PDFObjRef):
                result.append((objid, obj))
            elif isinstance(obj, dict):
                for value in obj.values():
                    search(value, objid)
            elif isinstance(obj, list):
                for value in obj:
                    search(value, objid)
        objs = self._cached_objects()
        for objid, obj in objs:
            search(obj, objid)
        return result
    
    def precmd(self, line):
        if self.debug and line.strip() not in {'debug', 'quit', 'q'}:
            import pdb; pdb.set_trace()
        return line
    
    def do_debug(self, arg):
        "Toggles debug mode (perform actions in the debugger)."
        self.debug = not self.debug
        fmt = "Debug mode %s"
        mode = "ON" if self.debug else "OFF"
        print(fmt % mode)
    
    def do_status(self, arg):
        "Print current status, positions, etc.."
        print("Lexer pos: %d" % self.parser.lex.lexpos)
        print("File Length: %d" % self.parser.lex.lexlen)
        if self.current_obj:
            (objid, _, obj) = self.current_obj
        else:
            objid, obj = -1, "None"
        print("Current Object: %d %s" % (objid, obj))
    do_st = do_status
    
    def do_xref(self, arg):
        "Print out the PDF's xrefs."
        for index, xref in enumerate(self.doc.xrefs, start=1):
            print("Xref #%d (%s)" % (index, xref.__class__.__name__))
            errors = []
            for objid in xref.get_objids():
                try:
                    _, pos = xref.get_pos(objid)
                    fmt = "%5d: %8d"
                    if pos >= self.parser.lex.lexlen:
                        fmt = "%5d: " + bcolors.WARNING + "%8d" + bcolors.ENDC
                    print(fmt % (objid, pos))
                except KeyError:
                    errors.append(objid)
            if errors:
                print("Errors on %s" % ', '.join(map(str, errors)))
    
    @intarg()
    def do_setpos(self, arg):
        "Set the current position of the parser to the offset supplied as an argument."
        self.parser.setpos(arg)
        self.parser.reset()
    
    @intarg(1)
    def do_rtok(self, arg):
        "Read the next X tokens, X being the supplied argument."
        tokens = []
        try:
            for _ in range(arg):
                pos, token = self.parser.nexttoken()
                token = str(token)
                if len(token) > 20:
                    token = token[:20] + "[...(%d)]" % (len(token)-20)
                tokens.append(token)
        except PSEOF:
            pass
        print(' '.join(tokens))
        if len(tokens) != arg:
            print("End of file reached")
    
    @intarg(1)
    def do_ptok(self, arg):
        "Peek the next X tokens, X being the supplied argument. Your current position will not change."
        pos = self.parser.lex.lexpos
        self.do_rtok(arg)
        self.do_setpos(pos)
    
    def do_robj(self, arg):
        "Read the next object and sets it as the 'current' object."
        objid, genno, obj = self.doc.readobj()
        self.current_obj = (objid, genno, obj)
        self.do_st('')
    
    @intarg()
    def do_sobj(self, arg):
        "Select object with ID X. The object has to have been read already."
        obj = None
        if arg in self.doc._cached_objs:
            obj = self.doc._cached_objs[arg]
        elif arg in self.doc._parsed_objs:
            obj = self.doc._parsed_objs[arg]
        else:
            print("Object hasn't been read yet.")
            strmid, index = self.doc.find_obj_ref(arg)
            if index is not None:
                print("However, our object id is in a xref")
                if strmid:
                    print("Stream ID: %d" % strmid)
                print("Position: %d" % index)
        if obj is not None:
            self.current_obj = (arg, 0, obj)
            self.do_st('')
    
    def do_dbgobj(self, arg):
        "Enter in debug mode with current obj as 'obj' in the local scope."
        if not self.current_obj:
            print("No current obj.")
            return
        objid, genno, obj = self.current_obj
        import pdb; pdb.set_trace()
    
    def do_readall(self, arg):
        "Read all objects in the document."
        self.doc._parse_everything()
        print("Read %d objects:" % len(self.doc._cached_objs))
        self.do_whatisread('')
    
    def do_dumpdata(self, arg):
        "For each read stream, print out the decoded data it contains."
        objs = self._cached_objects()
        for objid, obj in objs:
            print("Dumping obj id: %d" % objid)
            print(repr(obj))
            if hasattr(obj, 'get_data'):
                print(repr(obj.get_data()))
    
    def do_whatisread(self, arg):
        "Prints a list of all read object ids."
        objs = self._cached_objects()
        print(repr([objid for objid, obj in objs]))
    
    def do_refs(self, arg):
        "Look in all read objects and find all objects that reference to our current object."
        if not self.current_obj:
            print("No current obj.")
            return
        
        target_id, _, _ = self.current_obj
        result = [parent_id for parent_id, ref in self._get_refs() if ref.objid == target_id]
        print(repr(result))
    
    def do_deadrefs(self, arg):
        "Print (dead_id, host_id) for all dead references in the document."
        objs = self._cached_objects()
        objids = {objid for objid, obj in objs}
        result = [(ref.objid, parent_id) for parent_id, ref in self._get_refs() if ref.objid not in objids]
        print(repr(result))
    
    def do_quit(self, arg):
        "Quit PDFExplore"
        self.fp.close()
        sys.exit(0)
    do_q = do_quit