Example #1
0
    def object_search(self, data, search_size=100):
        """
        Locate objects and references of interest.

        Walks every indirect object of the PDF and records, per category
        ('js' or 'file'), the ids of the objects of interest:

        - objects_regex: a tag followed by an indirect reference
          (e.g. "/JS 12 0 R"). The referenced id is recorded once for
          every entry of the object's GetReferences() list it equals.
        - objects_str: a tag immediately followed by a line break. The
          id of the object containing the tag is recorded.

        Note: It is important that objects_str definitions do
            not detect objects found with objects_regex defs.

        @param data: PDF input handed to pdfparser.cPDFParser
        @param search_size: number of leading characters of each
            object's formatted content the regexes are applied to
        @return dictionary mapping a category name to a list of object
            ids; an id may be listed more than once
        """
        oPDFParser = pdfparser.cPDFParser(data)
        objects = {}
        # Compiled once here instead of being looked up for every object.
        objects_regex = [
            (r'js', re.compile(r'\/JavaScript\s(\d+)\s\d+\sR')),
            (r'js', re.compile(r'\/JS\s(\d+)\s\d+\sR')),
            (r'file', re.compile(r'\/F\s(\d+)\s\d+\sR')),
        ]

        objects_str = [(r'js', '/JavaScript\n'), (r'js', '/JavaScript\r\n'),
                       (r'js', '/JS\n'), (r'js', '/JS\r\n'), (r'file', '/F\n'),
                       (r'file', '/F\r\n')]

        #Walk the PDF objects
        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk and whatever
                # was collected so far is still returned.
                break
            if pdf_object is None:
                # The parser has no more objects to hand out.
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            #See if this PDF object has references to items of interest
            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            pdf_references = pdf_object.GetReferences()
            if pdf_references:
                searched = rawContent[:search_size]
                #Match GetReferences() with objects_regex results
                for category, pattern in objects_regex:
                    for match in pattern.findall(searched):
                        for ref in pdf_references:
                            #Record found items
                            if match == ref[0]:
                                objects.setdefault(category, []).append(match)

            #Find items within the current object.
            for category, marker in objects_str:
                if pdf_object.Contains(marker):
                    objects.setdefault(category, []).append(
                        str(pdf_object.id))
        return objects
Example #2
0
    def _scan(self, context):
        """
        Report every indirect object of the PDF plus per-type counters.

        For each indirect object a 'pdf_object' result is added holding
        its id, version, formatted-content size, MD5, reported type and
        the value of self.H() for the content. Afterwards one 'stats'
        result is added per entry of self.object_summary.

        @param context: object whose .data attribute holds the PDF; the
            first 1024 items of it go to self._get_pdf_version
        """
        data = context.data
        self.object_summary = {
            'XRef':             0,
            'Catalog':          0,
            'ObjStm':           0,
            'Page':             0,
            'Metadata':         0,
            'XObject':          0,
            'Sig':              0,
            'Pages':            0,
            'FontDescriptor':   0,
            'Font':             0,
            'EmbeddedFile':     0,
            'StructTreeRoot':   0,
            'Mask':             0,
            'Group':            0,
            'Outlines':         0,
            'Action':           0,
            'Annot':            0,
            'Other_objects':    0,
            'Encoding':         0,
            'ExtGState':        0,
            'Pattern':          0,
            '3D':               0,
            'Total':            0,
            'Version':      '',
        }
        self.object_summary["Version"] = self._get_pdf_version(data[:1024])
        # Bookkeeping entries that are not per-type counters. A document
        # is free to declare an object type with one of these names, so
        # they must never be matched as a type: 'Version' does not hold a
        # counter and 'Total' would be incremented twice.
        reserved = ('Other_objects', 'Total', 'Version')

        oPDFParser = pdfparser.cPDFParser(data)
        self._debug("Parsing document")
        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk; everything
                # gathered so far is still reported below.
                break
            if pdf_object is None:
                # The parser has no more objects to hand out.
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = self.H(rawContent)
            object_type = pdf_object.GetType()
            result = {
                    "obj_id":           pdf_object.id,
                    "obj_version":      pdf_object.version,
                    "size":             len(rawContent),
                    "md5":              section_md5_digest,
                    "type":             object_type,
                    "entropy":          section_entropy,
            }
            # The first character of the reported type is dropped before
            # the lookup (presumably the leading '/' of a PDF name --
            # confirm against pdfparser's GetType()).
            type_name = object_type[1:]
            if type_name in self.object_summary and type_name not in reserved:
                self.object_summary[type_name] += 1
            else:
                self.object_summary["Other_objects"] += 1
            self.object_summary["Total"] += 1
            self._add_result('pdf_object', pdf_object.id, result)

        for name, count in self.object_summary.items():
            item_str = "{0}: {1}".format(name, count)
            self._add_result('stats', item_str, {'type': name, 'count': count})
Example #3
0
    def run_pdfparser(self, data):
        """
        Uses pdf-parser to get information for each object.

        Every indirect object yields one 'pdf_parser' result, keyed by
        the MD5 of its formatted content, holding: id, version, size,
        reported type, the value of self.H() for the content, the
        categories self.object_search() found for it, the ids it
        references, whether it carries a stream, and the stream's MD5
        ('' when there is no stream).

        @param data: PDF input handed to pdfparser.cPDFParser and to
            self.object_search
        """
        oPDFParser = pdfparser.cPDFParser(data)

        #Walk the PDF and inspect PDF objects
        found_objects = self.object_search(data)
        # Category reported by object_search -> label used in the result.
        content_labels = (('js', 'JavaScript'), ('file', 'EmbeddedFile'))

        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk; results
                # already added are kept.
                break
            if pdf_object is None:
                # The parser has no more objects to hand out.
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            #Get general information for this PDF object
            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = self.H(rawContent)
            object_type = pdf_object.GetType()

            #Access data associated with this PDF object
            if pdf_object.ContainsStream():
                object_stream = True
                try:
                    #decompress stream using codec
                    streamContent = pdf_object.Stream()
                except Exception:
                    streamContent = "decompress failed."

                if "decompress failed." in streamContent[:50]:
                    #Provide raw stream data
                    streamContent = pdf_object.Stream('')

                #Stream returns list of object tags (not actual stream data)
                if isinstance(streamContent, list):
                    streamContent = pdfparser.FormatOutput(
                        pdf_object.content, True)
                    #Inspect pdf_object.content and extract raw stream
                    # Test the raw find() result: once the keyword length
                    # has been added, a "not found" (-1) can no longer be
                    # told apart from a real offset.
                    keyword_at = streamContent.find('stream')
                    stream_end = streamContent.rfind('endstream')
                    if keyword_at >= 0 and stream_end > 0:
                        stream_start = keyword_at + len('stream')
                        streamContent = streamContent[
                            stream_start:stream_end]

                stream_md5_digest = hashlib.md5(streamContent).hexdigest()
            else:
                object_stream = False
                stream_md5_digest = ''

            #Collect references between this object and others
            object_references = ','.join(
                reference[0] for reference in pdf_object.GetReferences())

            #Get results from the object searching
            object_id = str(pdf_object.id)
            object_content = [
                label for category, label in content_labels
                if object_id in (found_objects.get(category) or ())
            ]

            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "type": object_type,
                "entropy": section_entropy,
                "content": ','.join(object_content),
                "x_refs": object_references,
                "stream": object_stream,
                "stream_md5": stream_md5_digest,
            }
            self._add_result('pdf_parser', section_md5_digest, result)
Example #4
0
def run(fname, data, fast=False):
    """
    Summarise the indirect objects of a PDF.

    fname is handed to pdfparser.cPDFParser; the first 1024 items of
    data are handed to _get_pdf_version. When fast is true the MD5 and
    the H() value of each object's content are not computed.

    Returns a dict with two entries:
      'objects': object id -> dict with "obj_id", "obj_version", "size"
                 and "type", plus "md5" and "entropy" unless fast
      'stats':   per-type object counters plus 'Other_objects', 'Total'
                 and 'Version'
    """
    ret = {'objects': {}, 'stats': {}}
    object_summary = {
        'XRef': 0,
        'Catalog': 0,
        'ObjStm': 0,
        'Page': 0,
        'Metadata': 0,
        'XObject': 0,
        'Sig': 0,
        'Pages': 0,
        'FontDescriptor': 0,
        'Font': 0,
        'EmbeddedFile': 0,
        'StructTreeRoot': 0,
        'Mask': 0,
        'Group': 0,
        'Outlines': 0,
        'Action': 0,
        'Annot': 0,
        'Other_objects': 0,
        'Encoding': 0,
        'ExtGState': 0,
        'Pattern': 0,
        '3D': 0,
        'Total': 0,
        'Version': '',
    }
    object_summary["Version"] = _get_pdf_version(data[:1024])
    # Bookkeeping entries that are not per-type counters. A document is
    # free to declare an object type with one of these names, so they
    # must never be matched as a type: 'Version' does not hold a counter
    # and 'Total' would be incremented twice.
    reserved = ('Other_objects', 'Total', 'Version')

    oPDFParser = pdfparser.cPDFParser(fname)
    while True:
        try:
            pdf_object = oPDFParser.GetObject()
        except Exception:
            # Best effort: a parser failure ends the walk; everything
            # gathered so far is still returned.
            break
        if pdf_object is None:
            # The parser has no more objects to hand out.
            break
        if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
            continue

        rawContent = pdfparser.FormatOutput(pdf_object.content, True)
        if PY3:
            # Encoded before hashing and measuring, so "size" is the
            # encoded length when PY3 is set.
            rawContent = rawContent.encode('utf-8', 'replace')
        object_type = pdf_object.GetType()
        if not fast:
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = H(rawContent)
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "md5": section_md5_digest,
                "type": object_type,
                "entropy": section_entropy,
            }
        else:
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "type": object_type,
            }

        # The first character of the reported type is dropped before the
        # lookup (presumably the leading '/' of a PDF name -- confirm
        # against pdfparser's GetType()).
        type_name = object_type[1:]
        if type_name in object_summary and type_name not in reserved:
            object_summary[type_name] += 1
        else:
            object_summary["Other_objects"] += 1
        object_summary["Total"] += 1
        ret['objects'][pdf_object.id] = result

    ret['stats'].update(object_summary)
    return ret