Python cPDFParserの例、pdfparser.cPDFParser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: parser_contents2json.py プロジェクト: 0day1day/pdfxray_public

def contents(file):
    """Serialize the indirect objects of a PDF as a JSON string.

    Walks the elements produced by ``pdfparser.cPDFParser(file)`` and
    converts every indirect object with ``pdfparser.content2JSON``.

    Args:
        file: Passed unchanged to ``pdfparser.cPDFParser`` (presumably the
            path of the PDF to parse -- confirm against pdfparser).

    Returns:
        The ``json.dumps`` encoding of ``{'object': [...]}``, where the list
        holds one ``content2JSON`` result per indirect object, in the order
        the parser returned them.
    """
    # Elements whose ``type`` equals this code are treated as indirect objects.
    PDF_ELEMENT_INDIRECT_OBJECT = 2
    oPDFParser = pdfparser.cPDFParser(file)
    content_json_objs = []  # 9bplus

    while True:
        element = oPDFParser.GetObject()
        # GetObject() returning None ends the walk.
        if element is None:
            break
        if element.type == PDF_ELEMENT_INDIRECT_OBJECT:
            content_json_objs.append(pdfparser.content2JSON(element))

    return json.dumps({'object': content_json_objs})

コード例 #2

0

ファイルを表示

ファイル: parser_hash2json.py プロジェクト: jonz-secops/pdfxray_public

def conversion(file):
    """Serialize the hashes of a PDF's indirect objects as a JSON string.

    Walks the elements produced by ``pdfparser.cPDFParser(file)`` and
    converts every indirect object with ``pdfparser.hash2JSON``.

    Args:
        file: Passed unchanged to ``pdfparser.cPDFParser`` (presumably the
            path of the PDF to parse -- confirm against pdfparser).

    Returns:
        The ``json.dumps`` encoding of ``{'object': [...]}``, where the list
        holds one ``hash2JSON`` result per indirect object, in the order the
        parser returned them.
    """
    # Elements whose ``type`` equals this code are treated as indirect objects.
    PDF_ELEMENT_INDIRECT_OBJECT = 2
    oPDFParser = pdfparser.cPDFParser(file)
    json_objs = []  # 9bplus

    while True:
        element = oPDFParser.GetObject()
        # GetObject() returning None ends the walk.
        if element is None:
            break
        if element.type == PDF_ELEMENT_INDIRECT_OBJECT:
            json_objs.append(pdfparser.hash2JSON(element))

    return json.dumps({'object': json_objs})

コード例 #3

0

ファイルを表示

ファイル: __init__.py プロジェクト: 0x3a/crits_services

    def object_search(self, data, search_size=100):
        """
        Locate objects and references of interest

        Walks every indirect object produced by
        ``pdfparser.cPDFParser(data)`` and records two kinds of hits under
        the keys 'js' and 'file':

        - a referenced object id, when an ``objects_regex`` pattern captures
          it within the first ``search_size`` characters of the formatted
          object content and the same id is the first field of an entry
          returned by ``GetReferences()``;
        - the id of the object itself (as a string), when ``Contains()``
          reports one of the ``objects_str`` tags.

        Note: It is important that objects_str definitions do
            not detect objects found with objects_regex defs.

        @param data: value handed to ``pdfparser.cPDFParser``
        @param search_size: number of leading characters of the formatted
            content scanned by the regular expressions
        @return dictionary containing object types and object id's
            (duplicates are kept; a key is absent when nothing matched)
        """
        oPDFParser = pdfparser.cPDFParser(data)
        objects = {}
        # Compiled once here instead of on every object visited.
        objects_regex = [(r'js', re.compile(r'\/JavaScript\s(\d+)\s\d+\sR')),
                         (r'js', re.compile(r'\/JS\s(\d+)\s\d+\sR')),
                         (r'file', re.compile(r'\/F\s(\d+)\s\d+\sR'))]

        objects_str = [(r'js', '/JavaScript\n'),
                       (r'js', '/JavaScript\r\n'),
                       (r'js', '/JS\n'),
                       (r'js', '/JS\r\n'),
                       (r'file', '/F\n'),
                       (r'file', '/F\r\n')]

        #Walk the PDF objects
        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk and the hits
                # collected so far are returned.
                pdf_object = None

            if pdf_object is None:
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            #See if this PDF object has references to items of interest
            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            pdf_references = pdf_object.GetReferences()
            if pdf_references:
                searched = rawContent[:search_size]
                #Match getReferences() with objects_regex results
                for label, pattern in objects_regex:
                    for match in pattern.findall(searched):
                        for ref in pdf_references:
                            #Record found items
                            if match == ref[0]:
                                objects.setdefault(label, []).append(match)

            #Find items within the current object.
            for label, tag in objects_str:
                if pdf_object.Contains(tag):
                    objects.setdefault(label, []).append(str(pdf_object.id))
        return objects

コード例 #4

0

ファイルを表示

    def object_search(self, data, search_size=100):
        """
        Locate objects and references of interest

        Walks every indirect object produced by
        ``pdfparser.cPDFParser(data)`` and records two kinds of hits under
        the keys 'js' and 'file':

        - a referenced object id, when an ``objects_regex`` pattern captures
          it within the first ``search_size`` characters of the formatted
          object content and the same id is the first field of an entry
          returned by ``GetReferences()``;
        - the id of the object itself (as a string), when ``Contains()``
          reports one of the ``objects_str`` tags.

        Note: It is important that objects_str definitions do
            not detect objects found with objects_regex defs.

        @param data: value handed to ``pdfparser.cPDFParser``
        @param search_size: number of leading characters of the formatted
            content scanned by the regular expressions
        @return dictionary containing object types and object id's
            (duplicates are kept; a key is absent when nothing matched)
        """
        oPDFParser = pdfparser.cPDFParser(data)
        objects = {}
        # Compiled once here instead of on every object visited.
        objects_regex = [(r'js', re.compile(r'\/JavaScript\s(\d+)\s\d+\sR')),
                         (r'js', re.compile(r'\/JS\s(\d+)\s\d+\sR')),
                         (r'file', re.compile(r'\/F\s(\d+)\s\d+\sR'))]

        objects_str = [(r'js', '/JavaScript\n'), (r'js', '/JavaScript\r\n'),
                       (r'js', '/JS\n'), (r'js', '/JS\r\n'), (r'file', '/F\n'),
                       (r'file', '/F\r\n')]

        #Walk the PDF objects
        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk and the hits
                # collected so far are returned.
                pdf_object = None

            if pdf_object is None:
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            #See if this PDF object has references to items of interest
            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            pdf_references = pdf_object.GetReferences()
            if pdf_references:
                searched = rawContent[:search_size]
                #Match getReferences() with objects_regex results
                for label, pattern in objects_regex:
                    for match in pattern.findall(searched):
                        for ref in pdf_references:
                            #Record found items
                            if match == ref[0]:
                                objects.setdefault(label, []).append(match)

            #Find items within the current object.
            for label, tag in objects_str:
                if pdf_object.Contains(tag):
                    objects.setdefault(label, []).append(str(pdf_object.id))
        return objects

コード例 #5

0

ファイルを表示

ファイル: object_builder.py プロジェクト: Titotix/malpdfobj

def get_indirect_objects(file):
    """Return the indirect objects found while parsing ``file``.

    ``file`` is handed unchanged to ``pdfparser.cPDFParser``. Elements are
    pulled with ``GetObject()`` until it yields ``None``; those whose
    ``type`` equals ``PDF_ELEMENT_INDIRECT_OBJECT`` are returned as a list,
    in parse order.
    """
    parser = pdfparser.cPDFParser(file)
    # Two-argument iter() keeps calling GetObject() until it yields None.
    return [
        element
        for element in iter(parser.GetObject, None)
        if element.type == PDF_ELEMENT_INDIRECT_OBJECT
    ]

コード例 #6

0

ファイルを表示

ファイル: object_builder.py プロジェクト: Titotix/malpdfobj

def get_indirect_objects2json(file, hexa):
    """Return ``content2JSON`` conversions of the indirect objects in ``file``.

    ``file`` is handed unchanged to ``pdfparser.cPDFParser``. Elements are
    pulled with ``GetObject()`` until it yields ``None``; each one whose
    ``type`` equals ``PDF_ELEMENT_INDIRECT_OBJECT`` is converted with
    ``pdfparser.content2JSON(element, hexa)``. ``hexa`` is forwarded as-is
    (its meaning is defined by pdfparser).
    """
    parser = pdfparser.cPDFParser(file)
    # Two-argument iter() keeps calling GetObject() until it yields None.
    return [
        pdfparser.content2JSON(element, hexa)
        for element in iter(parser.GetObject, None)
        if element.type == PDF_ELEMENT_INDIRECT_OBJECT
    ]

コード例 #7

0

ファイルを表示

    def run_pdfparser(self, data):
        """
        Uses pdf-parser to get information for each object.

        For every indirect object returned by ``pdfparser.cPDFParser(data)``
        one 'pdf_parser' result is recorded through ``self._add_result``,
        with the md5 of the formatted object content as its second argument.
        The result holds the object's id, version, size, type, entropy, the
        ids it references, whether it carries a stream and the stream's md5,
        plus 'JavaScript' / 'EmbeddedFile' markers taken from
        ``self.object_search(data)``.

        @param data: value handed to ``pdfparser.cPDFParser`` and to
            ``self.object_search``
        """
        oPDFParser = pdfparser.cPDFParser(data)

        #Walk the PDF and inspect PDF objects
        found_objects = self.object_search(data)

        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk.
                pdf_object = None

            if pdf_object is None:
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            #Get general information for this PDF object
            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = self.H(rawContent)
            object_type = pdf_object.GetType()

            #Access data associated with this PDF object
            if pdf_object.ContainsStream():
                object_stream = True
                try:
                    #decompress stream using codec
                    streamContent = pdf_object.Stream()
                except Exception:
                    streamContent = "decompress failed."

                if "decompress failed." in streamContent[:50]:
                    #Provide raw stream data
                    streamContent = pdf_object.Stream('')

                #Stream returns list of object tags (not actual stream data)
                if isinstance(streamContent, list):
                    streamContent = pdfparser.FormatOutput(
                        pdf_object.content, True)
                    #Inspect pdf_object.content and extract raw stream
                    keyword_at = streamContent.find('stream')
                    stream_end = streamContent.rfind('endstream')
                    # Test the raw find() result: adding len('stream')
                    # first would hide the -1 "not found" value.
                    if keyword_at >= 0 and stream_end > 0:
                        stream_start = keyword_at + len('stream')
                        streamContent = streamContent[stream_start:stream_end]

                stream_md5_digest = hashlib.md5(streamContent).hexdigest()
            else:
                object_stream = False
                stream_md5_digest = ''

            #Collect references between this object and others
            object_references = ','.join(
                [reference[0] for reference in pdf_object.GetReferences()])

            #Get results from the object searching
            object_id = str(pdf_object.id)
            object_content = []
            if object_id in (found_objects.get('js') or ()):
                object_content.append('JavaScript')
            if object_id in (found_objects.get('file') or ()):
                object_content.append('EmbeddedFile')

            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "type": object_type,
                "entropy": section_entropy,
                "content": ','.join(object_content),
                "x_refs": object_references,
                "stream": object_stream,
                "stream_md5": stream_md5_digest,
            }
            self._add_result('pdf_parser', section_md5_digest, result)

コード例 #8

0

ファイルを表示

ファイル: pdfinfo.py プロジェクト: nbeede/multiscanner

def run(fname, data, fast=False):
    """Summarize the indirect objects of a PDF.

    Args:
        fname: Passed unchanged to ``pdfparser.cPDFParser``.
        data: Content whose first 1024 items are given to
            ``_get_pdf_version`` to fill the 'Version' statistic.
        fast: When true, the md5 and entropy of each object are skipped.

    Returns:
        A dict with two entries: 'objects' maps each object id to a dict of
        its id, version, size and type (plus md5 and entropy unless ``fast``),
        and 'stats' holds the per-type counters, 'Total' and 'Version'.
    """
    ret = {}
    ret['objects'] = {}
    ret['stats'] = {}
    object_summary = {
        'XRef': 0,
        'Catalog': 0,
        'ObjStm': 0,
        'Page': 0,
        'Metadata': 0,
        'XObject': 0,
        'Sig': 0,
        'Pages': 0,
        'FontDescriptor': 0,
        'Font': 0,
        'EmbeddedFile': 0,
        'StructTreeRoot': 0,
        'Mask': 0,
        'Group': 0,
        'Outlines': 0,
        'Action': 0,
        'Annot': 0,
        'Other_objects': 0,
        'Encoding': 0,
        'ExtGState': 0,
        'Pattern': 0,
        '3D': 0,
        'Total': 0,
        'Version': '',
    }
    # Summary entries that are not per-type counters. An object whose type
    # is '/Version' or '/Total' must not be counted against them: 'Version'
    # starts as a string and 'Total' is incremented separately below.
    reserved_keys = ('Total', 'Version')
    object_summary["Version"] = _get_pdf_version(data[:1024])
    oPDFParser = pdfparser.cPDFParser(fname)
    while True:
        try:
            pdf_object = oPDFParser.GetObject()
        except Exception:
            # Best effort: a parser failure ends the walk.
            pdf_object = None
        if pdf_object is None:
            break
        if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
            continue

        rawContent = pdfparser.FormatOutput(pdf_object.content, True)
        object_type = pdf_object.GetType()
        if not fast:
            section_md5_digest = hashlib.md5(rawContent.encode(encoding='UTF-8', errors='replace')).hexdigest()
            section_entropy = H(rawContent)
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "md5": section_md5_digest,
                "type": object_type,
                "entropy": section_entropy,
            }
        else:
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "type": object_type,
            }

        # The first character of the type is dropped before the lookup.
        type_name = object_type[1:]
        if type_name in object_summary and type_name not in reserved_keys:
            object_summary[type_name] += 1
        else:
            object_summary["Other_objects"] += 1
        object_summary["Total"] += 1
        ret['objects'][pdf_object.id] = result

    ret['stats'].update(object_summary)
    return ret

コード例 #9

0

ファイルを表示

ファイル: __init__.py プロジェクト: 0x3a/crits_services

    def run_pdfparser(self, data):
        """
        Uses pdf-parser to get information for each object.

        For every indirect object returned by ``pdfparser.cPDFParser(data)``
        one 'pdf_parser' result is recorded through ``self._add_result``,
        with the md5 of the formatted object content as its second argument.
        The result holds the object's id, version, size, type, entropy, the
        ids it references, whether it carries a stream and the stream's md5,
        plus 'JavaScript' / 'EmbeddedFile' markers taken from
        ``self.object_search(data)``.

        @param data: value handed to ``pdfparser.cPDFParser`` and to
            ``self.object_search``
        """
        oPDFParser = pdfparser.cPDFParser(data)

        #Walk the PDF and inspect PDF objects
        found_objects = self.object_search(data)

        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk.
                pdf_object = None

            if pdf_object is None:
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            #Get general information for this PDF object
            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = self.H(rawContent)
            object_type = pdf_object.GetType()

            #Access data associated with this PDF object
            if pdf_object.ContainsStream():
                object_stream = True
                try:
                    #decompress stream using codec
                    streamContent = pdf_object.Stream()
                except Exception:
                    streamContent = "decompress failed."

                if "decompress failed." in streamContent[:50]:
                    #Provide raw stream data
                    streamContent = pdf_object.Stream('')

                #Stream returns list of object tags (not actual stream data)
                if isinstance(streamContent, list):
                    streamContent = pdfparser.FormatOutput(pdf_object.content, True)
                    #Inspect pdf_object.content and extract raw stream
                    keyword_at = streamContent.find('stream')
                    stream_end = streamContent.rfind('endstream')
                    # Test the raw find() result: adding len('stream')
                    # first would hide the -1 "not found" value.
                    if keyword_at >= 0 and stream_end > 0:
                        stream_start = keyword_at + len('stream')
                        streamContent = streamContent[stream_start:stream_end]

                stream_md5_digest = hashlib.md5(streamContent).hexdigest()
            else:
                object_stream = False
                stream_md5_digest = ''

            #Collect references between this object and others
            object_references = ','.join(
                [reference[0] for reference in pdf_object.GetReferences()])

            #Get results from the object searching
            object_id = str(pdf_object.id)
            object_content = []
            if object_id in (found_objects.get('js') or ()):
                object_content.append('JavaScript')
            if object_id in (found_objects.get('file') or ()):
                object_content.append('EmbeddedFile')

            result = {
                    "obj_id":           pdf_object.id,
                    "obj_version":      pdf_object.version,
                    "size":             len(rawContent),
                    "type":             object_type,
                    "entropy":          section_entropy,
                    "content":          ','.join(object_content),
                    "x_refs":           object_references,
                    "stream":           object_stream,
                    "stream_md5":       stream_md5_digest,
            }
            self._add_result('pdf_parser', section_md5_digest, result)

コード例 #10

0

ファイルを表示

ファイル: __init__.py プロジェクト: maurakilleen/crits_services

    def _scan(self, context):
        """
        Record per-object details and per-type statistics for a PDF.

        Reads ``context.data``, walks the indirect objects returned by
        ``pdfparser.cPDFParser`` and records one 'pdf_object' result per
        object through ``self._add_result``. ``self.object_summary`` is
        rebuilt with the per-type counters, 'Total' and 'Version', and each
        of its entries is then recorded as a 'stats' result.
        """
        data = context.data
        self.object_summary = {
            'XRef':             0,
            'Catalog':          0,
            'ObjStm':           0,
            'Page':             0,
            'Metadata':         0,
            'XObject':          0,
            'Sig':              0,
            'Pages':            0,
            'FontDescriptor':   0,
            'Font':             0,
            'EmbeddedFile':     0,
            'StructTreeRoot':   0,
            'Mask':             0,
            'Group':            0,
            'Outlines':         0,
            'Action':           0,
            'Annot':            0,
            'Other_objects':    0,
            'Encoding':         0,
            'ExtGState':        0,
            'Pattern':          0,
            '3D':               0,
            'Total':            0,
            'Version':      '',
        }
        # Summary entries that are not per-type counters. An object whose
        # type is '/Version' or '/Total' must not be counted against them:
        # 'Version' starts as a string and 'Total' is incremented separately.
        reserved_keys = ('Total', 'Version')
        self.object_summary["Version"] = self._get_pdf_version(data[:1024])

        oPDFParser = pdfparser.cPDFParser(data)
        self._debug("Parsing document")
        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk.
                pdf_object = None
            if pdf_object is None:
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = self.H(rawContent)
            object_type = pdf_object.GetType()
            result = {
                    "obj_id":           pdf_object.id,
                    "obj_version":      pdf_object.version,
                    "size":             len(rawContent),
                    "md5":              section_md5_digest,
                    "type":             object_type,
                    "entropy":          section_entropy,
            }
            # The first character of the type is dropped before the lookup.
            type_name = object_type[1:]
            if type_name in self.object_summary and type_name not in reserved_keys:
                self.object_summary[type_name] += 1
            else:
                self.object_summary["Other_objects"] += 1
            self.object_summary["Total"] += 1
            self._add_result('pdf_object', pdf_object.id, result)

        for key, value in self.object_summary.items():
            item_str = "{0}: {1}".format(key, value)
            self._add_result('stats', item_str, {'type': key, 'count': value})

コード例 #11

0

ファイルを表示

ファイル: __init__.py プロジェクト: Pr0hest/crits_services

    def run(self, obj, config):
        """
        Record per-object details and per-type statistics for a PDF.

        Reads the content with ``obj.filedata.read()``, walks the indirect
        objects returned by ``pdfparser.cPDFParser`` and records one
        'pdf_object' result per object through ``self._add_result``.
        ``self.object_summary`` is rebuilt with the per-type counters,
        'Total' and 'Version', and each of its entries is then recorded as a
        'stats' result. ``config`` is not used by this method.
        """
        data = obj.filedata.read()
        self.object_summary = {
            'XRef':             0,
            'Catalog':          0,
            'ObjStm':           0,
            'Page':             0,
            'Metadata':         0,
            'XObject':          0,
            'Sig':              0,
            'Pages':            0,
            'FontDescriptor':   0,
            'Font':             0,
            'EmbeddedFile':     0,
            'StructTreeRoot':   0,
            'Mask':             0,
            'Group':            0,
            'Outlines':         0,
            'Action':           0,
            'Annot':            0,
            'Other_objects':    0,
            'Encoding':         0,
            'ExtGState':        0,
            'Pattern':          0,
            '3D':               0,
            'Total':            0,
            'Version':      '',
        }
        # Summary entries that are not per-type counters. An object whose
        # type is '/Version' or '/Total' must not be counted against them:
        # 'Version' starts as a string and 'Total' is incremented separately.
        reserved_keys = ('Total', 'Version')
        self.object_summary["Version"] = self._get_pdf_version(data[:1024])

        oPDFParser = pdfparser.cPDFParser(data)
        self._debug("Parsing document")
        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk.
                pdf_object = None
            if pdf_object is None:
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = self.H(rawContent)
            object_type = pdf_object.GetType()
            result = {
                    "obj_id":           pdf_object.id,
                    "obj_version":      pdf_object.version,
                    "size":             len(rawContent),
                    "md5":              section_md5_digest,
                    "type":             object_type,
                    "entropy":          section_entropy,
            }
            # The first character of the type is dropped before the lookup.
            type_name = object_type[1:]
            if type_name in self.object_summary and type_name not in reserved_keys:
                self.object_summary[type_name] += 1
            else:
                self.object_summary["Other_objects"] += 1
            self.object_summary["Total"] += 1
            self._add_result('pdf_object', pdf_object.id, result)

        for key, value in self.object_summary.items():
            item_str = "{0}: {1}".format(key, value)
            self._add_result('stats', item_str, {'type': key, 'count': value})

コード例 #12

0

ファイルを表示

ファイル: __init__.py プロジェクト: 9b/crits_services

    def _scan(self, context):
        """
        Record per-object details and per-type statistics for a PDF.

        Reads ``context.data``, walks the indirect objects returned by
        ``pdfparser.cPDFParser`` and records one "pdf_object" result per
        object through ``self._add_result``. ``self.object_summary`` is
        rebuilt with the per-type counters, "Total" and "Version", and each
        of its entries is then recorded as a "stats" result.
        """
        data = context.data
        self.object_summary = {
            "XRef": 0,
            "Catalog": 0,
            "ObjStm": 0,
            "Page": 0,
            "Metadata": 0,
            "XObject": 0,
            "Sig": 0,
            "Pages": 0,
            "FontDescriptor": 0,
            "Font": 0,
            "EmbeddedFile": 0,
            "StructTreeRoot": 0,
            "Mask": 0,
            "Group": 0,
            "Outlines": 0,
            "Action": 0,
            "Annot": 0,
            "Other_objects": 0,
            "Encoding": 0,
            "ExtGState": 0,
            "Pattern": 0,
            "3D": 0,
            "Total": 0,
            "Version": "",
        }
        # Summary entries that are not per-type counters. An object whose
        # type is "/Version" or "/Total" must not be counted against them:
        # "Version" starts as a string and "Total" is incremented separately.
        reserved_keys = ("Total", "Version")
        self.object_summary["Version"] = self._get_pdf_version(data[:1024])

        oPDFParser = pdfparser.cPDFParser(data)
        self._debug("Parsing document")
        while True:
            try:
                pdf_object = oPDFParser.GetObject()
            except Exception:
                # Best effort: a parser failure ends the walk.
                pdf_object = None
            if pdf_object is None:
                break
            if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
                continue

            rawContent = pdfparser.FormatOutput(pdf_object.content, True)
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = self.H(rawContent)
            object_type = pdf_object.GetType()
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "md5": section_md5_digest,
                "type": object_type,
                "entropy": section_entropy,
            }
            # The first character of the type is dropped before the lookup.
            type_name = object_type[1:]
            if type_name in self.object_summary and type_name not in reserved_keys:
                self.object_summary[type_name] += 1
            else:
                self.object_summary["Other_objects"] += 1
            self.object_summary["Total"] += 1
            self._add_result("pdf_object", pdf_object.id, result)

        for key, value in self.object_summary.items():
            item_str = "{0}: {1}".format(key, value)
            self._add_result("stats", item_str, {"type": key, "count": value})

コード例 #13

0

ファイルを表示

ファイル: pdfinfo.py プロジェクト: visaobuon/multiscanner

def run(fname, data, fast=False):
    """Summarize the indirect objects of a PDF.

    Args:
        fname: Passed unchanged to ``pdfparser.cPDFParser``.
        data: Content whose first 1024 items are given to
            ``_get_pdf_version`` to fill the 'Version' statistic.
        fast: When true, the md5 and entropy of each object are skipped.

    Returns:
        A dict with two entries: 'objects' maps each object id to a dict of
        its id, version, size and type (plus md5 and entropy unless ``fast``),
        and 'stats' holds the per-type counters, 'Total' and 'Version'.
    """
    ret = {}
    ret['objects'] = {}
    ret['stats'] = {}
    object_summary = {
        'XRef': 0,
        'Catalog': 0,
        'ObjStm': 0,
        'Page': 0,
        'Metadata': 0,
        'XObject': 0,
        'Sig': 0,
        'Pages': 0,
        'FontDescriptor': 0,
        'Font': 0,
        'EmbeddedFile': 0,
        'StructTreeRoot': 0,
        'Mask': 0,
        'Group': 0,
        'Outlines': 0,
        'Action': 0,
        'Annot': 0,
        'Other_objects': 0,
        'Encoding': 0,
        'ExtGState': 0,
        'Pattern': 0,
        '3D': 0,
        'Total': 0,
        'Version': '',
    }
    # Summary entries that are not per-type counters. An object whose type
    # is '/Version' or '/Total' must not be counted against them: 'Version'
    # starts as a string and 'Total' is incremented separately below.
    reserved_keys = ('Total', 'Version')
    object_summary["Version"] = _get_pdf_version(data[:1024])
    oPDFParser = pdfparser.cPDFParser(fname)
    while True:
        try:
            pdf_object = oPDFParser.GetObject()
        except Exception:
            # Best effort: a parser failure ends the walk.
            pdf_object = None
        if pdf_object is None:
            break
        if pdf_object.type != pdfparser.PDF_ELEMENT_INDIRECT_OBJECT:
            continue

        rawContent = pdfparser.FormatOutput(pdf_object.content, True)
        if PY3:
            # When PY3 is set the content is encoded first, so the size
            # below is the length of the encoded value.
            rawContent = rawContent.encode('utf-8', 'replace')
        object_type = pdf_object.GetType()
        if not fast:
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = H(rawContent)
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "md5": section_md5_digest,
                "type": object_type,
                "entropy": section_entropy,
            }
        else:
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "type": object_type,
            }

        # The first character of the type is dropped before the lookup.
        type_name = object_type[1:]
        if type_name in object_summary and type_name not in reserved_keys:
            object_summary[type_name] += 1
        else:
            object_summary["Other_objects"] += 1
        object_summary["Total"] += 1
        ret['objects'][pdf_object.id] = result

    ret['stats'].update(object_summary)
    return ret