def object_search(self, data, search_size=100):
    """
    Locate PDF objects and references of interest.

    Every indirect object of the document is visited. Per category
    ('js' or 'file') the ids of matching objects are collected, either
    because the object references a tagged target or because the object
    itself contains one of the plain-string tags.

    @param data: PDF input, handed unchanged to pdfparser.cPDFParser
    @param search_size: number of leading characters of each object's raw
                        content that is scanned with the reference regexes
    @return dictionary mapping a category name to a list of object id
            strings (an id may appear more than once)

    Note: It is important that the plain-string tag definitions do not
    detect objects already found with the regex definitions.
    """
    parser = pdfparser.cPDFParser(data)
    found = {}

    # (category, regex): a tag followed by "<number> <number> R"; the first
    # number is captured and later compared with the object's references.
    reference_patterns = [
        (r'js', r'\/JavaScript\s(\d+)\s\d+\sR'),
        (r'js', r'\/JS\s(\d+)\s\d+\sR'),
        (r'file', r'\/F\s(\d+)\s\d+\sR'),
    ]
    # (category, literal): a tag immediately followed by a line break.
    inline_tags = [
        (r'js', '/JavaScript\n'),
        (r'js', '/JavaScript\r\n'),
        (r'js', '/JS\n'),
        (r'js', '/JS\r\n'),
        (r'file', '/F\n'),
        (r'file', '/F\r\n'),
    ]

    # Walk the PDF objects until the parser runs dry (or fails).
    while True:
        try:
            current = parser.GetObject()
        except Exception:
            current = None
        if current is None:
            break
        if current.type not in (pdfparser.PDF_ELEMENT_INDIRECT_OBJECT,):
            continue

        # See if this PDF object has references to items of interest.
        raw_content = pdfparser.FormatOutput(current.content, True)
        references = current.GetReferences()
        if references:
            head = raw_content[:search_size]
            # Keep only regex hits that the parser also reports as references.
            for category, pattern in reference_patterns:
                for referenced_id in re.findall(pattern, head):
                    for reference in references:
                        if referenced_id == reference[0]:
                            found.setdefault(category, []).append(
                                referenced_id)

        # Find items within the current object itself.
        for category, tag in inline_tags:
            if current.Contains(tag):
                found.setdefault(category, []).append(str(current.id))

    return found
def _scan(self, context):
    """
    Parse the PDF in context.data and report per-object details and counts.

    For every indirect object one 'pdf_object' result is added, holding the
    object's id, version, raw-content size, md5, type and the value of
    self.H() over the raw content. After the walk one 'stats' result is
    added per entry of self.object_summary.

    @param context: scan context; only its ``data`` attribute is read here
    @return None; output goes through self._add_result and
            self.object_summary
    """
    data = context.data
    self.object_summary = {
        'XRef': 0,
        'Catalog': 0,
        'ObjStm': 0,
        'Page': 0,
        'Metadata': 0,
        'XObject': 0,
        'Sig': 0,
        'Pages': 0,
        'FontDescriptor': 0,
        'Font': 0,
        'EmbeddedFile': 0,
        'StructTreeRoot': 0,
        'Mask': 0,
        'Group': 0,
        'Outlines': 0,
        'Action': 0,
        'Annot': 0,
        'Other_objects': 0,
        'Encoding': 0,
        'ExtGState': 0,
        'Pattern': 0,
        '3D': 0,
        'Total': 0,
        'Version': '',
    }
    # Bookkeeping entries of the summary that are not PDF object types. An
    # object whose /Type happens to be one of these names must not touch
    # them: 'Version' is not a counter and 'Total' is incremented separately.
    reserved_keys = ('Total', 'Version')

    self.object_summary["Version"] = self._get_pdf_version(data[:1024])
    oPDFParser = pdfparser.cPDFParser(data)
    self._debug("Parsing document")

    while True:
        try:
            pdf_object = oPDFParser.GetObject()
        except Exception:
            # Best effort: a parser failure is treated like end of document.
            pdf_object = None
        if pdf_object is None:
            break
        if pdf_object.type not in [pdfparser.PDF_ELEMENT_INDIRECT_OBJECT]:
            continue

        rawContent = pdfparser.FormatOutput(pdf_object.content, True)
        # hashlib rejects text input on Python 3, so hash an encoded copy;
        # byte strings are passed through untouched.
        hash_input = rawContent
        if isinstance(hash_input, type(u'')):
            hash_input = hash_input.encode('utf-8', 'replace')
        section_md5_digest = hashlib.md5(hash_input).hexdigest()
        section_entropy = self.H(rawContent)
        object_type = pdf_object.GetType()
        result = {
            "obj_id": pdf_object.id,
            "obj_version": pdf_object.version,
            "size": len(rawContent),
            "md5": section_md5_digest,
            "type": object_type,
            "entropy": section_entropy,
        }

        # The first character of the type is dropped before the lookup.
        type_name = object_type[1:]
        if (type_name in self.object_summary
                and type_name not in reserved_keys):
            self.object_summary[type_name] += 1
        else:
            self.object_summary["Other_objects"] += 1
        self.object_summary["Total"] += 1
        self._add_result('pdf_object', pdf_object.id, result)

    for name, count in self.object_summary.items():
        item_str = "{0}: {1}".format(name, count)
        self._add_result('stats', item_str, {'type': name, 'count': count})
def run_pdfparser(self, data):
    """
    Uses pdf-parser to get information for each object.

    The document is walked twice: self.object_search() first collects the
    ids of objects tagged as JavaScript or embedded file, then every
    indirect object is inspected and reported as a 'pdf_parser' result
    keyed by the md5 of its raw content.

    @param data: PDF input, handed to pdfparser.cPDFParser and to
                 self.object_search
    @return None; results are published through self._add_result
    """

    def as_bytes(value):
        # hashlib rejects text input on Python 3; encode text and pass
        # anything else (byte strings, buffers) through unchanged.
        if isinstance(value, type(u'')):
            return value.encode('utf-8', 'replace')
        return value

    stream_keyword = 'stream'

    oPDFParser = pdfparser.cPDFParser(data)
    found_objects = self.object_search(data)
    # Build the lookup sets once instead of scanning lists per object.
    js_ids = set(found_objects.get('js') or ())
    file_ids = set(found_objects.get('file') or ())

    # Walk the PDF and inspect PDF objects
    while True:
        try:
            pdf_object = oPDFParser.GetObject()
        except Exception:
            # Best effort: a parser failure is treated like end of document.
            pdf_object = None
        if pdf_object is None:
            break
        if pdf_object.type not in [pdfparser.PDF_ELEMENT_INDIRECT_OBJECT]:
            continue

        # Get general information for this PDF object
        rawContent = pdfparser.FormatOutput(pdf_object.content, True)
        section_md5_digest = hashlib.md5(as_bytes(rawContent)).hexdigest()
        section_entropy = self.H(rawContent)
        object_type = pdf_object.GetType()

        # Access data associated with this PDF object
        if pdf_object.ContainsStream():
            object_stream = True
            try:
                # decompress stream using codec
                streamContent = pdf_object.Stream()
            except Exception:
                streamContent = "decompress failed."

            # NOTE(review): this test assumes Stream() yields text or a
            # list; confirm what pdfparser returns on Python 3.
            if "decompress failed." in streamContent[:50]:
                # Provide raw stream data
                streamContent = pdf_object.Stream('')

            # Stream returned a list of object tags (not actual stream
            # data): cut the raw stream out of the object's content.
            if isinstance(streamContent, list):
                streamContent = pdfparser.FormatOutput(
                    pdf_object.content, True)
                # Test the find() result itself; adding the keyword length
                # first would hide the -1 "not found" marker.
                keyword_at = streamContent.find(stream_keyword)
                stream_end = streamContent.rfind('endstream')
                if keyword_at >= 0 and stream_end > 0:
                    stream_start = keyword_at + len(stream_keyword)
                    streamContent = streamContent[stream_start:stream_end]

            stream_md5_digest = hashlib.md5(
                as_bytes(streamContent)).hexdigest()
        else:
            object_stream = False
            stream_md5_digest = ''

        # Collect references between this object and others
        object_references = ','.join(
            [reference[0]
             for reference in (pdf_object.GetReferences() or ())])

        # Get results from the object searching
        object_key = str(pdf_object.id)
        object_content = []
        if object_key in js_ids:
            object_content.append('JavaScript')
        if object_key in file_ids:
            object_content.append('EmbeddedFile')

        result = {
            "obj_id": pdf_object.id,
            "obj_version": pdf_object.version,
            "size": len(rawContent),
            "type": object_type,
            "entropy": section_entropy,
            "content": ','.join(object_content),
            "x_refs": object_references,
            "stream": object_stream,
            "stream_md5": stream_md5_digest,
        }
        self._add_result('pdf_parser', section_md5_digest, result)
def run(fname, data, fast=False):
    """
    Parse the PDF named by fname and return object details and type counts.

    @param fname: handed to pdfparser.cPDFParser to read the document
    @param data: document content; only data[:1024] is used here, for
                 version detection through _get_pdf_version
    @param fast: when true, the md5 and entropy of each object are skipped
    @return dict with two entries:
            'objects' - object id -> details dict (obj_id, obj_version,
                        size, type, plus md5 and entropy unless fast)
            'stats'   - copy of the summary: per-type counts, 'Total',
                        'Other_objects' and 'Version'
    """
    ret = {'objects': {}, 'stats': {}}
    object_summary = {
        'XRef': 0,
        'Catalog': 0,
        'ObjStm': 0,
        'Page': 0,
        'Metadata': 0,
        'XObject': 0,
        'Sig': 0,
        'Pages': 0,
        'FontDescriptor': 0,
        'Font': 0,
        'EmbeddedFile': 0,
        'StructTreeRoot': 0,
        'Mask': 0,
        'Group': 0,
        'Outlines': 0,
        'Action': 0,
        'Annot': 0,
        'Other_objects': 0,
        'Encoding': 0,
        'ExtGState': 0,
        'Pattern': 0,
        '3D': 0,
        'Total': 0,
        'Version': '',
    }
    # Bookkeeping entries of the summary that are not PDF object types. An
    # object whose /Type happens to be one of these names must not touch
    # them: 'Version' is not a counter and 'Total' is incremented separately.
    reserved_keys = ('Total', 'Version')

    object_summary["Version"] = _get_pdf_version(data[:1024])
    oPDFParser = pdfparser.cPDFParser(fname)

    while True:
        try:
            pdf_object = oPDFParser.GetObject()
        except Exception:
            # Best effort: a parser failure is treated like end of document.
            pdf_object = None
        if pdf_object is None:
            break
        if pdf_object.type not in [pdfparser.PDF_ELEMENT_INDIRECT_OBJECT]:
            continue

        rawContent = pdfparser.FormatOutput(pdf_object.content, True)
        if PY3:
            # hashlib needs bytes on Python 3; size is then a byte count.
            rawContent = rawContent.encode('utf-8', 'replace')
        object_type = pdf_object.GetType()

        if not fast:
            section_md5_digest = hashlib.md5(rawContent).hexdigest()
            section_entropy = H(rawContent)
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "md5": section_md5_digest,
                "type": object_type,
                "entropy": section_entropy,
            }
        else:
            result = {
                "obj_id": pdf_object.id,
                "obj_version": pdf_object.version,
                "size": len(rawContent),
                "type": object_type,
            }

        # The first character of the type is dropped before the lookup.
        type_name = object_type[1:]
        if type_name in object_summary and type_name not in reserved_keys:
            object_summary[type_name] += 1
        else:
            object_summary["Other_objects"] += 1
        object_summary["Total"] += 1
        ret['objects'][pdf_object.id] = result

    ret['stats'].update(object_summary)
    return ret