def test_frommime(self):
    """Construction with an explicit MIME type.

    Opens the first sample file three ways (by path, from an open file
    object, from its raw bytes), always passing ``mime_type``, and checks
    that each result is a ``WordprocessingDocument`` reporting that type.
    """
    mime_type = ct.CT_WORDPROC_DOCX_PUBLIC
    test_file_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
    failure_msg = "Failed to create with mime type %s" % mime_type

    # From file path
    doc = openxmllib.openXmlDocument(path=test_file_path, mime_type=mime_type)
    self.assertTrue(
        isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
        failure_msg)
    self.assertEqual(doc.mimeType, mime_type)

    # From file object; the context manager closes the handle even when
    # the factory raises.
    with open(test_file_path, 'rb') as fh:
        doc = openxmllib.openXmlDocument(file_=fh, mime_type=mime_type)
    self.assertTrue(
        isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
        failure_msg)
    self.assertEqual(doc.mimeType, mime_type)

    # From file content
    with open(test_file_path, 'rb') as fh:
        content = fh.read()
    doc = openxmllib.openXmlDocument(data=content, mime_type=mime_type)
    self.assertTrue(
        isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
        failure_msg)
    self.assertEqual(doc.mimeType, mime_type)
    return
def setUp(self):
    """Open every regular sample file and every cover sample file."""

    def open_all(filenames):
        # One document per fixture file name, in list order.
        return [
            openxmllib.openXmlDocument(path=os.path.join(TEST_FILES_IN, name))
            for name in filenames
        ]

    self.docs = open_all(ALL_IN_FILES)
    self.coverdocs = open_all(ALL_IN_COVER_FILES)
def test_frommime(self):
    """Construction with an explicit MIME type.

    Opens the first sample file three ways (by path, from an open file
    object, from its raw bytes), always passing ``mime_type``, and checks
    that each result is a ``WordprocessingDocument`` reporting that type.
    """
    mime_type = ct.CT_WORDPROC_DOCX_PUBLIC
    test_file_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
    failure_msg = "Failed to create with mime type %s" % mime_type

    # From file path
    doc = openxmllib.openXmlDocument(path=test_file_path, mime_type=mime_type)
    # assertTrue/assertEqual replace the failUnless* aliases, which newer
    # unittest versions no longer provide.
    self.assertTrue(
        isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
        failure_msg)
    self.assertEqual(doc.mimeType, mime_type)

    # From file object; open() works on Python 2 and 3 (file() is Python 2
    # only) and the context manager closes the handle even on failure.
    with open(test_file_path, 'rb') as fh:
        doc = openxmllib.openXmlDocument(file_=fh, mime_type=mime_type)
    self.assertTrue(
        isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
        failure_msg)
    self.assertEqual(doc.mimeType, mime_type)

    # From file content
    with open(test_file_path, 'rb') as fh:
        content = fh.read()
    doc = openxmllib.openXmlDocument(data=content, mime_type=mime_type)
    self.assertTrue(
        isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
        failure_msg)
    self.assertEqual(doc.mimeType, mime_type)
    return
def setUp(self):
    """Open the first sample document and the first sample template."""
    document_name = ALL_IN_FILES[0]
    self.doc = openxmllib.openXmlDocument(
        path=os.path.join(TEST_FILES_IN, document_name))

    template_name = ALL_IN_TEMPLATE_FILES[0]
    self.template = openxmllib.openXmlDocument(
        path=os.path.join(TEST_FILES_IN, template_name))
def get_metadata(self):
    """Build and return the metadata of the file at ``self.path``.

    A fresh ``Metadata`` object is stored on ``self.metadata``, filled with
    the document's ``allProperties`` under the ``"ooxml"`` label, and
    returned.
    """
    self.metadata = Metadata()
    ooxml_properties = openxmllib.openXmlDocument(path=self.path).allProperties
    self.metadata.add(ooxml_properties, "ooxml")
    return self.metadata
def handleEvent(self, event):
    """Fetch the file an event points at and publish its metadata.

    ``event.data`` is treated as a URL. It is handled once only (tracked in
    ``self.results``). For every configured extension in
    ``self.opts['fileexts']`` whose ``.ext`` appears in the URL, the file is
    downloaded and parsed (PDF via ``metapdf``, pptx/docx/xlsx via
    ``openxmllib``); any metadata obtained is sent to listeners as a
    ``RAW_FILE_META_DATA`` event.

    Always returns None.
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    sf.debug("Received event, " + eventName + ", from " + srcModuleName)

    # Never process the same URL twice.
    if eventData in self.results:
        return None
    else:
        self.results.append(eventData)

    for fileExt in self.opts['fileexts']:
        if self.checkForStop():
            return None

        ext = fileExt.lower()
        if "." + ext in eventData.lower():
            # Fetch the file, allow much more time given that these files are
            # typically large.
            ret = sf.fetchUrl(eventData, timeout=self.opts['timeout'],
                              useragent=self.opts['_useragent'],
                              dontMangle=True)
            if ret['content'] is None:
                sf.error("Unable to fetch file for meta analysis: " +
                         eventData, False)
                return None

            # Suspiciously small download: report it, then still try to
            # parse. The size is read from `ret` and converted to text so
            # the message can actually be built.
            if len(ret['content']) < 1024:
                sf.error("Strange content encountered, size of " +
                         str(len(ret['content'])), False)

            meta = None
            # Based on the file extension, handle it
            if ext == "pdf":
                try:
                    data = StringIO(ret['content'])
                    meta = str(metapdf.MetaPdfReader().read_metadata(data))
                    sf.debug("Obtained meta data from " + eventData)
                except BaseException as e:
                    sf.error("Unable to parse meta data from: " +
                             eventData + "(" + str(e) + ")", False)
                    return None

            if ext in ["pptx", "docx", "xlsx"]:
                try:
                    mtype = mimetypes.guess_type(eventData)[0]
                    doc = openxmllib.openXmlDocument(data=ret['content'],
                                                     mime_type=mtype)
                    sf.debug("Office type: " + doc.mimeType)
                    meta = str(doc.allProperties)
                except ValueError as e:
                    sf.error("Unable to parse meta data from: " +
                             eventData + "(" + str(e) + ")", False)

            if meta is not None:
                evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                      self.__name__, event)
                self.notifyListeners(evt)
def handleEvent(self, event):
    """Fetch the file an event points at and publish its metadata.

    ``event.data`` is treated as a URL. It is handled once only (tracked in
    ``self.results``). For every configured extension in
    ``self.opts['fileexts']`` whose ``.ext`` appears in the URL, the file is
    downloaded and parsed (PDF via ``metapdf``, pptx/docx/xlsx via
    ``openxmllib``); any metadata obtained is sent to listeners as a
    ``RAW_FILE_META_DATA`` event.

    Always returns None.
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    sf.debug("Received event, " + eventName + ", from " + srcModuleName)

    # Never process the same URL twice.
    if eventData in self.results:
        return None
    else:
        self.results.append(eventData)

    for fileExt in self.opts['fileexts']:
        if self.checkForStop():
            return None

        ext = fileExt.lower()
        if "." + ext in eventData.lower():
            # Fetch the file, allow much more time given that these files are
            # typically large.
            ret = sf.fetchUrl(eventData, timeout=self.opts['timeout'],
                              useragent=self.opts['_useragent'],
                              dontMangle=True)
            if ret['content'] is None:
                sf.error("Unable to fetch file for meta analysis: " +
                         eventData, False)
                return None

            # Suspiciously small download: report it, then still try to
            # parse. The size is read from `ret` and converted to text so
            # the message can actually be built.
            if len(ret['content']) < 1024:
                sf.error("Strange content encountered, size of " +
                         str(len(ret['content'])), False)

            meta = None
            # Based on the file extension, handle it
            if ext == "pdf":
                try:
                    data = StringIO(ret['content'])
                    meta = str(metapdf.MetaPdfReader().read_metadata(data))
                    sf.debug("Obtained meta data from " + eventData)
                except BaseException as e:
                    sf.error("Unable to parse meta data from: " +
                             eventData + "(" + str(e) + ")", False)
                    return None

            if ext in ["pptx", "docx", "xlsx"]:
                try:
                    mtype = mimetypes.guess_type(eventData)[0]
                    doc = openxmllib.openXmlDocument(data=ret['content'],
                                                     mime_type=mtype)
                    sf.debug("Office type: " + doc.mimeType)
                    meta = str(doc.allProperties)
                except ValueError as e:
                    sf.error("Unable to parse meta data from: " +
                             eventData + "(" + str(e) + ")", False)
                except lxml.etree.XMLSyntaxError as e:
                    sf.error("Unable to parse XML within: " +
                             eventData + "(" + str(e) + ")", False)

            if meta is not None:
                evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                      self.__name__, event)
                self.notifyListeners(evt)
def metadata(self):
    """Return core and extended properties of ``self.filepath`` in one dict.

    Extended properties are applied after core properties, so on a shared
    key the extended value is the one kept.
    """
    document = openxmllib.openXmlDocument(path=self.filepath)
    merged = {}
    merged.update(document.coreProperties.items())
    merged.update(document.extendedProperties.items())
    return merged
def test_frompath(self):
    """Construction from a path.

    Every file listed in ``ALL_IN_FILES`` must open as a ``Document``.
    """
    for test_filename in ALL_IN_FILES:
        test_filepath = os.path.join(TEST_FILES_IN, test_filename)
        doc = openxmllib.openXmlDocument(test_filepath)
        # assertTrue replaces the failUnless alias, which newer unittest
        # versions no longer provide.
        self.assertTrue(isinstance(doc, openxmllib.document.Document),
                        "%s should be processed" % test_filepath)
    return
def test_frompath(self):
    """Each sample file opens from its path as a ``Document``."""
    sample_paths = (
        os.path.join(TEST_FILES_IN, name) for name in ALL_IN_FILES
    )
    for sample_path in sample_paths:
        opened = openxmllib.openXmlDocument(sample_path)
        is_document = isinstance(opened, openxmllib.document.Document)
        self.assertTrue(is_document, "%s should be processed" % sample_path)
def getIndexableValue(self, field, instance):
    """
    getIndexableValue(self, field, instance) => (possibly big) string

    Return the ZCatalog-indexable string for that type: the document's
    indexable text, encoded in the instance charset with unencodable
    characters replaced.
    """
    stored = field.get(instance)
    # NOTE(review): this value is fetched but never used; the second
    # argument given to the factory below is always self.content_types[0].
    # Confirm that is intended.
    content_type = field.getContentType(instance)
    document = openxmllib.openXmlDocument(stored.data, self.content_types[0])
    text = document.indexableText()
    return text.encode(instance.getCharset(), 'replace')
def test_xmlfile(self):
    """Working around absence of BOM support in lxml.

    ``[Content_Types].xml`` from the document's cache directory, opened
    through ``openxmllib.utils.xmlFile``, must parse to an lxml element.
    """
    from lxml import etree

    sample_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
    document = openxmllib.openXmlDocument(sample_path)
    content_types_path = os.path.join(document._cache_dir,
                                      '[Content_Types].xml')
    stream = openxmllib.utils.xmlFile(content_types_path, 'rb')
    root = etree.parse(stream).getroot()
    self.assertTrue(isinstance(root, etree._Element),
                    "Expected an XML element")
def test_fromfile(self):
    """Construction from a file object.

    Every file listed in ``ALL_IN_FILES`` is opened in binary mode and the
    open handle itself is handed to the factory.
    """
    for test_filename in ALL_IN_FILES:
        test_filepath = os.path.join(TEST_FILES_IN, test_filename)
        # open() works on Python 2 and 3 (file() is Python 2 only); the
        # context manager closes the handle even when construction fails.
        with open(test_filepath, 'rb') as fh:
            # Pass the handle, not the path: passing the path here would
            # only repeat the from-path test.
            # NOTE(review): no mime_type is given, so this relies on the
            # factory working out the type from the file object -- confirm.
            doc = openxmllib.openXmlDocument(file_=fh)
        self.assertTrue(isinstance(doc, openxmllib.document.Document),
                        "%s should be processed" % test_filepath)
    return
def showWords(self, filename):
    """Print the indexable text of *filename* and log the extraction time.

    Does nothing when ``self.checkfile(filename)`` is false. Document
    properties are excluded from the extracted text. The logged duration
    covers opening the document and extracting the text, not printing it.
    """
    if not self.checkfile(filename):
        return
    self.log(1, "Processing %s...", filename)
    start_time = time.time()
    doc = openxmllib.openXmlDocument(path=filename)
    text = doc.indexableText(include_properties=False)
    duration = time.time() - start_time
    # Parenthesised single-argument form: identical output under the
    # Python 2 print statement, and valid as the Python 3 print function.
    print(self.recode(text))
    self.log(1, "Words extracted in %s second(s)", duration)
    return
def test_fromfile(self):
    """Construction from a file object.

    Every file listed in ``ALL_IN_FILES`` is opened in binary mode and the
    open handle itself is handed to the factory.
    """
    for test_filename in ALL_IN_FILES:
        test_filepath = os.path.join(TEST_FILES_IN, test_filename)
        # The context manager closes the handle even when construction
        # fails.
        with open(test_filepath, 'rb') as fh:
            # Pass the handle, not the path: passing the path here would
            # only repeat the from-path test.
            # NOTE(review): no mime_type is given, so this relies on the
            # factory working out the type from the file object -- confirm.
            doc = openxmllib.openXmlDocument(file_=fh)
        self.assertTrue(isinstance(doc, openxmllib.document.Document),
                        "%s should be processed" % test_filepath)
    return
def test_xmlfile(self):
    """Working around absence of BOM support in lxml.

    ``[Content_Types].xml`` from the document's cache directory, opened
    through ``openxmllib.utils.xmlFile``, must parse to an lxml element.
    """
    from lxml import etree
    test_file_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
    doc = openxmllib.openXmlDocument(test_file_path)
    toc_path = os.path.join(doc._cache_dir, '[Content_Types].xml')
    fh = openxmllib.utils.xmlFile(toc_path, 'rb')
    xml = etree.parse(fh)
    # assertTrue replaces the failUnless alias, which newer unittest
    # versions no longer provide.
    self.assertTrue(isinstance(xml.getroot(), etree._Element),
                    "Expected an XML element")
    return
def _getOpenXmlText(self, filename, c):
    """Record every property of the document *filename* under concept *c*.

    An ``allProperties`` / ``PROPERTIES`` child is added to *c*, then one
    entry per property beneath it (property value first, property name
    second). Any exception is logged as an error and not re-raised.
    """
    logger.debug(u"OpenXmlText: %s" % filename)

    try:
        doc = openxmllib.openXmlDocument(path=filename)
        # Read the properties once instead of on every loop iteration.
        props = doc.allProperties
        logger.debug(u"%s\n" % (props,))

        ap = c.addConceptKeyType(u"allProperties", u"PROPERTIES")
        for x in props:
            logger.debug(u"cp %s:%s" % (x, props[x]))
            ap.addConceptKeyType(props[x], x)
    # "except ... as ..." is accepted by Python 2.6+ and Python 3; the
    # comma form is Python 2 only.
    except Exception as msg:
        logger.error(u"%s" % msg)
def convert(self, orig, data, **kwargs):
    """Store the indexable text of the document *orig* into *data*.

    Keyword arguments read:
        mimetype: type handed to the document factory; when absent it is
            guessed from ``filename``.
        filename: used only for that guess; defaults to ``'unknown.xxx'``.

    The text is encoded with ``SITE_CHARSET`` (unencodable characters
    replaced) and passed to ``data.setData``. When the factory or the
    extraction raises ``ValueError`` the error is logged with its traceback
    and *data* is set to the empty string.
    """
    mimetype = kwargs.get('mimetype')
    filename = kwargs.get('filename') or 'unknown.xxx'
    if mimetype is None:
        mimetype = mimetypes.guess_type(filename)[0]
    try:
        doc = openxmllib.openXmlDocument(orig, mimetype)
        data.setData(doc.indexableText().encode(SITE_CHARSET, 'replace'))
    # No binding needed: exc_info=True already records the exception. This
    # spelling is valid on both Python 2 and Python 3.
    except ValueError:
        # Crappy data provided to the transform.
        logger.error("Crappy file provided, returning empty text",
                     exc_info=True)
        data.setData('')
def showMetadata(self, filename):
    """Print the core, extended and custom properties of *filename*.

    Does nothing when ``self.checkfile(filename)`` is false. Each property
    is printed as ``name: value`` after both parts go through
    ``self.recode``; a heading is logged before each group.
    """
    if not self.checkfile(filename):
        return
    self.log(1, "Processing %s...", filename)
    doc = openxmllib.openXmlDocument(path=filename)

    # (heading, attribute name); attributes are looked up lazily so each
    # group is read only after its heading has been logged.
    sections = (
        ("Core properties:", "coreProperties"),
        ("Extended properties:", "extendedProperties"),
        ("Custom properties:", "customProperties"),
    )
    for heading, attr_name in sections:
        self.log(2, heading)
        for k, v in getattr(doc, attr_name).items():
            # Parenthesised single-argument form: identical output under
            # the Python 2 print statement, valid as the Python 3 function.
            print("%s: %s" % (self.recode(k), self.recode(v)))
    return
def getOpenXmlText(filename, ftype):
    """Build a ``Concepts`` tree from the document *filename*.

    The tree is created as ``Concepts(filename, ftype)`` and receives:
      * an ``allProperties`` / ``PROPERTIES`` child holding one entry per
        document property (property value first, property name second);
      * a ``TEXT`` entry holding the indexable text, properties included.

    Returns the ``Concepts`` object.
    """
    logger.info("OpenXmlText: %s" % filename)

    doc = openxmllib.openXmlDocument(path=filename)
    c = Concepts(filename, ftype)

    # Read the properties once instead of on every loop iteration.
    props = doc.allProperties
    logger.debug("%s\n" % (props,))

    ap = c.addConceptKeyType("allProperties", "PROPERTIES")
    for x in props:
        logger.info("cp %s:%s" % (x, props[x]))
        ap.addConceptKeyType(props[x], x)

    # Extract the text once and reuse it for both the log line and the tree.
    text = doc.indexableText(include_properties=True)
    logger.info("it %s\n" % (text,))
    c.addConceptKeyType(text, "TEXT")

    return c
def get_metadata(self, data, mimetype, filename):
    """Implement the utility interface.

    Opens *data* as an Open XML document (kept on ``self.doc``) and returns
    a dict with Dublin-Core-style title/subject under ``"dc"``, the
    ``keywords``, ``image`` and ``pagecount`` attributes of ``self``, the
    core ``category`` property, and the *mimetype* and *filename* given.
    """
    document = openXmlDocument(data=data, mime_type=mimetype)
    self.doc = document
    core = document.coreProperties

    result = {}
    result["dc"] = {
        "title": core.get("title"),
        "subject": core.get("subject"),
    }
    result["keywords"] = self.keywords
    result["category"] = core.get("category")
    result["image"] = self.image
    result["pagecount"] = self.pagecount
    result["mimetype"] = mimetype
    result["filename"] = filename
    return result
def get_slides(self, file_path):
    """Map each slide's creation id to its position and parsed tree.

    '/some/path' ->
    {
        '1826322': (0, <lxml.etree._ElementTree object at 0x10884fd40>),
        '9126312': (1, <lxml.etree._ElementTree object at 0x10884fd40>)
    }
    """
    p14_namespace = {
        'p14': 'http://schemas.microsoft.com/office/powerpoint/2010/main'
    }

    document = openxmllib.openXmlDocument(file_path)
    content_types_path = os.path.join(document._cache_dir,
                                      '[Content_Types].xml')
    document.content_types = contenttypes.ContentTypes(
        xmlFile(content_types_path, 'rb'))

    slide_trees = document.content_types.getTreesFor(
        document, contenttypes.CT_PRESENTATION_SLIDE)

    slides_by_id = {}
    for position, slide_tree in enumerate(slide_trees):
        # The first p14:creationId value found in the tree is the key.
        creation_ids = slide_tree.xpath('//p14:creationId/@val',
                                        namespaces=p14_namespace)
        slides_by_id[creation_ids[0]] = (position, slide_tree)
    return slides_by_id
def extract_openxml_metadata(doc_path) -> str:
    """Return a text summary of the core properties of *doc_path*.

    One line is produced per field, in this order: creator, title,
    lastModifiedBy, created, modified. Each line is the field's label
    constant followed by the property value and a newline. A property that
    is absent or empty is reported as ``NA``, so all five lines are always
    present.
    """
    core = openxmllib.openXmlDocument(path=doc_path).coreProperties

    # (label constant, core property name), in output order.
    fields = (
        (AUTHOR, "creator"),
        (TITLE, "title"),
        (LAST_SAVED_BY, "lastModifiedBy"),
        (CREATE_TIME, "created"),
        (MODIFIED_TIME, "modified"),
    )
    return "".join(
        label + (core.get(key) or NA) + "\n" for label, key in fields
    )
def extractCover(self, filename):
    """Open *filename* and return what its ``documentCover()`` gives."""
    return openxmllib.openXmlDocument(path=filename).documentCover()
import openxmllib

# Open the sample workbook, then dump its core properties, its extended
# properties and its indexable text (properties included).
doc = openxmllib.openXmlDocument(path='example.xlsx')

# Parenthesised single-argument form: identical output under the Python 2
# print statement, and valid as the Python 3 print function.
print(doc.coreProperties)
print(doc.extendedProperties)
print(doc.indexableText(include_properties=True))
def setUp(self):
    """Open the document served at ``DOCX_URL`` and keep it on ``self.doc``."""
    # It is assumed that base_http_server.py has already been started.
    remote_document = openxmllib.openXmlDocument(url=DOCX_URL)
    self.doc = remote_document
def setUp(self):
    """Open ``self.test_file_path`` and keep the result on ``self.document``."""
    source_path = self.test_file_path
    self.document = openxmllib.openXmlDocument(source_path)
def handleEvent(self, event):
    """Fetch the file an event points at and publish its metadata.

    ``event.data`` is treated as a URL and handled once only (tracked in
    ``self.results``). For every extension in ``self.opts['fileexts']``
    whose ``.ext`` appears in the URL, the file is downloaded and parsed:
    PDF through ``PyPDF2``, pptx/docx/xlsx through ``openxmllib``,
    jpg/jpeg/tiff through ``exifread``. When metadata is obtained, a
    ``RAW_FILE_META_DATA`` event is sent, followed by a ``SOFTWARE_USED``
    event if a producing application can be read from the metadata.

    Always returns None; fetch and parse failures are reported through
    ``self.sf.error`` and end the handling of this event.
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.sf.debug("Received event, " + eventName + ", from " + srcModuleName)

    # Never process the same URL twice.
    if eventData in self.results:
        return None
    else:
        self.results.append(eventData)

    for fileExt in self.opts['fileexts']:
        if self.checkForStop():
            return None

        # Substring match: ".ext" may appear anywhere in the URL.
        if "." + fileExt.lower() in eventData.lower():
            # Fetch the file, allow much more time given that these files are
            # typically large.
            ret = self.sf.fetchUrl(eventData, timeout=self.opts['timeout'],
                                   useragent=self.opts['_useragent'],
                                   dontMangle=True,
                                   sizeLimit=10000000)
            if ret['content'] is None:
                self.sf.error("Unable to fetch file for meta analysis: " +
                              eventData, False)
                return None

            # Anything under 512 bytes is rejected outright.
            if len(ret['content']) < 512:
                self.sf.error("Strange content encountered, size of " +
                              str(len(ret['content'])), False)
                return None

            # meta is the printable form, data the parsed object; both must
            # end up set for any event to be emitted below.
            meta = None
            data = None
            # Based on the file extension, handle it
            if fileExt.lower() == "pdf":
                try:
                    raw = StringIO(ret['content'])
                    #data = metapdf.MetaPdfReader().read_metadata(raw)
                    pdf = PyPDF2.PdfFileReader(raw, strict=False)
                    data = pdf.getDocumentInfo()
                    meta = str(data)
                    self.sf.debug("Obtained meta data from " + eventData)
                except BaseException as e:
                    self.sf.error("Unable to parse meta data from: " +
                                  eventData + "(" + str(e) + ")", False)
                    return None

            if fileExt.lower() in ["pptx", "docx", "xlsx"]:
                try:
                    mtype = mimetypes.guess_type(eventData)[0]
                    doc = openxmllib.openXmlDocument(data=ret['content'],
                                                     mime_type=mtype)
                    self.sf.debug("Office type: " + doc.mimeType)
                    data = doc.allProperties
                    meta = str(data)
                except ValueError as e:
                    self.sf.error("Unable to parse meta data from: " +
                                  eventData + "(" + str(e) + ")", False)
                    return None
                except lxml.etree.XMLSyntaxError as e:
                    self.sf.error("Unable to parse XML within: " +
                                  eventData + "(" + str(e) + ")", False)
                    return None
                except BaseException as e:
                    self.sf.error("Unable to process file: " +
                                  eventData + "(" + str(e) + ")", False)
                    return None

            if fileExt.lower() in ["jpg", "jpeg", "tiff"]:
                try:
                    raw = StringIO(ret['content'])
                    data = exifread.process_file(raw)
                    # No EXIF tags: move on to the next configured
                    # extension rather than ending the handler.
                    if data is None or len(data) == 0:
                        continue
                    meta = str(data)
                except BaseException as e:
                    self.sf.error("Unable to parse meta data from: " +
                                  eventData + "(" + str(e) + ")", False)
                    return None

            if meta is not None and data is not None:
                evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                      self.__name__, event)
                self.notifyListeners(evt)

                # Work out which software produced the file. Later checks
                # overwrite earlier ones, so the last matching key wins.
                val = None
                try:
                    if "/Producer" in data:
                        val = data['/Producer']

                    # NOTE(review): the else below is read as pairing with
                    # the inner "/Producer" test (creator used when no
                    # producer exists) -- confirm against the original
                    # layout.
                    if "/Creator" in data:
                        if "/Producer" in data:
                            if data['/Creator'] != data['/Producer']:
                                val = data['/Creator']
                        else:
                            val = data['/Creator']

                    if "Application" in data:
                        val = data['Application']

                    if "Image Software" in data:
                        val = str(data['Image Software'])
                except BaseException as e:
                    # The message says "PDF" whatever the file type was.
                    self.sf.error("Failed to parse PDF, " + eventData +
                                  ": " + str(e), False)
                    return None

                if val is not None:
                    # Strip non-ASCII
                    val = ''.join([i if ord(i) < 128 else ' ' for i in val])
                    evt = SpiderFootEvent("SOFTWARE_USED", val,
                                          self.__name__, event)
                    self.notifyListeners(evt)
from docx import Document
import openxmllib

# Open the Office file through openxmllib and show the resulting object.
doc = openxmllib.openXmlDocument(path='office.docx')
print(doc)

# Load the second file with docx.Document and print the text of every cell
# of every row of every table, in document order.
document = Document('/Users/disturber/Downloads/lekinterkaps.doc')
all_cells = (
    cell
    for table in document.tables
    for row in table.rows
    for cell in row.cells
)
for cell in all_cells:
    print(cell.text)
def setUp(self):
    """Open the first sample file and keep it on ``self.doc``."""
    first_sample = ALL_IN_FILES[0]
    sample_path = os.path.join(TEST_FILES_IN, first_sample)
    self.doc = openxmllib.openXmlDocument(sample_path)
def setUp(self):
    """Open the first sample document and the first sample template."""
    document_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
    self.doc = openxmllib.openXmlDocument(path=document_path)

    template_path = os.path.join(TEST_FILES_IN, ALL_IN_TEMPLATE_FILES[0])
    self.template = openxmllib.openXmlDocument(path=template_path)
def setUp(self):
    """Open the document served at ``DOCX_URL`` and keep it on ``self.doc``."""
    # It is assumed that base_http_server.py has already been started.
    # The URL is passed by keyword so that it is not taken for a local
    # file path, which is how a bare positional argument would be read.
    self.doc = openxmllib.openXmlDocument(url=DOCX_URL)
import openxmllib

# Open the sample presentation.
doc = openxmllib.openXmlDocument(path="example.pptx")

# Core properties: first as one dump, then one "name:value" line each.
print("%s\n" % (doc.coreProperties))
for name in doc.coreProperties:
    core_value = doc.coreProperties[name]
    print("%s:%s" % (name, core_value))

# Extended properties, one "name:value" line each.
for name in doc.extendedProperties:
    extended_value = doc.extendedProperties[name]
    print("%s:%s" % (name, extended_value))

# Finally the indexable text, properties included.
indexable = doc.indexableText(include_properties=True)
print("%s\n" % (indexable))
def setUp(self):
    """Open the second sample file and keep it on ``self.doc``."""
    second_sample = ALL_IN_FILES[1]
    self.doc = openxmllib.openXmlDocument(
        path=os.path.join(TEST_FILES_IN, second_sample))
def handleEvent(self, event):
    """Fetch the file an event points at and publish its metadata.

    ``event.data`` is treated as a URL and handled once only (tracked in
    ``self.results``). For every extension in ``self.opts['fileexts']``
    whose ``.ext`` appears in the URL, the file is downloaded and parsed:
    PDF through ``metapdf``, pptx/docx/xlsx through ``openxmllib``,
    jpg/jpeg/tiff through ``exifread``. When a parser yields a metadata
    object, a ``RAW_FILE_META_DATA`` event is sent, followed by a
    ``SOFTWARE_USED`` event if a producing application can be read from it.

    Parse failures are reported through ``self.sf.error`` and the loop
    carries on with the next configured extension. Always returns None.
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.sf.debug("Received event, " + eventName + ", from " + srcModuleName)

    # Never process the same URL twice.
    if eventData in self.results:
        return None
    else:
        self.results.append(eventData)

    for fileExt in self.opts['fileexts']:
        if self.checkForStop():
            return None

        # Substring match: ".ext" may appear anywhere in the URL.
        if "." + fileExt.lower() in eventData.lower():
            # Fetch the file, allow much more time given that these files are
            # typically large.
            ret = self.sf.fetchUrl(eventData, timeout=self.opts['timeout'],
                                   useragent=self.opts['_useragent'],
                                   dontMangle=True)
            if ret['content'] is None:
                self.sf.error("Unable to fetch file for meta analysis: " +
                              eventData, False)
                return None

            # Anything under 512 bytes is rejected outright.
            if len(ret['content']) < 512:
                self.sf.error("Strange content encountered, size of " +
                              str(len(ret['content'])), False)
                return None

            # meta is the printable form, data the parsed object.
            meta = None
            data = None
            # Based on the file extension, handle it
            if fileExt.lower() == "pdf":
                try:
                    raw = StringIO(ret['content'])
                    data = metapdf.MetaPdfReader().read_metadata(raw)
                    meta = str(data)
                    self.sf.debug("Obtained meta data from " + eventData)
                except BaseException as e:
                    self.sf.error("Unable to parse meta data from: " +
                                  eventData + "(" + str(e) + ")", False)

            if fileExt.lower() in ["pptx", "docx", "xlsx"]:
                try:
                    mtype = mimetypes.guess_type(eventData)[0]
                    doc = openxmllib.openXmlDocument(data=ret['content'],
                                                     mime_type=mtype)
                    self.sf.debug("Office type: " + doc.mimeType)
                    data = doc.allProperties
                    meta = str(data)
                except ValueError as e:
                    self.sf.error("Unable to parse meta data from: " +
                                  eventData + "(" + str(e) + ")", False)
                except lxml.etree.XMLSyntaxError as e:
                    self.sf.error("Unable to parse XML within: " +
                                  eventData + "(" + str(e) + ")", False)
                except BaseException as e:
                    self.sf.error("Unable to process file: " +
                                  eventData + "(" + str(e) + ")", False)

            if fileExt.lower() in ["jpg", "jpeg", "tiff"]:
                try:
                    raw = StringIO(ret['content'])
                    data = exifread.process_file(raw)
                    if data is None or len(data) == 0:
                        return None
                    meta = str(data)
                except BaseException as e:
                    self.sf.error("Unable to parse meta data from: " +
                                  eventData + "(" + str(e) + ")", False)

            # Require a real metadata object as well as its text form: a
            # parser that hands back None would otherwise be published as
            # the text "None" and then break the membership tests below.
            if meta is not None and data is not None:
                evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                      self.__name__, event)
                self.notifyListeners(evt)

                # Work out which software produced the file. Later checks
                # overwrite earlier ones, so the last matching key wins.
                val = None
                if "/Producer" in data:
                    val = data['/Producer']

                # NOTE(review): the else below is read as pairing with the
                # inner "/Producer" test (creator used when no producer
                # exists) -- confirm against the original layout.
                if "/Creator" in data:
                    if "/Producer" in data:
                        if data['/Creator'] != data['/Producer']:
                            val = data['/Creator']
                    else:
                        val = data['/Creator']

                if "Application" in data:
                    val = data['Application']

                if "Image Software" in data:
                    val = str(data['Image Software'])

                if val is not None:
                    # Strip non-ASCII
                    val = ''.join([i if ord(i) < 128 else ' ' for i in val])
                    evt = SpiderFootEvent("SOFTWARE_USED", val,
                                          self.__name__, event)
                    self.notifyListeners(evt)