Beispiel #1
0
    def test_frommime(self):
        """Construction with an explicit MIME type.

        The same sample document is opened from a path, from a file object
        and from its raw bytes. Each time the factory must return a
        ``WordprocessingDocument`` whose ``mimeType`` is the requested one.
        """

        mime_type = ct.CT_WORDPROC_DOCX_PUBLIC
        test_file_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])

        # From file path
        doc = openxmllib.openXmlDocument(path=test_file_path,
                                         mime_type=mime_type)
        self.assertTrue(
            isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
            "Failed to create with mime type %s" % mime_type)
        self.assertEqual(doc.mimeType, mime_type)

        # From file object; ``with`` closes the handle even if the factory
        # raises, so a failing test does not leak it.
        with open(test_file_path, 'rb') as fh:
            doc = openxmllib.openXmlDocument(file_=fh, mime_type=mime_type)
        self.assertTrue(
            isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
            "Failed to create with mime type %s" % mime_type)
        self.assertEqual(doc.mimeType, mime_type)

        # From file content
        with open(test_file_path, 'rb') as fh:
            doc = openxmllib.openXmlDocument(data=fh.read(),
                                             mime_type=mime_type)
        self.assertTrue(
            isinstance(doc, openxmllib.wordprocessing.WordprocessingDocument),
            "Failed to create with mime type %s" % mime_type)
        self.assertEqual(doc.mimeType, mime_type)
Beispiel #2
0
    def setUp(self):
        """Open every regular sample file and every cover sample file.

        The opened documents are stored on ``self.docs`` and
        ``self.coverdocs`` respectively, in the order of the file lists.
        """

        def _open_all(filenames):
            # Resolve every name against the input directory, then open them.
            paths = [os.path.join(TEST_FILES_IN, name) for name in filenames]
            return [openxmllib.openXmlDocument(path=p) for p in paths]

        self.docs = _open_all(ALL_IN_FILES)
        self.coverdocs = _open_all(ALL_IN_COVER_FILES)
    def setUp(self):
        """Load the regular and the cover sample documents for the tests.

        ``self.docs`` holds one document per entry of ``ALL_IN_FILES`` and
        ``self.coverdocs`` one per entry of ``ALL_IN_COVER_FILES``.
        """

        self.docs = []
        for name in ALL_IN_FILES:
            sample_path = os.path.join(TEST_FILES_IN, name)
            self.docs.append(openxmllib.openXmlDocument(path=sample_path))

        self.coverdocs = []
        for name in ALL_IN_COVER_FILES:
            sample_path = os.path.join(TEST_FILES_IN, name)
            self.coverdocs.append(openxmllib.openXmlDocument(path=sample_path))
    def test_frommime(self):
        """Construction with an explicit MIME type.

        The same sample document is opened from a path, from a file object
        and from its raw bytes. Each time the factory must return a
        ``WordprocessingDocument`` whose ``mimeType`` is the requested one.
        """

        mime_type = ct.CT_WORDPROC_DOCX_PUBLIC
        test_file_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
        expected_class = openxmllib.wordprocessing.WordprocessingDocument
        failure_msg = "Failed to create with mime type %s" % mime_type

        # From file path
        doc = openxmllib.openXmlDocument(path=test_file_path, mime_type=mime_type)
        self.assertTrue(isinstance(doc, expected_class), failure_msg)
        self.assertEqual(doc.mimeType, mime_type)

        # From file object. ``open`` replaces the Python 2 only ``file``
        # builtin, and ``with`` closes the handle even if the factory raises.
        with open(test_file_path, 'rb') as fh:
            doc = openxmllib.openXmlDocument(file_=fh, mime_type=mime_type)
        self.assertTrue(isinstance(doc, expected_class), failure_msg)
        self.assertEqual(doc.mimeType, mime_type)

        # From file content
        with open(test_file_path, 'rb') as fh:
            doc = openxmllib.openXmlDocument(data=fh.read(), mime_type=mime_type)
        self.assertTrue(isinstance(doc, expected_class), failure_msg)
        self.assertEqual(doc.mimeType, mime_type)
 def setUp(self):
     """Open the first sample document and the first sample template.

     They are stored on ``self.doc`` and ``self.template``.
     """
     self.doc = openxmllib.openXmlDocument(
         path=os.path.join(TEST_FILES_IN, ALL_IN_FILES[0]))
     self.template = openxmllib.openXmlDocument(
         path=os.path.join(TEST_FILES_IN, ALL_IN_TEMPLATE_FILES[0]))
Beispiel #6
0
 def get_metadata(self):
     """Return a .modules.metadata.Metadata object.

     A fresh ``Metadata`` instance is stored on ``self.metadata`` and
     receives the ``allProperties`` of the document at ``self.path``,
     added with ``"ooxml"`` as the second argument.
     """
     self.metadata = Metadata()
     ooxml_doc = openxmllib.openXmlDocument(path=self.path)
     properties = ooxml_doc.allProperties
     self.metadata.add(properties, "ooxml")
     return self.metadata
Beispiel #7
0
 def get_metadata(self):
     """Return a .modules.metadata.Metadata object.

     The object is created anew, kept on ``self.metadata``, and filled
     with the ``allProperties`` of the Open XML document at ``self.path``
     (passed to ``add`` together with the string ``"ooxml"``).
     """
     self.metadata = Metadata()
     opened = openxmllib.openXmlDocument(path=self.path)
     all_properties = opened.allProperties
     self.metadata.add(all_properties, "ooxml")
     return self.metadata
Beispiel #8
0
    def handleEvent(self, event):
        """Fetch the file named by the event and publish its meta data.

        ``event.data`` is fetched with ``sf.fetchUrl`` when it contains one
        of the extensions listed in ``self.opts['fileexts']``. PDF content
        is read with ``metapdf``; pptx/docx/xlsx content with
        ``openxmllib``. When meta data is obtained, a
        ``RAW_FILE_META_DATA`` event is sent to the listeners.

        Event data already present in ``self.results`` is ignored.
        Always returns ``None``.
        """
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        sf.debug("Received event, " + eventName + ", from " + srcModuleName)

        # Each distinct piece of event data is handled only once.
        if eventData in self.results:
            return None
        else:
            self.results.append(eventData)

        for fileExt in self.opts['fileexts']:
            if self.checkForStop():
                return None

            if "." + fileExt.lower() in eventData.lower():
                # Fetch the file, allow much more time given that these files are
                # typically large.
                ret = sf.fetchUrl(eventData,
                                  timeout=self.opts['timeout'],
                                  useragent=self.opts['_useragent'],
                                  dontMangle=True)
                if ret['content'] is None:
                    sf.error("Unable to fetch file for meta analysis: " +
                             eventData, False)
                    return None

                if len(ret['content']) < 1024:
                    # The size must come from ``ret`` (the fetch result) and
                    # be converted to text; the previous code referenced an
                    # undefined name and added an int to a str, so this
                    # branch always raised instead of logging.
                    sf.error("Strange content encountered, size of " +
                             str(len(ret['content'])), False)

                meta = None
                # Based on the file extension, handle it
                if fileExt.lower() == "pdf":
                    try:
                        data = StringIO(ret['content'])
                        meta = str(metapdf.MetaPdfReader().read_metadata(data))
                        sf.debug("Obtained meta data from " + eventData)
                    except BaseException as e:
                        sf.error("Unable to parse meta data from: " +
                                 eventData + "(" + str(e) + ")", False)
                        return None

                if fileExt.lower() in ["pptx", "docx", "xlsx"]:
                    try:
                        mtype = mimetypes.guess_type(eventData)[0]
                        doc = openxmllib.openXmlDocument(data=ret['content'],
                                                         mime_type=mtype)
                        sf.debug("Office type: " + doc.mimeType)
                        meta = str(doc.allProperties)
                    except ValueError as e:
                        sf.error("Unable to parse meta data from: " +
                                 eventData + "(" + str(e) + ")", False)

                if meta is not None:
                    evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                          self.__name__, event)
                    self.notifyListeners(evt)
Beispiel #9
0
    def handleEvent(self, event):
        """Fetch the file named by the event and publish its meta data.

        ``event.data`` is fetched with ``sf.fetchUrl`` when it contains one
        of the extensions listed in ``self.opts['fileexts']``. PDF content
        is read with ``metapdf``; pptx/docx/xlsx content with
        ``openxmllib``. When meta data is obtained, a
        ``RAW_FILE_META_DATA`` event is sent to the listeners.

        Event data already present in ``self.results`` is ignored.
        Always returns ``None``.
        """
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        sf.debug("Received event, " + eventName + ", from " + srcModuleName)

        # Each distinct piece of event data is handled only once.
        if eventData in self.results:
            return None
        else:
            self.results.append(eventData)

        for fileExt in self.opts['fileexts']:
            if self.checkForStop():
                return None

            if "." + fileExt.lower() in eventData.lower():
                # Fetch the file, allow much more time given that these files are
                # typically large.
                ret = sf.fetchUrl(eventData, timeout=self.opts['timeout'],
                                  useragent=self.opts['_useragent'],
                                  dontMangle=True)
                if ret['content'] is None:
                    sf.error("Unable to fetch file for meta analysis: " +
                             eventData, False)
                    return None

                if len(ret['content']) < 1024:
                    # The size must come from ``ret`` (the fetch result) and
                    # be converted to text; the previous code referenced an
                    # undefined name and added an int to a str, so this
                    # branch always raised instead of logging.
                    sf.error("Strange content encountered, size of " +
                             str(len(ret['content'])), False)

                meta = None
                # Based on the file extension, handle it
                if fileExt.lower() == "pdf":
                    try:
                        data = StringIO(ret['content'])
                        meta = str(metapdf.MetaPdfReader().read_metadata(data))
                        sf.debug("Obtained meta data from " + eventData)
                    except BaseException as e:
                        sf.error("Unable to parse meta data from: " +
                                 eventData + "(" + str(e) + ")", False)
                        return None

                if fileExt.lower() in ["pptx", "docx", "xlsx"]:
                    try:
                        mtype = mimetypes.guess_type(eventData)[0]
                        doc = openxmllib.openXmlDocument(data=ret['content'],
                                                         mime_type=mtype)
                        sf.debug("Office type: " + doc.mimeType)
                        meta = str(doc.allProperties)
                    except ValueError as e:
                        sf.error("Unable to parse meta data from: " +
                                 eventData + "(" + str(e) + ")", False)
                    except lxml.etree.XMLSyntaxError as e:
                        sf.error("Unable to parse XML within: " +
                                 eventData + "(" + str(e) + ")", False)

                if meta is not None:
                    evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                          self.__name__, event)
                    self.notifyListeners(evt)
Beispiel #10
0
 def metadata(self):
     """Return the core and extended properties of ``self.filepath`` as one dict.

     Extended properties are applied after core properties, so they win
     when both define the same key.
     """
     doc = openxmllib.openXmlDocument(path=self.filepath)
     merged = dict(doc.coreProperties.items())
     merged.update(doc.extendedProperties.items())
     return merged
Beispiel #11
0
	def metadata(self):
		"""Return the core and extended properties of ``self.filepath`` as one dict.

		Extended properties are applied after core properties, so they win
		when both define the same key.
		"""
		doc = openxmllib.openXmlDocument(path=self.filepath)
		merged = dict(doc.coreProperties.items())
		merged.update(doc.extendedProperties.items())
		return merged
    def test_frompath(self):
        """Construction from a path.

        Every file of ``ALL_IN_FILES`` must be turned into a
        ``openxmllib.document.Document`` instance by the factory.
        """

        for test_filename in ALL_IN_FILES:
            test_filepath = os.path.join(TEST_FILES_IN, test_filename)
            doc = openxmllib.openXmlDocument(test_filepath)
            # ``assertTrue`` replaces the ``failUnless`` alias, which no
            # longer exists on Python 3.12+.
            self.assertTrue(isinstance(doc, openxmllib.document.Document),
                            "%s should be processed" % test_filepath)
Beispiel #13
0
    def test_frompath(self):
        """Every sample file can be opened from its filesystem path."""

        for name in ALL_IN_FILES:
            sample_path = os.path.join(TEST_FILES_IN, name)
            opened = openxmllib.openXmlDocument(sample_path)
            is_document = isinstance(opened, openxmllib.document.Document)
            self.assertTrue(is_document,
                            "%s should be processed" % sample_path)
    def getIndexableValue(self, field, instance):
        """
        getIndexableValue(self, field, instance) => (possibly big) string
        Return the ZCatalog-indexable string for that type.

        The field's stored ``data`` is opened as an Open XML document using
        the first entry of ``self.content_types``; the document's indexable
        text is returned encoded in the instance charset, with unencodable
        characters replaced.
        """

        stored = field.get(instance)
        # NOTE(review): the field's own content type is looked up but never
        # used; the document is opened with self.content_types[0] instead.
        # Confirm whether that is intended.
        field.getContentType(instance)
        document = openxmllib.openXmlDocument(stored.data, self.content_types[0])
        text = document.indexableText()
        return text.encode(instance.getCharset(), 'replace')
Beispiel #15
0
    def test_xmlfile(self):
        """Working around absence of BOM support in lxml.

        The ``[Content_Types].xml`` file in the document's cache directory
        is opened through ``openxmllib.utils.xmlFile`` and must parse into
        an lxml element.
        """

        from lxml import etree

        sample_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
        # Keep the document bound to a local for the whole test.
        doc = openxmllib.openXmlDocument(sample_path)
        content_types_path = os.path.join(doc._cache_dir, '[Content_Types].xml')
        stream = openxmllib.utils.xmlFile(content_types_path, 'rb')
        root = etree.parse(stream).getroot()
        self.assertTrue(isinstance(root, etree._Element), "Expected an XML element")
    def test_fromfile(self):
        """Construction from a file object.

        Every file of ``ALL_IN_FILES`` is opened in binary mode and handed
        to the factory through its ``file_`` parameter; the result must be
        a ``openxmllib.document.Document`` instance.
        """

        for test_filename in ALL_IN_FILES:
            test_filepath = os.path.join(TEST_FILES_IN, test_filename)
            # ``open`` replaces the Python 2 only ``file`` builtin, and
            # ``with`` closes the handle even if the factory raises.
            with open(test_filepath, 'rb') as fh:
                # The handle itself is passed: the previous code opened it
                # but built the document from the path, so the file-object
                # route was never exercised.
                doc = openxmllib.openXmlDocument(file_=fh)
            self.assertTrue(isinstance(doc, openxmllib.document.Document),
                            "%s should be processed" % test_filepath)
Beispiel #17
0
 def showWords(self, filename):
     """Print the indexable text of ``filename``, without its properties.

     Nothing is done when ``self.checkfile(filename)`` is false. The time
     spent opening the document and extracting the text is logged.
     """
     if not self.checkfile(filename):
         return
     self.log(1, "Processing %s...", filename)
     start_time = time.time()
     doc = openxmllib.openXmlDocument(path=filename)
     text = doc.indexableText(include_properties=False)
     duration = time.time() - start_time
     # Function-call form works on both Python 2 and Python 3; the print
     # statement is a syntax error on Python 3.
     print(self.recode(text))
     self.log(1, "Words extracted in %s second(s)", duration)
     return
Beispiel #18
0
    def test_fromfile(self):
        """Construction from a file object.

        Every file of ``ALL_IN_FILES`` is opened in binary mode and handed
        to the factory through its ``file_`` parameter; the result must be
        a ``openxmllib.document.Document`` instance.
        """

        for test_filename in ALL_IN_FILES:
            test_filepath = os.path.join(TEST_FILES_IN, test_filename)
            # ``with`` closes the handle even if the factory raises.
            with open(test_filepath, 'rb') as fh:
                # The handle itself is passed: the previous code opened it
                # but built the document from the path, so the file-object
                # route was never exercised.
                doc = openxmllib.openXmlDocument(file_=fh)
            self.assertTrue(isinstance(doc, openxmllib.document.Document),
                            "%s should be processed" % test_filepath)
    def test_xmlfile(self):
        """Working around absence of BOM support in lxml.

        The ``[Content_Types].xml`` file in the document's cache directory
        is opened through ``openxmllib.utils.xmlFile`` and must parse into
        an lxml element.
        """

        from lxml import etree
        test_file_path = os.path.join(TEST_FILES_IN, ALL_IN_FILES[0])
        doc = openxmllib.openXmlDocument(test_file_path)
        toc_path = os.path.join(doc._cache_dir, '[Content_Types].xml')
        fh = openxmllib.utils.xmlFile(toc_path, 'rb')
        xml = etree.parse(fh)
        # ``assertTrue`` replaces the ``failUnless`` alias, which no longer
        # exists on Python 3.12+.
        self.assertTrue(isinstance(xml.getroot(), etree._Element), "Expected an XML element")
    def _getOpenXmlText(self, filename, c):
        """Record the Open XML properties of ``filename`` as concepts on ``c``.

        An ``allProperties`` / ``PROPERTIES`` entry is added to ``c`` and
        receives one child per document property (value first, name
        second). Any exception is logged as an error and not re-raised.
        """
        logger.debug(u"OpenXmlText: %s" % filename)

        try:
            doc = openxmllib.openXmlDocument(path=filename)
            # Read the mapping once instead of on every loop iteration.
            properties = doc.allProperties
            logger.debug(u"%s\n" % properties)

            ap = c.addConceptKeyType(u"allProperties", u"PROPERTIES")
            for x in properties:
                logger.debug(u"cp %s:%s" % (x, properties[x]))
                ap.addConceptKeyType(properties[x], x)
        # ``as`` form is valid on Python 2.6+ and 3; the comma form is a
        # syntax error on Python 3.
        except Exception as msg:
            logger.error(u"%s" % msg)
    def convert(self, orig, data, **kwargs):
        """Store the indexable text of the Open XML content ``orig`` in ``data``.

        The MIME type comes from the ``mimetype`` keyword argument, or is
        guessed from the ``filename`` keyword argument ('unknown.xxx' when
        missing). The text is encoded with ``SITE_CHARSET``, replacing
        unencodable characters. When ``openxmllib`` raises ``ValueError``
        the error is logged and ``data`` receives an empty string.
        """

        mimetype = kwargs.get('mimetype')
        filename = kwargs.get('filename') or 'unknown.xxx'
        if mimetype is None:
            mimetype = mimetypes.guess_type(filename)[0]
        try:
            doc = openxmllib.openXmlDocument(orig, mimetype)
            data.setData(doc.indexableText().encode(SITE_CHARSET, 'replace'))
        # The comma form ("except ValueError, e") is a syntax error on
        # Python 3, and the bound exception was never used.
        except ValueError:
            # Crappy data provided to the transform.
            logger.error("Crappy file provided, returning empty text", exc_info=True)
            data.setData('')
Beispiel #22
0
    def convert(self, orig, data, **kwargs):
        """Store the indexable text of the Open XML content ``orig`` in ``data``.

        The MIME type comes from the ``mimetype`` keyword argument, or is
        guessed from the ``filename`` keyword argument ('unknown.xxx' when
        missing). The text is encoded with ``SITE_CHARSET``, replacing
        unencodable characters. When ``openxmllib`` raises ``ValueError``
        the error is logged and ``data`` receives an empty string.
        """

        mimetype = kwargs.get('mimetype')
        filename = kwargs.get('filename') or 'unknown.xxx'
        if mimetype is None:
            mimetype = mimetypes.guess_type(filename)[0]
        try:
            doc = openxmllib.openXmlDocument(orig, mimetype)
            data.setData(doc.indexableText().encode(SITE_CHARSET, 'replace'))
        # The comma form ("except ValueError, e") is a syntax error on
        # Python 3, and the bound exception was never used.
        except ValueError:
            # Crappy data provided to the transform.
            logger.error("Crappy file provided, returning empty text",
                         exc_info=True)
            data.setData('')
Beispiel #23
0
 def showMetadata(self, filename):
     """Print the core, extended and custom properties of ``filename``.

     Nothing is done when ``self.checkfile(filename)`` is false. Each
     group is announced through ``self.log`` and then printed as
     ``name: value`` lines, with both parts passed through ``self.recode``.
     """
     if not self.checkfile(filename):
         return
     self.log(1, "Processing %s...", filename)
     doc = openxmllib.openXmlDocument(path=filename)
     sections = (
         ("Core properties:", "coreProperties"),
         ("Extended properties:", "extendedProperties"),
         ("Custom properties:", "customProperties"),
     )
     for title, attribute in sections:
         self.log(2, title)
         # The attribute is read only after its title was logged, exactly
         # as in the three hand-written loops this replaces.
         for k, v in getattr(doc, attribute).items():
             # Function-call form works on both Python 2 and Python 3.
             print("%s: %s" % (self.recode(k), self.recode(v)))
     return
Beispiel #24
0
def getOpenXmlText(filename, ftype):
    """Build a ``Concepts`` object from an Open XML document.

    The returned object is created from ``filename`` and ``ftype``. It
    receives an ``allProperties`` / ``PROPERTIES`` entry holding one child
    per document property (value first, name second), plus a ``TEXT``
    entry holding the indexable text, properties included.
    """
    logger.info("OpenXmlText: %s" % filename)

    # The dead assignment "document = openxmldoc" was dropped: its result
    # was never read and it was the block's only reference to that name.
    doc = openxmllib.openXmlDocument(path=filename)
    c = Concepts(filename, ftype)

    logger.debug("%s\n" % (doc.allProperties))

    ap = c.addConceptKeyType("allProperties", "PROPERTIES")
    for x in doc.allProperties:
        logger.info("cp %s:%s" % (x, doc.allProperties[x]))
        ap.addConceptKeyType(doc.allProperties[x], x)

    # Extract the text once and reuse it for the log line and the concept.
    text = doc.indexableText(include_properties=True)
    logger.info("it %s\n" % (text))
    c.addConceptKeyType(text, "TEXT")

    return c
   def get_metadata(self, data, mimetype, filename):
      """Implement the utility interface.

      Opens ``data`` as an Open XML document of type ``mimetype``, keeps
      it on ``self.doc`` and returns a dict with the keys ``dc`` (title
      and subject from the core properties), ``keywords``, ``category``,
      ``image``, ``pagecount``, ``mimetype`` and ``filename``.
      """
      document = openXmlDocument(data=data, mime_type=mimetype)
      self.doc = document
      core = document.coreProperties

      dc_fields = dict(title=core.get("title"), subject=core.get("subject"))

      # Filled in the same order as the values were originally evaluated.
      result = {"dc": dc_fields}
      result["keywords"] = self.keywords
      result["category"] = core.get("category")
      result["image"] = self.image
      result["pagecount"] = self.pagecount
      result["mimetype"] = mimetype
      result["filename"] = filename
      return result
Beispiel #26
0
    def get_slides(self, file_path):
        """Map each slide's creation id to its position and parsed tree.

        Example: '/some/path' -> {
            '1826322': (0, tree_of_first_slide),
            '9126312': (1, tree_of_second_slide)
        }

        The creation id is the first ``p14:creationId/@val`` value found in
        the slide tree; positions follow the order in which the slide
        trees are yielded.
        """

        creation_id_query = '//p14:creationId/@val'
        p14_namespaces = {'p14': 'http://schemas.microsoft.com/office/powerpoint/2010/main'}

        doc = openxmllib.openXmlDocument(file_path)

        # Load the content types from the document's cache directory.
        content_types_path = os.path.join(doc._cache_dir, '[Content_Types].xml')
        doc.content_types = contenttypes.ContentTypes(xmlFile(content_types_path, 'rb'))

        slide_trees = doc.content_types.getTreesFor(doc, contenttypes.CT_PRESENTATION_SLIDE)
        return {
            tree.xpath(creation_id_query, namespaces=p14_namespaces)[0]: (position, tree)
            for position, tree in enumerate(slide_trees)
        }
Beispiel #27
0
def extract_openxml_metadata(doc_path) -> str:
    """Return a text summary of selected core properties of ``doc_path``.

    One line (label, value, newline) is produced for each of creator,
    title, lastModifiedBy, created and modified that is present in the
    core properties, in that order. A present but empty value is shown
    as ``NA``; an absent property produces no line.
    """
    openxml = openxmllib.openXmlDocument(path=doc_path)

    labelled_keys = (
        ("creator", AUTHOR),
        ("title", TITLE),
        ("lastModifiedBy", LAST_SAVED_BY),
        ("created", CREATE_TIME),
        ("modified", MODIFIED_TIME),
    )

    lines = []
    for key, label in labelled_keys:
        if key not in openxml.coreProperties:
            continue
        value = openxml.coreProperties.get(key)
        lines.append(label + (value if value else NA) + "\n")

    return "".join(lines)
Beispiel #28
0
 def extractCover(self, filename):
     """Return ``documentCover()`` of the Open XML document at ``filename``."""
     return openxmllib.openXmlDocument(path=filename).documentCover()
Beispiel #29
0
import openxmllib
# Open the workbook, then print its core properties, its extended
# properties and its indexable text (properties included).
# The opening quote was a typographic character, which is a syntax error.
doc = openxmllib.openXmlDocument(path='example.xlsx')
# Function-call form of print works on both Python 2 and Python 3.
print(doc.coreProperties)
print(doc.extendedProperties)
print(doc.indexableText(include_properties=True))
Beispiel #30
0
 def setUp(self):
     """Open the document served at ``DOCX_URL`` and keep it on ``self.doc``."""
     # It is assumed here that base_http_server.py has been started.
     served_document = openxmllib.openXmlDocument(url=DOCX_URL)
     self.doc = served_document
Beispiel #31
0
 def setUp(self):
     """Open ``self.test_file_path`` and keep the document on ``self.document``."""
     sample_path = self.test_file_path
     self.document = openxmllib.openXmlDocument(sample_path)
Beispiel #32
0
    def handleEvent(self, event):
        """Fetch the file named by the event and publish its meta data.

        ``event.data`` is fetched with ``self.sf.fetchUrl`` when it contains
        one of the extensions listed in ``self.opts['fileexts']``. PDF,
        pptx/docx/xlsx and jpg/jpeg/tiff content are each parsed by a
        dedicated library. When meta data is obtained a
        ``RAW_FILE_META_DATA`` event is sent to the listeners, followed by
        a ``SOFTWARE_USED`` event when a producing application is found in
        the meta data.

        Event data already present in ``self.results`` is ignored.
        Always returns ``None``.
        """
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.sf.debug("Received event, " + eventName + ", from " + srcModuleName)

        # Each distinct piece of event data is handled only once.
        if eventData in self.results:
            return None
        else:
            self.results.append(eventData)

        for fileExt in self.opts['fileexts']:
            if self.checkForStop():
                return None

            # Substring match: the extension may appear anywhere in the data.
            if "." + fileExt.lower() in eventData.lower():
                # Fetch the file, allow much more time given that these files are
                # typically large.
                ret = self.sf.fetchUrl(eventData, timeout=self.opts['timeout'],
                                       useragent=self.opts['_useragent'], dontMangle=True,
                                       sizeLimit=10000000)
                if ret['content'] is None:
                    self.sf.error("Unable to fetch file for meta analysis: " +
                                  eventData, False)
                    return None

                # Content shorter than 512 is reported and not parsed.
                if len(ret['content']) < 512:
                    self.sf.error("Strange content encountered, size of " +
                                  str(len(ret['content'])), False)
                    return None

                # meta: string form published as RAW_FILE_META_DATA.
                # data: parsed object searched below for the software name.
                meta = None
                data = None
                # Based on the file extension, handle it
                if fileExt.lower() == "pdf":
                    try:
                        raw = StringIO(ret['content'])
                        #data = metapdf.MetaPdfReader().read_metadata(raw)
                        pdf = PyPDF2.PdfFileReader(raw, strict=False)
                        data = pdf.getDocumentInfo()
                        meta = str(data)
                        self.sf.debug("Obtained meta data from " + eventData)
                    except BaseException as e:
                        self.sf.error("Unable to parse meta data from: " +
                                      eventData + "(" + str(e) + ")", False)
                        return None

                if fileExt.lower() in ["pptx", "docx", "xlsx"]:
                    try:
                        # MIME type is guessed from the event data itself.
                        mtype = mimetypes.guess_type(eventData)[0]
                        doc = openxmllib.openXmlDocument(data=ret['content'], mime_type=mtype)
                        self.sf.debug("Office type: " + doc.mimeType)
                        data = doc.allProperties
                        meta = str(data)
                    except ValueError as e:
                        self.sf.error("Unable to parse meta data from: " +
                                      eventData + "(" + str(e) + ")", False)
                        return None
                    except lxml.etree.XMLSyntaxError as e:
                        self.sf.error("Unable to parse XML within: " +
                                      eventData + "(" + str(e) + ")", False)
                        return None
                    except BaseException as e:
                        self.sf.error("Unable to process file: " +
                                      eventData + "(" + str(e) + ")", False)
                        return None

                if fileExt.lower() in ["jpg", "jpeg", "tiff"]:
                    try:
                        raw = StringIO(ret['content'])
                        data = exifread.process_file(raw)
                        # No tags found: move on to the next extension.
                        if data is None or len(data) == 0:
                            continue
                        meta = str(data)
                    except BaseException as e:
                        self.sf.error("Unable to parse meta data from: " +
                                      eventData + "(" + str(e) + ")", False)
                        return None

                if meta is not None and data is not None:
                    evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                          self.__name__, event)
                    self.notifyListeners(evt)

                    # Pick the software name. Later checks override earlier
                    # ones: /Producer, then /Creator (when it differs from
                    # /Producer or /Producer is absent), then Application,
                    # then Image Software.
                    val = None
                    try:
                        if "/Producer" in data:
                            val = data['/Producer']

                        if "/Creator" in data:
                            if "/Producer" in data:
                                if data['/Creator'] != data['/Producer']:
                                    val = data['/Creator']
                            else:
                                val = data['/Creator']

                        if "Application" in data:
                            val = data['Application']

                        if "Image Software" in data:
                            val = str(data['Image Software'])
                    except BaseException as e:
                        # NOTE(review): this message says "PDF" but the branch
                        # is reached for every supported file type.
                        self.sf.error("Failed to parse PDF, " + eventData + ": " + str(e), False)
                        return None

                    if val is not None:
                        # Strip non-ASCII
                        val = ''.join([i if ord(i) < 128 else ' ' for i in val])
                        evt = SpiderFootEvent("SOFTWARE_USED", val,
                                              self.__name__, event)
                        self.notifyListeners(evt)
0
from docx import Document
import openxmllib

# Open the Open XML document and show the resulting object.
doc = openxmllib.openXmlDocument(path='office.docx')
print(doc)

# Print the text of every cell, table by table and row by row.
document = Document('/Users/disturber/Downloads/lekinterkaps.doc')
all_cells = (
    cell
    for table in document.tables
    for row in table.rows
    for cell in row.cells
)
for cell in all_cells:
    print(cell.text)
Beispiel #34
0
 def setUp(self):
     """Open the first sample file and keep the document on ``self.doc``."""
     self.doc = openxmllib.openXmlDocument(
         os.path.join(TEST_FILES_IN, ALL_IN_FILES[0]))
 def setUp(self):
     """Open the first sample document and the first sample template.

     They are stored on ``self.doc`` and ``self.template``.
     """
     self.doc = openxmllib.openXmlDocument(
         path=os.path.join(TEST_FILES_IN, ALL_IN_FILES[0]))
     self.template = openxmllib.openXmlDocument(
         path=os.path.join(TEST_FILES_IN, ALL_IN_TEMPLATE_FILES[0]))
Beispiel #36
0
 def setUp(self):
     """Open ``self.test_file_path`` and keep the document on ``self.document``."""
     sample_path = self.test_file_path
     self.document = openxmllib.openXmlDocument(sample_path)
 def setUp(self):
     """Open the document at ``DOCX_URL`` and keep it on ``self.doc``."""
     # It is assumed here that base_http_server.py has been started.
     served_document = openxmllib.openXmlDocument(DOCX_URL)
     self.doc = served_document
Beispiel #38
0
import openxmllib

doc = openxmllib.openXmlDocument(path="example.pptx")

# The whole core-properties mapping first, then one "name:value" line per
# core property and per extended property.
print("%s\n" % (doc.coreProperties))

for name, value in doc.coreProperties.items():
    print("%s:%s" % (name, value))

for name, value in doc.extendedProperties.items():
    print("%s:%s" % (name, value))

# Finally the indexable text, properties included.
print("%s\n" % (doc.indexableText(include_properties=True)))
 def setUp(self):
     """Open the second sample file and keep the document on ``self.doc``."""
     self.doc = openxmllib.openXmlDocument(
         path=os.path.join(TEST_FILES_IN, ALL_IN_FILES[1]))
Beispiel #40
0
    def handleEvent(self, event):
        """Fetch the file named by the event and publish its meta data.

        ``event.data`` is fetched with ``self.sf.fetchUrl`` when it contains
        one of the extensions listed in ``self.opts['fileexts']``. PDF,
        pptx/docx/xlsx and jpg/jpeg/tiff content are each parsed by a
        dedicated library. When meta data is obtained a
        ``RAW_FILE_META_DATA`` event is sent to the listeners, followed by
        a ``SOFTWARE_USED`` event when a producing application is found in
        the meta data.

        Event data already present in ``self.results`` is ignored.
        Always returns ``None``.
        """
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.sf.debug("Received event, " + eventName + ", from " +
                      srcModuleName)

        # Each distinct piece of event data is handled only once.
        if eventData in self.results:
            return None
        else:
            self.results.append(eventData)

        for fileExt in self.opts['fileexts']:
            if self.checkForStop():
                return None

            if "." + fileExt.lower() in eventData.lower():
                # Fetch the file, allow much more time given that these files are
                # typically large.
                ret = self.sf.fetchUrl(eventData,
                                       timeout=self.opts['timeout'],
                                       useragent=self.opts['_useragent'],
                                       dontMangle=True)
                if ret['content'] is None:
                    self.sf.error(
                        "Unable to fetch file for meta analysis: " + eventData,
                        False)
                    return None

                if len(ret['content']) < 512:
                    self.sf.error(
                        "Strange content encountered, size of " +
                        str(len(ret['content'])), False)
                    return None

                # meta: string form published as RAW_FILE_META_DATA.
                # data: parsed object searched below for the software name.
                meta = None
                data = None
                # Based on the file extension, handle it
                if fileExt.lower() == "pdf":
                    try:
                        raw = StringIO(ret['content'])
                        data = metapdf.MetaPdfReader().read_metadata(raw)
                        meta = str(data)
                        self.sf.debug("Obtained meta data from " + eventData)
                    except BaseException as e:
                        self.sf.error(
                            "Unable to parse meta data from: " + eventData +
                            "(" + str(e) + ")", False)

                if fileExt.lower() in ["pptx", "docx", "xlsx"]:
                    try:
                        mtype = mimetypes.guess_type(eventData)[0]
                        doc = openxmllib.openXmlDocument(data=ret['content'],
                                                         mime_type=mtype)
                        self.sf.debug("Office type: " + doc.mimeType)
                        data = doc.allProperties
                        meta = str(data)
                    except ValueError as e:
                        self.sf.error(
                            "Unable to parse meta data from: " + eventData +
                            "(" + str(e) + ")", False)
                    except lxml.etree.XMLSyntaxError as e:
                        self.sf.error(
                            "Unable to parse XML within: " + eventData + "(" +
                            str(e) + ")", False)
                    except BaseException as e:
                        self.sf.error(
                            "Unable to process file: " + eventData + "(" +
                            str(e) + ")", False)

                if fileExt.lower() in ["jpg", "jpeg", "tiff"]:
                    try:
                        raw = StringIO(ret['content'])
                        data = exifread.process_file(raw)
                        if data is None or len(data) == 0:
                            return None
                        meta = str(data)
                    except BaseException as e:
                        self.sf.error(
                            "Unable to parse meta data from: " + eventData +
                            "(" + str(e) + ")", False)

                # ``meta`` is ``str(data)``, so it is the text "None" (and
                # therefore not None) when a parser hands back None. The
                # ``in`` tests below would then raise TypeError, so ``data``
                # is checked as well.
                if meta is not None and data is not None:
                    evt = SpiderFootEvent("RAW_FILE_META_DATA", meta,
                                          self.__name__, event)
                    self.notifyListeners(evt)

                    # Pick the software name. Later checks override earlier
                    # ones: /Producer, then /Creator (when it differs from
                    # /Producer or /Producer is absent), then Application,
                    # then Image Software.
                    val = None
                    if "/Producer" in data:
                        val = data['/Producer']

                    if "/Creator" in data:
                        if "/Producer" in data:
                            if data['/Creator'] != data['/Producer']:
                                val = data['/Creator']
                        else:
                            val = data['/Creator']

                    if "Application" in data:
                        val = data['Application']

                    if "Image Software" in data:
                        val = str(data['Image Software'])

                    if val is not None:
                        # Strip non-ASCII
                        val = ''.join(
                            [i if ord(i) < 128 else ' ' for i in val])
                        evt = SpiderFootEvent("SOFTWARE_USED", val,
                                              self.__name__, event)
                        self.notifyListeners(evt)
Beispiel #41
0
 def extractCover(self, filename):
     """Return ``documentCover()`` of the Open XML document at ``filename``."""
     return openxmllib.openXmlDocument(path=filename).documentCover()