def check_external_relationships(self):
    """
    Check whether this file has external relationships (remote template,
    OLE object, etc).

    An ``ext_rels`` indicator is always created and appended to
    ``self.indicators``. If ``self.ftg.is_openxml()`` is false it is
    returned untouched (value 0, ``RISK.NONE``). Otherwise its value counts
    the relationships yielded by ``oleobj.find_external_relationships``; as
    soon as there is at least one, the risk becomes ``RISK.HIGH`` and the
    description lists the distinct relationship types in sorted order.

    :returns: :py:class:`Indicator`
    """
    ext_rels = Indicator(
        'ext_rels', 0, name='External Relationships', _type=int,
        risk=RISK.NONE,
        description='External relationships such as remote templates, '
                    'remote OLE objects, etc',
        hide_if_false=False)
    self.indicators.append(ext_rels)
    # this check only works for OpenXML files
    if not self.ftg.is_openxml():
        return ext_rels
    # to collect relationship types:
    rel_types = set()
    # open an XmlParser, using a BytesIO instead of filename (to work in memory)
    xmlparser = ooxml.XmlParser(self.data_bytesio)
    for rel_type, target in oleobj.find_external_relationships(xmlparser):
        log.debug('External relationship: type={} target={}'.format(
            rel_type, target))
        rel_types.add(rel_type)
        ext_rels.value += 1
    if ext_rels.value > 0:
        # A set has no defined iteration order; sort so that the same file
        # always produces the same description text.
        ext_rels.description = \
            'External relationships found: {} - use oleobj for details'.format(
                ', '.join(sorted(rel_types)))
        ext_rels.risk = RISK.HIGH
    return ext_rels
def test_iter_subfiles(self):
    """ check that iter_xml restricted to a few subfiles visits only those """
    sample = join(DATA_BASE_DIR, 'msodde', 'dde-test.xlsx')
    # names are removed from this list as soon as their root element shows up
    pending = ['xl/theme/theme1.xml', 'docProps/app.xml']
    xml_parser = ooxml.XmlParser(sample)

    for name, node, level in xml_parser.iter_xml(pending):
        if self.DO_DEBUG:
            indent = ' ' * level
            print(u'{0} {1}{2}'.format(name, indent, ooxml.debug_str(node)))

        if name not in pending:
            self.fail('should have been skipped: {0}'.format(name))

        if level == 0:
            pending.remove(name)

    self.assertEqual(pending, [],
                     'missed subfile(s) {0}'.format(pending))
def process_xlsx(filepath):
    """
    Process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm).

    XML parts are searched for ``ddeLink`` elements. All non-XML parts
    (e.g. the binary parts of a .xlsb) are given to
    ``xls_parser.parse_xlsb_part`` and searched for DDE sup-book records.
    A part that cannot be parsed is logged, including the error text, and
    skipped.

    :param filepath: passed on to :py:class:`ooxml.XmlParser`
    :returns: unicode text with one ``DDE-Link ...`` line per link found
    """
    dde_links = []
    parser = ooxml.XmlParser(filepath)
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag == 'ddelink' or tag.endswith('}ddelink'):
            # we have found a dde link. Try to get more info about it
            link_info = ['DDE-Link']
            if 'ddeService' in elem.attrib:
                link_info.append(elem.attrib['ddeService'])
            if 'ddeTopic' in elem.attrib:
                link_info.append(elem.attrib['ddeTopic'])
            dde_links.append(u' '.join(link_info))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        try:
            logging.info(
                'Parsing non-xml subfile {0} with content type {1}'.format(
                    subfile, content_type))
            for record in xls_parser.parse_xlsb_part(handle, content_type,
                                                     subfile):
                logging.debug('{0}: {1}'.format(subfile, record))
                if isinstance(record, xls_parser.XlsbBeginSupBook) and \
                        record.link_type == \
                        xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                    dde_links.append('DDE-Link ' + record.string1 + ' ' +
                                     record.string2)
        except Exception as exc:
            # Deliberately best-effort: pick a log level that reflects how
            # surprising the failure is, then carry on with the next part.
            if content_type.startswith(('application/vnd.ms-excel.',
                                        'application/vnd.ms-office.')):
                # should really be able to parse these either as xml or records
                log_func = logging.warning
            elif content_type.startswith('image/') or content_type == \
                    'application/vnd.openxmlformats-officedocument.' + \
                    'spreadsheetml.printerSettings':
                # understandable that these are not record-base
                log_func = logging.debug
            else:   # default
                log_func = logging.info
            # include the error text so the log says why parsing failed
            log_func('Failed to parse {0} of content type {1} ("{2}")'.format(
                subfile, content_type, str(exc)))
            # in any case: continue with next

    return u'\n'.join(dde_links)
def process_xlsx(filepath):
    """
    Collect DDE links from an OOXML excel file (e.g. .xlsx, .xlsb, .xlsm).

    XML parts are scanned for ``ddeLink`` elements; every other part (e.g.
    the binary parts of a .xlsb) is handed to
    ``xls_parser.parse_xlsb_part``. A part that fails to parse is logged and
    skipped.

    :param filepath: handed to :py:class:`ooxml.XmlParser`
    :returns: the links found, joined with newlines
    """
    ms_prefixes = ("application/vnd.ms-excel.", "application/vnd.ms-office.")
    printer_settings = "application/vnd.openxmlformats-officedocument." + "spreadsheetml.printerSettings"

    found = []
    xml_parser = ooxml.XmlParser(filepath)

    for _, node, _ in xml_parser.iter_xml():
        lowered = node.tag.lower()
        if not (lowered == "ddelink" or lowered.endswith("}ddelink")):
            continue
        # a dde link: keep whichever of service/topic it carries, in that order
        details = [node.attrib[key] for key in ("ddeService", "ddeTopic") if key in node.attrib]
        found.append(u" ".join(details))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in xml_parser.iter_non_xml():
        try:
            logger.info("Parsing non-xml subfile {0} with content type {1}".format(subfile, content_type))
            for record in xls_parser.parse_xlsb_part(handle, content_type, subfile):
                logger.debug("{0}: {1}".format(subfile, record))
                if not isinstance(record, xls_parser.XlsbBeginSupBook):
                    continue
                if record.link_type == xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                    found.append(record.string1 + " " + record.string2)
        except Exception as exc:
            if content_type.startswith(ms_prefixes):
                # should really be able to parse these either as xml or records
                report = logger.warning
            elif content_type.startswith("image/") or content_type == printer_settings:
                # understandable that these are not record-base
                report = logger.debug
            else:
                report = logger.info
            report('Failed to parse {0} of content type {1} ("{2}")'.format(subfile, content_type, str(exc)))
            # in any case: continue with next

    return u"\n".join(found)
def test_iter_tags(self):
    """ check that iter_xml restricted to one tag yields only that tag """
    sample = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docm')
    namespace = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    wanted = '{' + namespace + '}p'

    hits = 0
    for _, node, _ in ooxml.XmlParser(sample).iter_xml(tags=wanted):
        hits += 1
        self.assertEqual(node.tag, wanted)

        # the yielded element must still have its children attached
        child_count = 0
        for child_count, child in enumerate(node, 1):
            self.assertFalse(child.tag == '')
        self.assertTrue(
            child_count > 0,
            'no children for elem {0}'.format(ooxml.debug_str(node)))

    self.assertEqual(hits, 7)
def process_excel_xml(filepath):
    """
    Look for dde links in xml files created with excel 2003 or excel 2007+.

    Only ``cell`` elements that carry a ``formula`` attribute are examined;
    a formula matching ``XML_DDE_FORMAT`` contributes its first two regex
    groups, joined by a space, as one output line.

    TODO: did not manage to create dde-link in the 2007+-xml-format. Find out
          whether this is possible at all. If so, extend this function

    :param filepath: handed to :py:class:`ooxml.XmlParser`
    :returns: the links found, joined with newlines
    """
    found = []
    for _, node, _ in ooxml.XmlParser(filepath).iter_xml():
        lowered_tag = node.tag.lower()
        if not (lowered_tag == 'cell' or lowered_tag.endswith('}cell')):
            continue   # we are only interested in cells

        # value of the first attribute named "formula" (with or without
        # namespace prefix), or None if the cell has no such attribute
        formula = next(
            (node.get(name) for name in node.keys()
             if name.lower() == 'formula'
             or name.lower().endswith('}formula')),
            None)
        if formula is None:
            continue

        log.debug('found cell with formula {0}'.format(formula))
        hit = re.match(XML_DDE_FORMAT, formula)
        if hit:
            found.append(u' '.join(hit.groups()[:2]))

    return u'\n'.join(found)
def test_iter_all(self):
    """ check iter_xml called without arguments: all subfiles, all elements """
    # subfile name -> number of elements expected to be yielded for it
    expect_subfiles = {
        '[Content_Types].xml': 11,
        '_rels/.rels': 4,
        'word/_rels/document.xml.rels': 6,
        'word/document.xml': 102,
        'word/theme/theme1.xml': 227,
        'word/settings.xml': 40,
        'word/fontTable.xml': 25,
        'word/webSettings.xml': 3,
        'docProps/app.xml': 26,
        'docProps/core.xml': 10,
        'word/styles.xml': 441,
    }
    sample = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docx')
    xml_parser = ooxml.XmlParser(sample)

    seen = 0
    for name, _, level in xml_parser.iter_xml():
        seen += 1
        if level > 0:
            continue

        # level 0 is treated as the end of a subfile: check and reset count
        if name not in expect_subfiles:
            self.fail('Subfile {0} not expected'.format(name))
        self.assertEqual(
            seen, expect_subfiles[name],
            'wrong number of elems ({0}) yielded from {1}'.format(seen, name))
        del expect_subfiles[name]
        seen = 0

    self.assertEqual(
        len(expect_subfiles), 0,
        'Forgot to iterate through subfile(s) {0}'.format(
            expect_subfiles.keys()))
def process_xlsx(filepath, filed_filter_mode=None):
    """
    Collect DDE links from an OOXML excel file (e.g. .xlsx, .xlsb, .xlsm).

    XML parts are scanned for ``ddeLink`` elements. Non-XML parts other than
    printer settings are handed to ``xls_parser.parse_xlsb_part``; a warning
    is logged first if their content type is not one of the MS excel/office
    types. Parse errors are not caught here.

    :param filepath: handed to :py:class:`ooxml.XmlParser`
    :param filed_filter_mode: not used by this function
    :returns: unicode text with one ``DDE-Link ...`` line per link found
    """
    printer_settings = 'application/vnd.openxmlformats-officedocument.' + \
                       'spreadsheetml.printerSettings'
    expected_prefixes = ('application/vnd.ms-excel.',
                         'application/vnd.ms-office.')

    found = []
    xml_parser = ooxml.XmlParser(filepath)

    for _, node, _ in xml_parser.iter_xml():
        lowered = node.tag.lower()
        if not (lowered == 'ddelink' or lowered.endswith('}ddelink')):
            continue
        # a dde link: label it and add whichever of service/topic is present
        details = ['DDE-Link'] + [node.attrib[key]
                                  for key in ('ddeService', 'ddeTopic')
                                  if key in node.attrib]
        found.append(u' '.join(details))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in xml_parser.iter_non_xml():
        if content_type == printer_settings:
            continue   # printer settings
        if not content_type.startswith(expected_prefixes):
            logging.warning('Unexpected content type: ' + content_type)
            # try parsing anyway

        logging.info('Parsing non-xml subfile {0} with content type {1}'
                     .format(subfile, content_type))
        for record in xls_parser.parse_xlsb_part(handle, content_type,
                                                 subfile):
            logging.debug('{0}: {1}'.format(subfile, record))
            if not isinstance(record, xls_parser.XlsbBeginSupBook):
                continue
            if record.link_type == xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                found.append('DDE-Link ' + record.string1 + ' ' +
                             record.string2)

    return u'\n'.join(found)
def process_docx(filepath, field_filter_mode=None):
    """
    find dde-links (and other fields) in Word 2007+ files

    Two kinds of field are collected into one list: the instruction
    attribute of "simple field" elements, and the concatenated instruction
    text found between a field-char "begin" and its matching "end" inside
    paragraphs. The list is then filtered according to `field_filter_mode`.

    :param filepath: passed on to :py:class:`ooxml.XmlParser`
    :param field_filter_mode: FIELD_FILTER_ALL or None (return every field),
                              FIELD_FILTER_DDE (only fields matching
                              FIELD_DDE_REGEX) or FIELD_FILTER_BLACKLIST
                              (drop fields for which field_is_blacklisted
                              is true)
    :returns: the remaining fields, joined with newlines
    :raises ValueError: if field_filter_mode is none of the above
    :raises BadOOXML: if a None element is encountered (defensive, see below)
    """
    parser = ooxml.XmlParser(filepath)
    all_fields = []
    level = 0        # nesting depth of begin/end field chars seen so far
    ddetext = u''    # instruction text accumulated for the current field
    # NOTE(review): TAG_W_P and TAG_W_FLDSIMPLE are presumably tuples of
    # tag-name variants, so "+" concatenates them -- confirm at definition
    for _, subs, depth in parser.iter_xml(tags=TAG_W_P + TAG_W_FLDSIMPLE):
        if depth == 0:   # at end of subfile:
            level = 0    # reset
        if subs.tag in TAG_W_FLDSIMPLE:
            # concatenate the attribute of the field, if present:
            # (the two names in ATTR_W_INSTR are tried in order; because of
            # the "or", an empty first value also falls through to the second)
            attrib_instr = subs.attrib.get(ATTR_W_INSTR[0]) or \
                           subs.attrib.get(ATTR_W_INSTR[1])
            if attrib_instr is not None:
                all_fields.append(unquote(attrib_instr))
            continue

        # have a TAG_W_P
        for curr_elem in subs:
            # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
            elem = None
            if curr_elem.tag in TAG_W_R:
                for child in curr_elem:
                    if child.tag in TAG_W_FLDCHAR or \
                            child.tag in TAG_W_INSTRTEXT:
                        elem = child
                        break
                if elem is None:
                    continue  # no fldchar or instrtext in this w:r
            else:
                elem = curr_elem
            # defensive: curr_elem.tag was already read above, so this is
            # not expected to trigger
            if elem is None:
                raise BadOOXML(filepath, 'Got "None"-Element from iter_xml')

            # check if FLDCHARTYPE and whether "begin" or "end" tag
            attrib_type = elem.attrib.get(ATTR_W_FLDCHARTYPE[0]) or \
                          elem.attrib.get(ATTR_W_FLDCHARTYPE[1])
            if attrib_type is not None:
                if attrib_type == "begin":
                    level += 1
                if attrib_type == "end":
                    level -= 1
                    # outermost field closed: store its text and start over
                    if level == 0 or level == -1:  # edge-case; level gets -1
                        all_fields.append(ddetext)
                        ddetext = u''
                        level = 0  # reset edge-case

            # concatenate the text of the field, if present:
            if elem.tag in TAG_W_INSTRTEXT and elem.text is not None:
                # expand field code if QUOTED
                ddetext += unquote(elem.text)

    # apply field command filter
    log.debug('filtering with mode "{0}"'.format(field_filter_mode))
    if field_filter_mode in (FIELD_FILTER_ALL, None):
        clean_fields = all_fields
    elif field_filter_mode == FIELD_FILTER_DDE:
        clean_fields = [field for field in all_fields
                        if FIELD_DDE_REGEX.match(field)]
    elif field_filter_mode == FIELD_FILTER_BLACKLIST:
        # check if fields are acceptable and should not be returned
        clean_fields = [field for field in all_fields
                        if not field_is_blacklisted(field.strip())]
    else:
        raise ValueError('Unexpected field_filter_mode: "{0}"'
                         .format(field_filter_mode))

    return u'\n'.join(clean_fields)