def strip_namespace_inplace(etree, namespace=None, remove_from_attr=True):
    """Strip XML namespaces from element (and attribute) names, in place.

    Takes a parsed ET structure and removes either all namespaces, or one
    specific namespace (identified by its URL).  This can make node searches
    simpler in structures with unpredictable namespaces, and in content
    given to be non-mixed.

    By default this is done for node names as well as attribute names.
    (It doesn't remove the namespace definitions, but apparently ElementTree
    serialization omits any that are unused.)

    Note that for attributes that are unique only because of their
    namespace, this may cause attributes to be overwritten.  For example,
        <e p:at="bar" at="quu">
    would become
        <e at="bar">
    I don't think I've seen any XML where this matters, though.

    :param etree: an Element or ElementTree; it is modified in place.
    :param namespace: namespace URL to strip, or None to strip every namespace.
    :param remove_from_attr: also strip the namespace from attribute names.
    :return: None
    """
    # The '{url}' prefix to strip, or None meaning "any namespace".
    wanted = None if namespace is None else '{%s}' % namespace

    def _stripped(name):
        """Return name without its namespace prefix, or None if nothing to strip."""
        if wanted is None:
            if name.startswith('{'):
                return name[name.index('}', 1) + 1:]
        elif name.startswith(wanted):
            return name[len(wanted):]
        return None

    # iter() replaces getiterator(), which was removed in Python 3.9.
    for elem in etree.iter():
        # Comments and processing instructions have a non-string tag; skip them.
        if not isinstance(elem.tag, str):
            continue

        new_tag = _stripped(elem.tag)
        if new_tag is not None:
            elem.tag = new_tag

        if remove_from_attr:
            # Collect the renames first: the mapping must not change size
            # while it is being iterated.
            to_delete = []
            to_set = {}
            for attr_name in elem.attrib:
                new_name = _stripped(attr_name)
                if new_name is not None:
                    to_delete.append(attr_name)
                    to_set[new_name] = elem.attrib[attr_name]
            for key in to_delete:
                elem.attrib.pop(key)
            elem.attrib.update(to_set)
def dt_docxml_to_text(self, filename):
    """Extract the text of a .docx archive as a single string.

    Reads ``word/document.xml`` from the zip archive *filename*, parses it
    with ``ET`` and concatenates the text of every ``self.TEXT`` node found
    under each ``self.PARA`` node.  The per-paragraph strings are joined
    with a blank line between them.

    NOTE(review): ``ET`` is defined elsewhere in this file; it is assumed to
    return an object exposing ``iter(tag)`` (true for both Element and
    ElementTree) -- confirm.

    :param filename: path (or file-like object) accepted by zipfile.ZipFile.
    :return: the extracted text, paragraphs separated by two newlines.
    """
    # The context manager closes the archive even when the read fails.
    with zipfile.ZipFile(filename) as document:
        xml_content = document.read('word/document.xml')

    root = ET(xml_content)

    # iter() replaces getiterator(), which was removed in Python 3.9.
    sections = [
        ''.join(node.text for node in section.iter(self.TEXT) if node.text)
        for section in root.iter(self.PARA)
    ]
    return '\n\n'.join(sections)
def xml_parse(xml: XmlElementTree, lang: bool):
    """Pick the most confident recognised text and language from a response.

    The whole tree under *xml* is scanned.  Among ``variant`` elements the
    text of the one with the highest ``confidence`` attribute is kept; among
    ``class`` elements the last two characters of the text of the most
    confident one are kept as the language.  On equal confidence the later
    element wins.

    :param xml: root element of the response; its ``success`` attribute must
        parse to the integer 1 for the response to be considered at all.
    :param lang: when true, a response that contains only ``class``
        elements (no ``variant``) is still accepted.
    :return: tuple ``(text, language)``; either part may be an empty string.
    :raises SpeechException: if ``success`` is not 1, or nothing usable was
        found in the response.
    """
    if int(xml.attrib['success']) != 1:
        raise SpeechException('No text found.\n\nResponse:')

    no_match = -float("inf")
    max_confidence = no_match
    max_biom = no_match
    text = ''
    language = ""

    # iter() replaces getiterator(), which was removed in Python 3.9.
    for child in xml.iter():
        if child.tag == "class":
            cur = float(child.attrib['confidence'])
            if cur >= max_biom:
                # An empty element has text None; treat it as an empty string.
                language = (child.text or '')[-2:]
                max_biom = cur
        elif child.tag == "variant":
            cur = float(child.attrib['confidence'])
            if cur >= max_confidence:
                text = child.text
                max_confidence = cur

    if max_confidence != no_match or (max_biom != no_match and lang):
        return text, language

    # Raising a dedicated exception for business-logic failures is good practice.
    raise SpeechException('No text found.\n\nResponse:')
def getRecordList(self):
    """Extract the fields and values to map to the SQL statement.

    ``self.msg`` is loaded into an in-memory stream.  If ``self.metadata``
    has a ``'msgFormat'`` entry it decides the format ('xml', anything else
    is treated as 'flat'); otherwise the format is detected by trying to
    parse the message as XML.

    For XML input every element of the tree is visited; each of its child
    elements that itself has sub-elements yields one record mapping
    sub-element tag to sub-element text.  Sub-elements whose text is
    '-999' are left out of the record.

    NOTE(review): the nesting of the record-append step was reconstructed
    from flattened source (one record per child that has sub-elements) --
    confirm against the original layout.

    :return: a list of dictionaries, one per record.
    :raises Exception: if the message is in flat format (not supported).
    """
    reticLog.logInfo(self.logList, "( " + self.name + " ) getRecordList Start")
    msg = StringIO.StringIO()
    msg.write(self.msg)
    msg.seek(0)

    recordList = []
    msgFormat = ''
    xmlRoot = None

    # Look up the 'msgFormat' key itself (the lookup used to test the
    # empty-string key, so a declared format was never honoured).
    if 'msgFormat' in self.metadata:
        if self.metadata['msgFormat'] == 'xml':
            # The stream must be passed as file=; the first positional
            # parameter of ElementTree is an already-built root element.
            xmlRoot = ElementTree(file=msg)
            msgFormat = 'xml'
        else:
            msgFormat = 'flat'

    if msgFormat == '':
        # No declared format: whatever fails to parse as XML is flat.
        try:
            xmlRoot = ElementTree(file=msg)
            msgFormat = 'xml'
        except Exception:
            msgFormat = 'flat'

    reticLog.logDebug(self.logList, "Input format detected : " + msgFormat)

    if msgFormat == 'xml':
        # Prefer iter(); getiterator() was removed in Python 3.9 but is the
        # only spelling available in very old ElementTree releases.
        walk = getattr(xmlRoot, 'iter', None) or xmlRoot.getiterator
        for element in walk():
            # Iterating an element yields its direct children.
            for child in element:
                if not len(child):
                    continue
                prepRecord = {}
                for subChild in child:
                    if subChild.text == '-999':
                        continue
                    prepRecord[subChild.tag] = subChild.text
                recordList.append(prepRecord)
        reticLog.logDebug(self.logList, "All records processed.")
    elif msgFormat == 'flat':
        raise Exception('Do not support flat file at this time')

    reticLog.logInfo(self.logList, "( " + self.name + " ) getRecordList End")
    return recordList
def get_conf_attr(fileName, mode="log"):
    """Read component attributes from an XML configuration file.

    The whole tree is scanned for elements matching *mode*; only elements
    that carry at least one attribute are considered.

    * ``"log"``    -- list with the attribute dict of every ``logger`` element.
    * ``"source"`` -- attribute dict of the last ``source`` element.
    * ``"pipe"``   -- attribute dict of the last ``pipe`` element, plus
      ``'fieldNames'`` (index -> ``name`` attribute of each ``field`` child)
      and ``'fieldLength'`` (always an empty dict here).
    * ``"sink"``   -- attribute dict of the last ``sink`` element, plus
      ``'dbFactroyArg'`` holding the attribute dict of its last child.

    When no element matches, an empty list is returned.  An unknown *mode*
    prints a message and terminates the process through ``sys.exit()``.

    NOTE(review): the field index is assumed to count ``field`` children
    only; the nesting was reconstructed from flattened source -- confirm.

    :param fileName: path of the XML configuration file.
    :param mode: one of "log", "source", "pipe", "sink".
    :return: a list of dicts for "log", a dict (or ``[]``) otherwise.
    """
    root = ElementTree(file=fileName)
    # Prefer iter(); getiterator() was removed in Python 3.9 but is the only
    # spelling available in very old ElementTree releases.
    walk = getattr(root, 'iter', None) or root.getiterator
    elements = walk()

    res = []
    if mode == "log":
        for element in elements:
            if element.tag == 'logger' and element.keys():
                # Copy, so callers get a plain dict detached from the tree.
                res.append(dict(element.attrib))
    elif mode == "source":
        for element in elements:
            if element.tag == 'source' and element.keys():
                res = dict(element.attrib)
    elif mode == "pipe":
        for element in elements:
            # keys() must be called; the bare method object is always truthy.
            if element.tag == 'pipe' and element.keys():
                tempRes = dict(element.attrib)
                tempRes['fieldNames'] = {}
                tempRes['fieldLength'] = {}
                fieldIndex = 0
                # Iterating an element yields its direct children.
                for child in element:
                    if child.tag == 'field':
                        tempRes['fieldNames'][fieldIndex] = child.attrib['name']
                        fieldIndex += 1
                res = tempRes
    elif mode == 'sink':
        for element in elements:
            if element.tag == 'sink' and element.keys():
                res = dict(element.attrib)
                for child in element:
                    res['dbFactroyArg'] = dict(child.attrib)
    else:
        import sys
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print("unkown component configuration mode...")
        sys.exit()
    return res