Example #1
0
def strip_namespace_inplace(etree, namespace=None, remove_from_attr=True):
    """ Takes a parsed ET structure and does an in-place removal of all namespaces,
        or removes a specific namespace (by its URL).

        Can make node searches simpler in structures with unpredictable namespaces
        and in content given to be non-mixed.

        By default does so for node names as well as attribute names.
        (doesn't remove the namespace definitions, but apparently
         ElementTree serialization omits any that are unused)

        Note that for attributes that are unique only because of namespace,
        this may cause attributes to be overwritten; the value of the
        formerly-namespaced attribute wins.
        For example: <e p:at="bar" at="quu">   would become: <e at="bar">

        Nodes whose tag is not a string (comments, processing instructions)
        are skipped.

        Parameters:
            etree:            an Element or ElementTree to modify in place.
            namespace:        namespace URL to strip, or None to strip every
                              namespace.
            remove_from_attr: when true, attribute names are stripped as well.

        Returns None; the structure is modified in place.
    """
    # None means "strip whatever namespace is present".
    prefix = None if namespace is None else '{%s}' % namespace

    def local_name(name):
        """Return name without its '{url}' prefix, or None if nothing is stripped."""
        if not name.startswith('{'):
            return None
        if prefix is None:
            return name[name.index('}', 1) + 1:]
        if name.startswith(prefix):
            return name[len(prefix):]
        return None

    # getiterator() was removed in Python 3.9; keep it only as a fallback for
    # very old ElementTree implementations that lack iter().
    walk = getattr(etree, 'iter', None) or etree.getiterator

    for elem in walk():
        if not isinstance(elem.tag, str):
            # Comment / ProcessingInstruction nodes carry a non-string tag.
            continue

        stripped_tag = local_name(elem.tag)
        if stripped_tag is not None:
            elem.tag = stripped_tag

        if remove_from_attr:
            renamed = {}
            # Iterate over a snapshot: the mapping is modified inside the loop.
            for attr_name in list(elem.attrib):
                stripped_attr = local_name(attr_name)
                if stripped_attr is not None:
                    renamed[stripped_attr] = elem.attrib.pop(attr_name)
            # Applied after all removals so stripped names override plain ones.
            elem.attrib.update(renamed)
Example #2
0
def dt_docxml_to_text(self, filename):
    """Extract the text of a zipped XML document as one string.

    Reads the member 'word/document.xml' from the zip archive at
    ``filename``, parses it with ``ET`` and collects the text of every
    ``self.TEXT`` element found under each ``self.PARA`` element.

    NOTE(review): ``ET`` is expected to be a callable that parses XML bytes
    into an element supporting ``iter(tag)``; ``self.PARA`` / ``self.TEXT``
    are presumably the qualified paragraph and text-run tag names - confirm
    against the enclosing class.

    Returns:
        The per-paragraph texts joined by a blank line ('\\n\\n'). A
        paragraph without text contributes an empty string.
    """
    # The context manager closes the archive even when reading fails.
    with zipfile.ZipFile(filename) as document:
        xml_content = document.read('word/document.xml')

    root = ET(xml_content)

    sections = []
    # iter() replaces getiterator(), which was removed in Python 3.9.
    for section in root.iter(self.PARA):
        sections.append(''.join(
            node.text for node in section.iter(self.TEXT) if node.text
        ))

    return '\n\n'.join(sections)
Example #3
0
def xml_parse(xml: XmlElementTree, lang: bool):
    """Pick the most confident text and language out of a response element.

    The element's ``success`` attribute must be the integer 1. Every
    descendant (and the element itself) is then inspected:

    * ``variant`` elements: the text of the one with the highest
      ``confidence`` attribute becomes the returned text.
    * ``class`` elements: the last two characters of the text of the one
      with the highest ``confidence`` attribute become the returned language.

    On equal confidence the later element wins.

    Args:
        xml: parsed response element carrying a ``success`` attribute.
        lang: when true, a result that only contains ``class`` elements is
            still accepted (the returned text is then empty).

    Returns:
        A ``(text, language)`` tuple; either part is an empty string when no
        matching element was seen.

    Raises:
        SpeechException: if ``success`` is not 1, or if no ``variant`` was
            found and either no ``class`` was found or ``lang`` is false.
    """
    if int(xml.attrib['success']) != 1:
        raise SpeechException('No text found.\n\nResponse:')

    unset = -float("inf")
    best_text_conf = unset
    best_lang_conf = unset
    text = ''
    language = ""

    # iter() replaces getiterator(), which was removed in Python 3.9.
    for child in xml.iter():
        if child.tag == "class":
            confidence = float(child.attrib['confidence'])
            if confidence >= best_lang_conf:
                language = child.text[-2:]
                best_lang_conf = confidence
        elif child.tag == "variant":
            confidence = float(child.attrib['confidence'])
            if confidence >= best_text_conf:
                text = child.text
                best_text_conf = confidence

    if best_text_conf != unset or (best_lang_conf != unset and lang):
        return text, language

    # Raising a dedicated exception type for business-logic failures is
    # good practice.
    raise SpeechException('No text found.\n\nResponse:')
Example #4
0
    def getRecordList(self):
        """Extract the fields and values to map to the SQL statement.

        The message held in ``self.msg`` is parsed and turned into a list of
        dictionaries. If ``self.metadata`` has a 'msgFormat' entry it decides
        the format ('xml', anything else is treated as 'flat'); otherwise the
        format is detected by trying to parse the message as XML.

        For XML input, every element in the tree is visited; each of its
        children that itself has children yields one record mapping the tag
        of every grandchild to its text. Grandchildren whose text is '-999'
        are left out of the record.

        Returns:
            list of dict: one dictionary per record found.

        Raises:
            Exception: if the message is in (or falls back to) flat format,
                which is not supported.
        """
        reticLog.logInfo(self.logList, "( " + self.name + " ) getRecordList Start")
        msg = StringIO.StringIO()
        msg.write(self.msg)
        msg.seek(0)
        recordList = []
        msgFormat = ''
        xmlRoot = None
        # Look up the literal key; testing the (empty) msgFormat variable
        # meant the declared format was never honoured.
        if 'msgFormat' in self.metadata:
            if self.metadata['msgFormat'] == 'xml':
                # The stream must go in as file=; the first positional
                # parameter of ElementTree is an already-built root element.
                xmlRoot = ElementTree(file=msg)
                msgFormat = 'xml'
            else:
                msgFormat = 'flat'

        if msgFormat == '':
            # No declared format: anything that parses as XML is XML.
            try:
                xmlRoot = ElementTree(file=msg)
                msgFormat = 'xml'
            except Exception:
                msgFormat = 'flat'

        reticLog.logDebug(self.logList, "Input format detected :  " + msgFormat)
        if msgFormat == 'xml':
            # iter() supersedes getiterator(); fall back for old ElementTree
            # implementations that only provide the latter.
            walk = getattr(xmlRoot, 'iter', None) or xmlRoot.getiterator
            for element in walk():
                # Iterating an element yields its direct children.
                for child in list(element):
                    subChildren = list(child)
                    if not subChildren:
                        continue
                    prepRecord = {}
                    for subChild in subChildren:
                        if subChild.text == '-999':
                            continue
                        prepRecord[subChild.tag] = subChild.text
                    recordList.append(prepRecord)
            reticLog.logDebug(self.logList, "All records processed.")
        elif msgFormat == 'flat':
            raise Exception('Do not support flat file at this time')

        reticLog.logInfo(self.logList, "( " + self.name + " ) getRecordList End")
        return recordList
Example #5
0
def get_conf_attr(fileName, mode="log"):
    """Read component attributes from an XML configuration file.

    Parameters:
        fileName: path (or file object) of the XML configuration to parse.
        mode: which component to extract:
            "log"    - list with the attribute dict of every <logger>
                       element that has attributes;
            "source" - attribute dict of the last <source> element that has
                       attributes;
            "pipe"   - attribute dict of the last <pipe> element that has
                       attributes, extended with 'fieldNames' (index -> name
                       of each <field> child, in document order) and an
                       empty 'fieldLength' dict;
            "sink"   - attribute dict of the last <sink> element that has
                       attributes, extended with 'dbFactroyArg' holding the
                       attribute dict of its last child, if it has children.

    Returns:
        The structure described above, or an empty list when no matching
        element is found.

    Any other mode prints a message and terminates via sys.exit().
    """
    res = []
    root = ElementTree(file=fileName)
    # iter() supersedes getiterator() (removed in Python 3.9); fall back for
    # old ElementTree implementations that only provide the latter.
    walk = getattr(root, 'iter', None) or root.getiterator
    elements = walk()
    if mode == "log":
        for element in elements:
            if element.keys() and (element.tag == 'logger'):
                res.append(element.attrib)
    # extract source attributes
    elif mode == "source":
        for element in elements:
            if element.keys() and (element.tag == 'source'):
                res = element.attrib
    # extract pipe attributes
    elif mode == "pipe":
        for element in elements:
            # keys() must be called: the bare method object is always truthy.
            if element.keys() and (element.tag == 'pipe'):
                tempRes = element.attrib
                tempRes['fieldNames'] = {}
                tempRes['fieldLength'] = {}
                # pipe children tags, for fields
                fieldIndex = 0
                for child in list(element):
                    if child.tag == 'field':
                        tempRes['fieldNames'][fieldIndex] = child.attrib['name']
                        fieldIndex += 1
                res = tempRes
    # extract sink attributes
    elif mode == 'sink':
        for element in elements:
            if element.keys() and (element.tag == 'sink'):
                res = element.attrib
                for child in list(element):
                    res['dbFactroyArg'] = child.attrib
    # unknown mode
    else:
        import sys
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("unkown component configuration mode...")
        sys.exit()
    return res