def __init__(self, xml_file_name_or_src):
    """
    Read and parse xml file using modified version of standard python
    xml.etree.ElementTree, then build per-Element lookup tables.

    xml_file_name_or_src can be a file name like: "content.xml"
    OR can be xml source.  It is treated as a file name only when it
    ends with ".xml" and is shorter than 256 characters; otherwise the
    string itself is parsed as xml source.

    Lookup tables created here:
        nsOD / rev_nsOD   namespace uri -> prefix, and prefix -> uri
        qnameOD           '{uri}name' -> 'prefix:name' for tags and attributes
        parentD           child Element -> parent Element (root -> None)
        depthD            Element -> depth in xml tree (root is 0)
        original_posD     Element -> tuple of child positions, e.g. (0,3,1)
        get_elem_from_orig_posD   reverse lookup of original_posD
        short_pathD       Element -> value returned by self.get_short_path
        short_path_counterD / short_path_parent_counterD
                          Element -> occurrence counter strings
    """
    looks_like_file_name = (xml_file_name_or_src.endswith('.xml')
                            and len(xml_file_name_or_src) < 256)
    if looks_like_file_name:
        self.xml_file_name_or_src = xml_file_name_or_src
        # "with" guarantees the file is closed even if read() raises
        with io.open(xml_file_name_or_src, 'rt', encoding='utf-8') as fInp:
            xml_src = fInp.read()
    else:
        self.xml_file_name_or_src = None
        xml_src = xml_file_name_or_src

    self.xml_header = ''  # Assume no header unless found at head of file
    match = header_re.match(xml_src)
    if match:
        self.xml_header = match.group(0)  # will need \n when serialized

    # ns entries like: ('urn:oasis:names:tc:opendocument:xmlns:table:1.0', u'table')
    self.nsOD = OrderedDict()
    # rev_ns entries like: (u'table', 'urn:oasis:names:tc:opendocument:xmlns:table:1.0')
    self.rev_nsOD = OrderedDict()
    # qname entries like: ('{urn:oasis:names:tc:opendocument:xmlns:office:1.0}document-content', u'office:document-content')
    self.qnameOD = OrderedDict()

    def _register_qname(clark_name):
        # Record 'prefix:name' for a '{uri}name' string; names without
        # a namespace part are left out of qnameOD.
        parts = clark_name.split('}')
        if len(parts) == 2:
            uri = parts[0][1:]  # drop the leading '{'
            self.qnameOD[clark_name] = '%s:%s' % (self.nsOD[uri], parts[1])

    events = ("start", "end", "start-ns", "end-ns")
    context = ET.iterparse(StringIO(xml_src), events=events)
    for event, elem in context:
        if event == "start":
            _register_qname(elem.tag)
            for qname in elem.attrib:
                _register_qname(qname)
        if event == "start-ns":
            # for "start-ns" events elem is a (prefix, uri) tuple
            self.nsOD[elem[1]] = elem[0]
            self.rev_nsOD[elem[0]] = elem[1]

    self.context = context
    self.root = context.root

    self.parentD = {}  # index=child Element object, value=parent Element object
    self.depthD = {}  # index=Element object, value = depth in xml tree
    self.original_posD = {}  # index=Element object, value=tuple of child position (e.g. (0,3,1))
    self.get_elem_from_orig_posD = {}  # reverse lookup of "original_posD"
    self.max_depth = 0
    self.short_pathD = {}  # index=Element, value = short name (like: "ns0:name1/ns1:xyz/ns3:abc")

    # Seed every table with the root before walking the tree
    self.parentD[self.root] = None
    self.depthD[self.root] = 0
    self.short_pathD[self.root] = self.qnameOD[self.root.tag]  # no calc req'd... just = qname
    self.original_posD[self.root] = (0,)  # tuple of position

    temp_short_path_counterD = {}  # just used here to help count occurrences of short path
    self.short_path_counterD = {}  # index=Element, value=short path counter value
    self.short_path_parent_counterD = {}  # index=Element, value=parent's short path counter value
    self.short_path_counterD[self.root] = 1  # 1st (and only) occurrence
    self.short_path_parent_counterD[self.root] = 1  # 1st (and only) occurrence

    # iter() visits a parent before its children, so the parent's entries
    # always exist by the time its children are processed.
    for parent in self.root.iter():
        try:
            # Iterate the Element directly: Element.getchildren() was
            # removed in Python 3.9, and the resulting AttributeError used
            # to be swallowed below, leaving these tables empty.
            for ichild, child in enumerate(parent):
                self.parentD[child] = parent
                self.depthD[child] = self.depthD[parent] + 1
                self.max_depth = max(self.max_depth, self.depthD[child])
                self.original_posD[child] = self.original_posD[parent] + (ichild,)

                # called only after the child's parent/depth/position are recorded
                short_path = self.get_short_path(child)
                self.short_pathD[child] = short_path

                counter_key = (parent, short_path)
                temp_short_path_counterD[counter_key] = temp_short_path_counterD.get(counter_key, 0) + 1
                self.short_path_counterD[child] = '%s,%s' % (
                    self.short_path_counterD[parent],
                    temp_short_path_counterD[counter_key])
                self.short_path_parent_counterD[child] = '%s' % self.short_path_counterD[parent]
        except Exception:
            # Best effort: report and continue with the next element, but
            # let KeyboardInterrupt / SystemExit propagate.
            print('NOTICE: No children for:', parent)

    # get elem from original_posD
    for key, item in self.original_posD.items():
        self.get_elem_from_orig_posD[item] = key
def __init__(self, xml_file_name_or_src):
    """
    Read and parse xml file using modified version of standard python
    xml.etree.ElementTree, then build per-Element lookup tables.

    xml_file_name_or_src can be a file name like: "content.xml"
    OR can be xml source.  It is treated as a file name only when it
    ends with ".xml" and is shorter than 256 characters; otherwise the
    string itself is parsed as xml source.

    Lookup tables created here:
        nsOD / rev_nsOD   namespace uri -> prefix, and prefix -> uri
        qnameOD           '{uri}name' -> 'prefix:name' for tags and attributes
        parentD           child Element -> parent Element (root -> None)
        depthD            Element -> depth in xml tree (root is 0)
        original_posD     Element -> tuple of child positions, e.g. (0,3,1)
        get_elem_from_orig_posD   reverse lookup of original_posD
        short_pathD       Element -> value returned by self.get_short_path
        short_path_counterD / short_path_parent_counterD
                          Element -> occurrence counter strings
    """
    looks_like_file_name = (xml_file_name_or_src.endswith('.xml')
                            and len(xml_file_name_or_src) < 256)
    if looks_like_file_name:
        self.xml_file_name_or_src = xml_file_name_or_src
        # "with" guarantees the file is closed even if read() raises
        with io.open(xml_file_name_or_src, 'rt', encoding='utf-8') as fInp:
            xml_src = fInp.read()
    else:
        self.xml_file_name_or_src = None
        xml_src = xml_file_name_or_src

    self.xml_header = ''  # Assume no header unless found at head of file
    match = header_re.match(xml_src)
    if match:
        self.xml_header = match.group(0)  # will need \n when serialized

    # ns entries like: ('urn:oasis:names:tc:opendocument:xmlns:table:1.0', u'table')
    self.nsOD = OrderedDict()
    # rev_ns entries like: (u'table', 'urn:oasis:names:tc:opendocument:xmlns:table:1.0')
    self.rev_nsOD = OrderedDict()
    # qname entries like: ('{urn:oasis:names:tc:opendocument:xmlns:office:1.0}document-content', u'office:document-content')
    self.qnameOD = OrderedDict()

    def _register_qname(clark_name):
        # Record 'prefix:name' for a '{uri}name' string; names without
        # a namespace part are left out of qnameOD.
        parts = clark_name.split('}')
        if len(parts) == 2:
            uri = parts[0][1:]  # drop the leading '{'
            self.qnameOD[clark_name] = '%s:%s' % (self.nsOD[uri], parts[1])

    events = ("start", "end", "start-ns", "end-ns")
    context = ET.iterparse(StringIO(xml_src), events=events)
    for event, elem in context:
        if event == "start":
            _register_qname(elem.tag)
            for qname in elem.attrib:
                _register_qname(qname)
        if event == "start-ns":
            # for "start-ns" events elem is a (prefix, uri) tuple
            self.nsOD[elem[1]] = elem[0]
            self.rev_nsOD[elem[0]] = elem[1]

    self.context = context
    self.root = context.root

    self.parentD = {}  # index=child Element object, value=parent Element object
    self.depthD = {}  # index=Element object, value = depth in xml tree
    self.original_posD = {}  # index=Element object, value=tuple of child position (e.g. (0,3,1))
    self.get_elem_from_orig_posD = {}  # reverse lookup of "original_posD"
    self.max_depth = 0
    self.short_pathD = {}  # index=Element, value = short name (like: "ns0:name1/ns1:xyz/ns3:abc")

    # Seed every table with the root before walking the tree
    self.parentD[self.root] = None
    self.depthD[self.root] = 0
    self.short_pathD[self.root] = self.qnameOD[self.root.tag]  # no calc req'd... just = qname
    self.original_posD[self.root] = (0,)  # tuple of position

    temp_short_path_counterD = {}  # just used here to help count occurrences of short path
    self.short_path_counterD = {}  # index=Element, value=short path counter value
    self.short_path_parent_counterD = {}  # index=Element, value=parent's short path counter value
    self.short_path_counterD[self.root] = 1  # 1st (and only) occurrence
    self.short_path_parent_counterD[self.root] = 1  # 1st (and only) occurrence

    # iter() visits a parent before its children, so the parent's entries
    # always exist by the time its children are processed.
    for parent in self.root.iter():
        try:
            # Iterate the Element directly: Element.getchildren() was
            # removed in Python 3.9, and the resulting AttributeError used
            # to be swallowed below, leaving these tables empty.
            for ichild, child in enumerate(parent):
                self.parentD[child] = parent
                self.depthD[child] = self.depthD[parent] + 1
                self.max_depth = max(self.max_depth, self.depthD[child])
                self.original_posD[child] = self.original_posD[parent] + (ichild,)

                # called only after the child's parent/depth/position are recorded
                short_path = self.get_short_path(child)
                self.short_pathD[child] = short_path

                counter_key = (parent, short_path)
                temp_short_path_counterD[counter_key] = temp_short_path_counterD.get(counter_key, 0) + 1
                self.short_path_counterD[child] = '%s,%s' % (
                    self.short_path_counterD[parent],
                    temp_short_path_counterD[counter_key])
                self.short_path_parent_counterD[child] = '%s' % self.short_path_counterD[parent]
        except Exception:
            # Best effort: report and continue with the next element, but
            # let KeyboardInterrupt / SystemExit propagate.
            print('NOTICE: No children for:', parent)

    # get elem from original_posD
    for key, item in self.original_posD.items():
        self.get_elem_from_orig_posD[item] = key