def __str__(self):
    """Return the DTD ``<!ATTLIST ...>`` declaration for this attribute.

    The declaration is built from the owning element's name, the
    attribute name, an attribute type and a default declaration:

    * a single observed value seen at least ``MIN_FIXED`` times is
      declared ``#FIXED`` with that (escaped) value;
    * otherwise the type is ``ID``, an enumeration of the observed
      values, or ``NMTOKEN``/``CDATA``, followed by ``#REQUIRED`` when the
      attribute occurred as often as its element and ``#IMPLIED``
      otherwise.
    """
    strings = [self.elemType.name, self.name]
    tokentype = "NMTOKEN" if self.allNMTOKENs else "CDATA"
    numVals = len(self.values)
    # if there is only one attribute value, and at least MIN_FIXED
    # occurrences of it, treat it as FIXED
    if numVals == 1 and self._occurrences >= MIN_FIXED:
        # The builtin next() works on Python 2.6+ and 3; the iterator's
        # own .next() method only exists on Python 2.
        singleton = next(iter(self.values))
        fixedVal = escape_attrib(singleton)
        strings.extend((tokentype, "#FIXED", '"%s"' % fixedVal))
    else:
        # check if it is an ID
        # TODO: this may give the wrong answer, we should check
        # whether the value sets of two candidate-ID attributes
        # overlap, in which case they can't both be IDs.
        if (
            self.unique
            and self.allNames  # ID values must be Names
            and self._occurrences >= MIN_ID_VALUES
        ):
            strings.append("ID")
        # check if it is an enumeration
        elif (
            self.allNMTOKENs  # enumeration values must be NMTOKENs
            and self._occurrences >= MIN_ENUMERATION_INSTANCES
            and numVals <= self._occurrences / MIN_ENUMERATION_RATIO
            and numVals <= MAX_ENUMERATION_VALUES
        ):
            strings.append("(%s)" % " | ".join(self.values))
        else:
            strings.append(tokentype)
        # If the attribute is present on every instance of the element,
        # treat it as required.  This belongs to the non-FIXED branch
        # only: a declaration carries a single default declaration.
        if self._occurrences == self.elemType._occurrences:
            strings.append("#REQUIRED")
        else:
            strings.append("#IMPLIED")
    return "<!ATTLIST %s>" % " ".join(strings)
def __str__(self):
    """Return the DTD ``<!ATTLIST ...>`` declaration for this attribute.

    The pieces are, in order: the owning element's name, the attribute
    name, the attribute type and the default declaration.

    * One observed value with at least ``MIN_FIXED`` occurrences yields
      ``NMTOKEN``/``CDATA`` plus ``#FIXED "value"`` (value escaped with
      ``escape_attrib``).
    * Otherwise the type is ``ID``, an enumeration of the observed
      values, or ``NMTOKEN``/``CDATA``, and the default declaration is
      ``#REQUIRED`` if the attribute occurred as often as its element,
      else ``#IMPLIED``.
    """
    strings = [self.elemType.name, self.name]
    tokentype = 'NMTOKEN' if self.allNMTOKENs else 'CDATA'
    numVals = len(self.values)
    # if there is only one attribute value, and at least MIN_FIXED
    # occurrences of it, treat it as FIXED
    if numVals == 1 and self._occurrences >= MIN_FIXED:
        # Use the builtin next(): iterator objects have no .next()
        # method on Python 3, while next() exists on 2.6+ as well.
        singleton = next(iter(self.values))
        fixedVal = escape_attrib(singleton)
        strings.extend((tokentype, '#FIXED', '"%s"' % fixedVal))
    else:
        # check if it is an ID
        # TODO: this may give the wrong answer, we should check
        # whether the value sets of two candidate-ID attributes
        # overlap, in which case they can't both be IDs.
        if (self.unique
                and self.allNames  # ID values must be Names
                and self._occurrences >= MIN_ID_VALUES):
            strings.append('ID')
        # check if it is an enumeration
        elif (self.allNMTOKENs  # enumeration values must be NMTOKENs
                and self._occurrences >= MIN_ENUMERATION_INSTANCES
                and numVals <= self._occurrences / MIN_ENUMERATION_RATIO
                and numVals <= MAX_ENUMERATION_VALUES):
            strings.append('(%s)' % ' | '.join(self.values))
        else:
            strings.append(tokentype)
        # If the attribute is present on every instance of the element,
        # treat it as required.  Only emitted when the attribute is not
        # FIXED, because a declaration has a single default declaration.
        if self._occurrences == self.elemType._occurrences:
            strings.append('#REQUIRED')
        else:
            strings.append('#IMPLIED')
    return '<!ATTLIST %s>' % ' '.join(strings)
def __init__(self, source):
    """Scan an XML document and collect the element types it uses.

    ``source`` is passed unchanged to ``iterparse``.  Every distinct
    element name (converted to ``prefix:local`` form) gets exactly one
    ``ElementType``, appended to ``self.elements`` the first time the
    name is encountered.  While scanning, each element type is told
    about its attributes and about the groups of consecutive,
    same-named children seen under it.
    """
    self.elements = []
    types_by_name = {}   # element name -> ElementType
    open_entries = []    # one _StackEntry per currently open element
    namespaces = deque([("xml", "http://www.w3.org/XML/1998/namespace")])
    undeclared = []      # (prefix, url) pairs awaiting the next start tag
    wanted_events = ("start", "end", "start-ns", "end-ns")

    for event, elem in iterparse(source, events=wanted_events):
        if event == "start":
            # Expose the namespace declarations that preceded this start
            # tag as ordinary xmlns attributes of the element.
            for prefix, url in undeclared:
                attr = "xmlns:%s" % prefix if prefix else "xmlns"
                elem.attrib[attr] = escape_attrib(url)
            del undeclared[:]

            # Convert the tag from Clark notation to prefix:local.
            name = _clark_to_orig(elem.tag, namespaces)
            elemType = types_by_name.get(name)
            if elemType is None:
                elemType = ElementType(name)
                types_by_name[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # Record every attribute, its name converted the same way.
            for attr, value in elem.items():
                attr_name = _clark_to_orig(attr, namespaces)
                elemType.updateAttribute(attr_name, value)

            # Track nesting and the sequence of child elements.
            if open_entries:
                top = open_entries[-1]
                owner = top.elemType
                # For sequencing only consecutive runs of the same child
                # name matter: a repeat of the latest child marks that
                # group repeatable, a different name opens a new group.
                if top.latestChild == name:
                    info = owner.getChildInfo(name, top.groupIndex)
                    info.repeatable = True
                else:
                    top.latestChild = name
                    top.groupIndex += 1
                    owner.setChildInfo(name, top.groupIndex)
            open_entries.append(_StackEntry(elemType))

        elif event == "end":
            finished = open_entries.pop()
            elemType = finished.elemType
            # NOTE(review): elem.tail is the text that follows this
            # element's end tag; confirm that it is already populated at
            # the "end" event and that it should count for this element.
            texts = (elem.text, elem.tail)
            if any(t is not None and not t.isspace() for t in texts):
                elemType._hasCharacterContent = True
            # If this instance had fewer child groups than earlier
            # instances of the same element type, the groups it lacks
            # are marked optional.
            for info in elemType.iterChildInfo(finished.groupIndex + 1, None):
                info.optional = True
            # Drop the element's content now that it has been examined.
            elem.clear()

        elif event == "start-ns":
            # For this event ``elem`` is a (prefix, url) pair.
            namespaces.appendleft(elem)
            undeclared.append(elem)

        elif event == "end-ns":
            namespaces.popleft()
def __init__(self, source):
    """Scan an XML document and collect the element types it uses.

    ``source`` is passed unchanged to ``iterparse``.  Every distinct
    element name (converted to ``prefix:local`` form) gets exactly one
    ``ElementType``, appended to ``self.elements`` the first time the
    name is encountered.  For each parent/child pair a ``_ChildInfo`` is
    kept in the parent's ``_children`` list; the parent stops being
    ``_sequenced`` as soon as its children are seen in an order that
    disagrees with earlier observations.
    """
    self.elements = []
    types_by_name = {}    # element name -> ElementType
    open_entries = []     # one _StackEntry per currently open element
    known_children = {}   # (parent name, child name) -> _ChildInfo
    namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
    undeclared = []       # (prefix, url) pairs awaiting the next start tag
    wanted_events = ('start', 'end', 'start-ns', 'end-ns')

    for event, elem in iterparse(source, events=wanted_events):
        if event == 'start':
            # Expose the namespace declarations that preceded this start
            # tag as ordinary xmlns attributes of the element.
            for prefix, url in undeclared:
                attr = 'xmlns:%s' % prefix if prefix else 'xmlns'
                elem.attrib[attr] = escape_attrib(url)
            del undeclared[:]

            # Convert the tag from Clark notation to prefix:local.
            name = _clark_to_orig(elem.tag, namespaces)
            elemType = types_by_name.get(name)
            if elemType is None:
                elemType = ElementType(name)
                types_by_name[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # Record every attribute, its name converted the same way.
            for attr, value in elem.items():
                attr_name = _clark_to_orig(attr, namespaces)
                elemType.updateAttribute(attr_name, value)

            # Track nesting and the sequence of child elements.
            if open_entries:
                top = open_entries[-1]
                # For sequencing only consecutive runs of the same child
                # name matter; a different name starts a new group.
                isFirstInGroup = top.latestChild != name
                if isFirstInGroup:
                    top.latestChild = name
                    top.sequenceNumber += 1

                parent = top.elemType
                childInfo = known_children.get((parent.name, name))
                if childInfo is None:
                    # First time this child is seen under this parent.
                    # If that is not the parent's first instance, the
                    # child is created as optional.
                    childInfo = _ChildInfo(name, parent._occurrences > 1)
                    known_children[parent.name, name] = childInfo
                    parent._children.append(childInfo)
                else:
                    # Seen before: the parent is no longer sequenced when
                    # the name comes back as a new group inside the very
                    # first parent instance, or when this group's position
                    # differs from the position recorded earlier.
                    out_of_sequence = (
                        (parent._occurrences == 1 and isFirstInGroup)
                        or len(parent._children) <= top.sequenceNumber
                        or parent._children[top.sequenceNumber].name != name
                    )
                    if out_of_sequence:
                        parent._sequenced = False

                # A second consecutive child of the same name makes the
                # child repeatable.
                if not isFirstInGroup:
                    childInfo.repeatable = True
            open_entries.append(_StackEntry(elemType))

        elif event == 'end':
            finished = open_entries.pop()
            elemType = finished.elemType
            # NOTE(review): elem.tail is the text that follows this
            # element's end tag; confirm that it is already populated at
            # the 'end' event and that it should count for this element.
            texts = (elem.text, elem.tail)
            if any(t is not None and not t.isspace() for t in texts):
                elemType._hasCharacterContent = True
            # If this instance had fewer child groups than earlier
            # instances, the children it lacks are marked optional.
            if elemType._sequenced:
                for info in elemType._children[finished.sequenceNumber + 1:]:
                    info.optional = True
            # Drop the element's content now that it has been examined.
            elem.clear()

        elif event == 'start-ns':
            # For this event ``elem`` is a (prefix, url) pair.
            namespaces.appendleft(elem)
            undeclared.append(elem)

        elif event == 'end-ns':
            namespaces.popleft()
def __init__(self, source):
    """Build the list of element types found in an XML document.

    ``source`` goes straight to ``iterparse``.  One ``ElementType`` is
    created per distinct element name (in ``prefix:local`` form) and
    stored in ``self.elements`` in order of first appearance.  Children
    are tracked per parent through ``_ChildInfo`` objects held in the
    parent's ``_children`` list, and a parent loses its ``_sequenced``
    flag once its children appear in an order inconsistent with what was
    recorded before.
    """
    self.elements = []
    seen_types = {}      # element name -> ElementType
    stack = []           # one _StackEntry per currently open element
    child_infos = {}     # (parent name, child name) -> _ChildInfo
    namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
    new_namespaces = []  # (prefix, url) pairs awaiting the next start tag
    event_kinds = ('start', 'end', 'start-ns', 'end-ns')

    for event, elem in iterparse(source, events=event_kinds):
        if event == 'start-ns':
            # For this event ``elem`` is a (prefix, url) pair.
            namespaces.appendleft(elem)
            new_namespaces.append(elem)

        elif event == 'end-ns':
            namespaces.popleft()

        elif event == 'end':
            closed = stack.pop()
            elemType = closed.elemType
            # NOTE(review): elem.tail is the text that follows this
            # element's end tag; confirm that it is already populated at
            # the 'end' event and that it should count for this element.
            for txt in (elem.text, elem.tail):
                if txt is None or txt.isspace():
                    continue
                elemType._hasCharacterContent = True
                break
            # When this instance has fewer child groups than earlier
            # instances, the children it lacks are marked optional.
            if elemType._sequenced:
                missing = elemType._children[closed.sequenceNumber + 1:]
                for info in missing:
                    info.optional = True
            # Drop the element's content now that it has been examined.
            elem.clear()

        elif event == 'start':
            # Turn pending namespace declarations into xmlns attributes.
            for prefix, url in new_namespaces:
                if prefix:
                    attr = 'xmlns:%s' % prefix
                else:
                    attr = 'xmlns'
                elem.attrib[attr] = escape_attrib(url)
            del new_namespaces[:]

            # Convert the tag from Clark notation to prefix:local.
            name = _clark_to_orig(elem.tag, namespaces)
            elemType = seen_types.get(name)
            if elemType is None:
                elemType = ElementType(name)
                seen_types[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # Update the attribute declarations; attribute names are
            # converted from Clark notation as well.
            for attr, value in elem.items():
                elemType.updateAttribute(
                    _clark_to_orig(attr, namespaces), value)

            # Keep track of nesting and of the sequence of children.
            if stack:
                parentEntry = stack[-1]
                # Only consecutive groups of the same child name matter.
                isFirstInGroup = parentEntry.latestChild != name
                if isFirstInGroup:
                    parentEntry.latestChild = name
                    parentEntry.sequenceNumber += 1

                parent = parentEntry.elemType
                childInfo = child_infos.get((parent.name, name))
                if childInfo is None:
                    # First sighting of this child under this parent; it
                    # is created optional unless this is the parent's
                    # first instance.
                    childInfo = _ChildInfo(name, parent._occurrences > 1)
                    child_infos[parent.name, name] = childInfo
                    parent._children.append(childInfo)
                else:
                    # Known child: does it break the recorded sequence?
                    # Either the name returns as a new group within the
                    # first parent instance, or the group sits at a
                    # different position than in earlier instances.
                    position = parentEntry.sequenceNumber
                    breaks_sequence = (
                        (parent._occurrences == 1 and isFirstInGroup)
                        or len(parent._children) <= position
                        or parent._children[position].name != name
                    )
                    if breaks_sequence:
                        parent._sequenced = False

                # More than one consecutive child of this name: mark the
                # child as repeatable.
                if not isFirstInGroup:
                    childInfo.repeatable = True
            stack.append(_StackEntry(elemType))
def __init__(self, source):
    """Build the list of element types found in an XML document.

    ``source`` goes straight to ``iterparse``.  One ``ElementType`` is
    created per distinct element name (in ``prefix:local`` form) and
    stored in ``self.elements`` in order of first appearance.  Each
    element type is informed of its attributes and of the groups of
    consecutive, same-named children observed under it.
    """
    self.elements = []
    seen_types = {}      # element name -> ElementType
    stack = []           # one _StackEntry per currently open element
    namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
    new_namespaces = []  # (prefix, url) pairs awaiting the next start tag
    event_kinds = ('start', 'end', 'start-ns', 'end-ns')

    for event, elem in iterparse(source, events=event_kinds):
        if event == 'start-ns':
            # For this event ``elem`` is a (prefix, url) pair.
            namespaces.appendleft(elem)
            new_namespaces.append(elem)

        elif event == 'end-ns':
            namespaces.popleft()

        elif event == 'end':
            closed = stack.pop()
            elemType = closed.elemType
            # NOTE(review): elem.tail is the text that follows this
            # element's end tag; confirm that it is already populated at
            # the 'end' event and that it should count for this element.
            for txt in (elem.text, elem.tail):
                if txt is None or txt.isspace():
                    continue
                elemType._hasCharacterContent = True
                break
            # When this instance has fewer child groups than earlier
            # instances of the element type, the groups it lacks are
            # marked optional.
            first_missing = closed.groupIndex + 1
            for info in elemType.iterChildInfo(first_missing, None):
                info.optional = True
            # Drop the element's content now that it has been examined.
            elem.clear()

        elif event == 'start':
            # Turn pending namespace declarations into xmlns attributes.
            for prefix, url in new_namespaces:
                if prefix:
                    attr = 'xmlns:%s' % prefix
                else:
                    attr = 'xmlns'
                elem.attrib[attr] = escape_attrib(url)
            del new_namespaces[:]

            # Convert the tag from Clark notation to prefix:local.
            name = _clark_to_orig(elem.tag, namespaces)
            elemType = seen_types.get(name)
            if elemType is None:
                elemType = ElementType(name)
                seen_types[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # Update the attribute declarations; attribute names are
            # converted from Clark notation as well.
            for attr, value in elem.items():
                elemType.updateAttribute(
                    _clark_to_orig(attr, namespaces), value)

            # Keep track of nesting and of the sequence of children.
            if stack:
                parentEntry = stack[-1]
                parent = parentEntry.elemType
                # Only consecutive groups of the same child name matter:
                # a new name opens the next group, a repeated name marks
                # the current group as repeatable.
                if parentEntry.latestChild != name:
                    parentEntry.latestChild = name
                    parentEntry.groupIndex += 1
                    parent.setChildInfo(name, parentEntry.groupIndex)
                else:
                    group = parent.getChildInfo(name, parentEntry.groupIndex)
                    group.repeatable = True
            stack.append(_StackEntry(elemType))