def __str__(self):
    """Render this attribute as a DTD ``<!ATTLIST ...>`` declaration.

    The declaration is inferred from the statistics held on the instance:

    * exactly one distinct value, seen at least ``MIN_FIXED`` times, is
      declared ``#FIXED`` with that value;
    * otherwise the type is ``ID``, an enumeration of the observed
      values, or the plain token type (``NMTOKEN`` when ``allNMTOKENs``
      is set, else ``CDATA``), followed by ``#REQUIRED`` when the
      attribute occurred as often as its element and ``#IMPLIED``
      otherwise.
    """
    strings = [self.elemType.name, self.name]
    tokentype = "NMTOKEN" if self.allNMTOKENs else "CDATA"
    numVals = len(self.values)
    # if there is only one attribute value, and at least MIN_FIXED
    # occurrences of it, treat it as FIXED
    if numVals == 1 and self._occurrences >= MIN_FIXED:
        # next(iter(...)) instead of iter(...).next(): the method form
        # exists only on Python 2 iterators.
        fixedVal = escape_attrib(next(iter(self.values)))
        strings.extend((tokentype, "#FIXED", '"%s"' % fixedVal))
    else:
        # check if it is id
        # TODO: this may give the wrong answer, we should check
        # whether the value sets of two candidate-ID attributes
        # overlap, in which case they can't both be IDs !!)
        if (
            self.unique
            and self.allNames  # ID values must be Names
            and self._occurrences >= MIN_ID_VALUES
        ):
            strings.append("ID")
        # check if it is an enumeration
        elif (
            self.allNMTOKENs  # Enumeration values must be NMTOKENs
            and self._occurrences >= MIN_ENUMERATION_INSTANCES
            and numVals <= self._occurrences / MIN_ENUMERATION_RATIO
            and numVals <= MAX_ENUMERATION_VALUES
        ):
            strings.append("(%s)" % " | ".join(self.values))
        else:
            strings.append(tokentype)
        # If the attribute is present on every instance of the
        # element, treat it as required
        if self._occurrences == self.elemType._occurrences:
            strings.append("#REQUIRED")
        else:
            strings.append("#IMPLIED")
    return "<!ATTLIST %s>" % " ".join(strings)
Example no. 2
0
 def __str__(self):
     """Return the DTD ``<!ATTLIST ...>`` declaration for this attribute.

     A single distinct value seen at least ``MIN_FIXED`` times yields a
     ``#FIXED`` declaration.  Otherwise the type is ``ID``, an
     enumeration of the observed values, or the plain token type
     (``NMTOKEN`` when ``allNMTOKENs`` is set, else ``CDATA``), and the
     default is ``#REQUIRED`` when the attribute occurred as often as its
     element, ``#IMPLIED`` otherwise.
     """
     strings = [self.elemType.name, self.name]
     tokentype = 'NMTOKEN' if self.allNMTOKENs else 'CDATA'
     numVals = len(self.values)
     # if there is only one attribute value, and at least MIN_FIXED
     # occurrences of it, treat it as FIXED
     if numVals == 1 and self._occurrences >= MIN_FIXED:
         # The builtin next() replaces the Python 2-only .next() method.
         singleton = next(iter(self.values))
         fixedVal = escape_attrib(singleton)
         strings.extend((tokentype, '#FIXED', '"%s"' % fixedVal))
     else:
         # check if it is id
         # TODO: this may give the wrong answer, we should check
         # whether the value sets of two candidate-ID attributes
         # overlap, in which case they can't both be IDs !!)
         if (self.unique and self.allNames  # ID values must be Names
                 and self._occurrences >= MIN_ID_VALUES):
             strings.append('ID')
         # check if it is an enumeration
         elif (self.allNMTOKENs  # Enumeration values must be NMTOKENs
               and self._occurrences >= MIN_ENUMERATION_INSTANCES
               and numVals <= self._occurrences / MIN_ENUMERATION_RATIO
               and numVals <= MAX_ENUMERATION_VALUES):
             strings.append('(%s)' % ' | '.join(self.values))
         else:
             strings.append(tokentype)
         # If the attribute is present on every instance of the
         # element, treat it as required
         isRequired = self._occurrences == self.elemType._occurrences
         strings.append('#REQUIRED' if isRequired else '#IMPLIED')
     return '<!ATTLIST %s>' % ' '.join(strings)
    def __init__(self, source):
        """Scan the XML in ``source`` and record one ``ElementType`` per name.

        ``source`` is passed unchanged to ``iterparse``.  The document is
        consumed as a stream of start/end and namespace events, and each
        element is cleared once its end event has been handled.  The
        element types are stored in ``self.elements`` in order of first
        appearance.
        """
        self.elements = []
        # map element names to ElementTypes
        elemDict = {}
        # elements currently open
        elemStack = []
        namespaces = deque([("xml", "http://www.w3.org/XML/1998/namespace")])
        pendingNamespaces = []
        for event, elem in iterparse(source, events=("start", "end", "start-ns", "end-ns")):
            if event == "start-ns":
                # for namespace events ``elem`` is a (prefix, url) pair
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)

            elif event == "end-ns":
                namespaces.popleft()

            elif event == "start":
                # add the namespace declarations as attributes
                for prefix, url in pendingNamespaces:
                    attr = "xmlns:%s" % prefix if prefix else "xmlns"
                    elem.attrib[attr] = escape_attrib(url)
                del pendingNamespaces[:]
                # convert name from clark format to prefix:local
                name = _clark_to_orig(elem.tag, namespaces)
                elemType = elemDict.get(name)
                if elemType is None:
                    elemType = ElementType(name)
                    elemDict[name] = elemType
                    self.elements.append(elemType)
                elemType._occurrences += 1
                # update attribute declarations
                for attr, value in elem.items():
                    # convert attribute names from clark format to prefix:local
                    elemType.updateAttribute(_clark_to_orig(attr, namespaces), value)
                # keep track of the nesting and sequence of child elements
                if elemStack:
                    parentEntry = elemStack[-1]
                    parent = parentEntry.elemType
                    # for sequencing, we're interested in consecutive groups
                    # of the same child element type
                    isFirstInGroup = parentEntry.latestChild != name
                    if isFirstInGroup:
                        parentEntry.latestChild = name
                        parentEntry.groupIndex += 1
                        parent.setChildInfo(name, parentEntry.groupIndex)
                    else:
                        parent.getChildInfo(name, parentEntry.groupIndex).repeatable = True
                elemStack.append(_StackEntry(elemType))

            elif event == "end":
                entry = elemStack.pop()
                elemType = entry.elemType
                # elem.text is this element's own leading text.
                text = elem.text
                if text is not None and not text.isspace():
                    elemType._hasCharacterContent = True
                # elem.tail is the text that follows this element's end tag,
                # so it is content of the enclosing element (now on top of
                # the stack), not of this element.
                # NOTE(review): the tail may not have been parsed yet when
                # the end event is delivered - confirm against iterparse.
                tail = elem.tail
                if elemStack and tail is not None and not tail.isspace():
                    elemStack[-1].elemType._hasCharacterContent = True
                # check that all expected children are accounted for.
                # If the number of child element groups in this parent element
                # is less than the number in previous elements, then the
                # absent children are marked as optional
                for c in elemType.iterChildInfo(entry.groupIndex + 1, None):
                    c.optional = True
                elem.clear()
Example no. 4
0
    def __init__(self, source):
        """Scan the XML in ``source`` and record one ``ElementType`` per name.

        ``source`` is passed unchanged to ``iterparse``.  While streaming
        the start/end and namespace events this collects, per element
        type, its occurrences, attributes, child elements (with their
        ``optional`` / ``repeatable`` flags), whether the children keep a
        consistent sequence, and whether character content was seen.  The
        element types are stored in ``self.elements`` in order of first
        appearance.
        """
        self.elements = []
        # map element names to ElementTypes
        elemDict = {}
        # elements currently open
        elemStack = []
        # map (father-name, child-name) to _childInfo
        childInfoDict = {}
        namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
        pendingNamespaces = []
        for event, elem in iterparse(source, events=('start', 'end',
                                                     'start-ns', 'end-ns')):
            if event == 'start-ns':
                # for namespace events ``elem`` is a (prefix, url) pair
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)

            elif event == 'end-ns':
                namespaces.popleft()

            elif event == 'start':
                # add the namespace declarations as attributes
                for prefix, url in pendingNamespaces:
                    attr = 'xmlns:%s' % prefix if prefix else 'xmlns'
                    elem.attrib[attr] = escape_attrib(url)
                del pendingNamespaces[:]
                # convert name from clark format to prefix:local
                name = _clark_to_orig(elem.tag, namespaces)
                elemType = elemDict.get(name)
                if elemType is None:
                    elemType = ElementType(name)
                    elemDict[name] = elemType
                    self.elements.append(elemType)
                elemType._occurrences += 1
                # update attribute declarations
                for attr, value in elem.items():
                    # convert attribute names from clark format to prefix:local
                    elemType.updateAttribute(_clark_to_orig(attr, namespaces), value)
                # keep track of the nesting and sequence of child elements
                if elemStack:
                    parentEntry = elemStack[-1]
                    # for sequencing, we're interested in consecutive groups
                    # of the same child element type
                    isFirstInGroup = parentEntry.latestChild != name
                    if isFirstInGroup:
                        parentEntry.latestChild = name
                        parentEntry.sequenceNumber += 1
                    # check if we've seen this child of this parent before
                    parent = parentEntry.elemType
                    childInfo = childInfoDict.get((parent.name, name))
                    if childInfo is None:
                        # this is the first time we've seen this child
                        # belonging to this parent. if the child is not on
                        # the first instance of the parent, then we allow it
                        # as an optional element
                        childInfo = _ChildInfo(name, parent._occurrences > 1)
                        childInfoDict[parent.name, name] = childInfo
                        parent._children.append(childInfo)
                    elif (
                        # we've seen this child before: check if it makes
                        # parent non-consecutive
                        (parent._occurrences == 1 and isFirstInGroup)
                        # check whether the position of this group of children in
                        # this parent element is the same as its position in
                        # previous instances of the parent.
                        or len(parent._children) <= parentEntry.sequenceNumber
                        or parent._children[parentEntry.sequenceNumber].name != name
                    ):
                        parent._sequenced = False
                    # if there's more than one child element, mark it as repeatable
                    if not isFirstInGroup:
                        childInfo.repeatable = True
                elemStack.append(_StackEntry(elemType))

            elif event == 'end':
                entry = elemStack.pop()
                elemType = entry.elemType
                # elem.text is this element's own leading text.
                text = elem.text
                if text is not None and not text.isspace():
                    elemType._hasCharacterContent = True
                # elem.tail is the text that follows this element's end tag,
                # so it is content of the enclosing element (now on top of
                # the stack), not of this element.
                # NOTE(review): the tail may not have been parsed yet when
                # the end event is delivered - confirm against iterparse.
                tail = elem.tail
                if elemStack and tail is not None and not tail.isspace():
                    elemStack[-1].elemType._hasCharacterContent = True
                # check that all expected children are accounted for.
                # If the number of child element groups in this parent element
                # is less than the number in previous elements, then the
                # absent children are marked as optional
                if elemType._sequenced:
                    for c in elemType._children[entry.sequenceNumber + 1:]:
                        c.optional = True
                elem.clear()
Example no. 5
0
    def __init__(self, source):
        """Build the element-type model by streaming ``source``.

        ``source`` is passed unchanged to ``iterparse``.  For every element
        name an ``ElementType`` is created on first sight and appended to
        ``self.elements``; later events update its occurrence count,
        attributes, child list (with ``optional`` / ``repeatable`` flags),
        ``_sequenced`` flag and ``_hasCharacterContent`` flag.  Elements
        are cleared after their end event has been handled.
        """
        self.elements = []
        # map element names to ElementTypes
        elemDict = {}
        # elements currently open
        elemStack = []
        # map (father-name, child-name) to _childInfo
        childInfoDict = {}
        namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
        pendingNamespaces = []
        for event, elem in iterparse(source,
                                     events=('start', 'end', 'start-ns',
                                             'end-ns')):
            if event == 'start-ns':
                # for namespace events ``elem`` is a (prefix, url) pair
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)

            elif event == 'end-ns':
                namespaces.popleft()

            elif event == 'start':
                # add the namespace declarations as attributes
                for prefix, url in pendingNamespaces:
                    attr = 'xmlns:%s' % prefix if prefix else 'xmlns'
                    elem.attrib[attr] = escape_attrib(url)
                del pendingNamespaces[:]
                # convert name from clark format to prefix:local
                name = _clark_to_orig(elem.tag, namespaces)
                elemType = elemDict.get(name)
                if elemType is None:
                    elemType = ElementType(name)
                    elemDict[name] = elemType
                    self.elements.append(elemType)
                elemType._occurrences += 1
                # update attribute declarations
                for attr, value in elem.items():
                    # convert attribute names from clark format to prefix:local
                    elemType.updateAttribute(_clark_to_orig(attr, namespaces),
                                             value)
                # keep track of the nesting and sequence of child elements
                if elemStack:
                    parentEntry = elemStack[-1]
                    # for sequencing, we're interested in consecutive groups
                    # of the same child element type
                    isFirstInGroup = parentEntry.latestChild != name
                    if isFirstInGroup:
                        parentEntry.latestChild = name
                        parentEntry.sequenceNumber += 1
                    # check if we've seen this child of this parent before
                    parent = parentEntry.elemType
                    childInfo = childInfoDict.get((parent.name, name))
                    if childInfo is None:
                        # this is the first time we've seen this child
                        # belonging to this parent. if the child is not on
                        # the first instance of the parent, then we allow it
                        # as an optional element
                        childInfo = _ChildInfo(name, parent._occurrences > 1)
                        childInfoDict[parent.name, name] = childInfo
                        parent._children.append(childInfo)
                    elif (
                            # we've seen this child before: check if it makes
                            # parent non-consecutive
                            (parent._occurrences == 1 and isFirstInGroup)
                            # check whether the position of this group of children in
                            # this parent element is the same as its position in
                            # previous instances of the parent.
                            or
                            len(parent._children) <= parentEntry.sequenceNumber
                            or
                            parent._children[parentEntry.sequenceNumber].name
                            != name):
                        parent._sequenced = False
                    # if there's more than one child element, mark it as repeatable
                    if not isFirstInGroup:
                        childInfo.repeatable = True
                elemStack.append(_StackEntry(elemType))

            elif event == 'end':
                entry = elemStack.pop()
                elemType = entry.elemType
                # elem.text is this element's own leading text.
                ownText = elem.text
                if ownText is not None and not ownText.isspace():
                    elemType._hasCharacterContent = True
                # elem.tail is the text that follows this element's end tag,
                # so it is content of the enclosing element (now on top of
                # the stack), not of this element.
                # NOTE(review): the tail may not have been parsed yet when
                # the end event is delivered - confirm against iterparse.
                trailingText = elem.tail
                if (elemStack and trailingText is not None
                        and not trailingText.isspace()):
                    elemStack[-1].elemType._hasCharacterContent = True
                # check that all expected children are accounted for.
                # If the number of child element groups in this parent element
                # is less than the number in previous elements, then the
                # absent children are marked as optional
                if elemType._sequenced:
                    for c in elemType._children[entry.sequenceNumber + 1:]:
                        c.optional = True
                elem.clear()
    def __init__(self, source):
        """Build the element-type model by streaming ``source``.

        ``source`` is passed unchanged to ``iterparse``.  For every element
        name an ``ElementType`` is created on first sight and appended to
        ``self.elements``; later events update its occurrence count,
        attributes, per-group child info and ``_hasCharacterContent`` flag.
        Elements are cleared after their end event has been handled.
        """
        self.elements = []
        # map element names to ElementTypes
        elemDict = {}
        # elements currently open
        elemStack = []
        namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
        pendingNamespaces = []
        for event, elem in iterparse(source, events=('start', 'end',
                                                     'start-ns', 'end-ns')):
            if event == 'start-ns':
                # for namespace events ``elem`` is a (prefix, url) pair
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)

            elif event == 'end-ns':
                namespaces.popleft()

            elif event == 'start':
                # add the namespace declarations as attributes
                for prefix, url in pendingNamespaces:
                    attr = 'xmlns:%s' % prefix if prefix else 'xmlns'
                    elem.attrib[attr] = escape_attrib(url)
                del pendingNamespaces[:]
                # convert name from clark format to prefix:local
                name = _clark_to_orig(elem.tag, namespaces)
                elemType = elemDict.get(name)
                if elemType is None:
                    elemType = ElementType(name)
                    elemDict[name] = elemType
                    self.elements.append(elemType)
                elemType._occurrences += 1
                # update attribute declarations
                for attr, value in elem.items():
                    # convert attribute names from clark format to prefix:local
                    elemType.updateAttribute(_clark_to_orig(attr, namespaces), value)
                # keep track of the nesting and sequence of child elements
                if elemStack:
                    parentEntry = elemStack[-1]
                    parent = parentEntry.elemType
                    # for sequencing, we're interested in consecutive groups
                    # of the same child element type
                    isFirstInGroup = parentEntry.latestChild != name
                    if isFirstInGroup:
                        parentEntry.latestChild = name
                        parentEntry.groupIndex += 1
                        parent.setChildInfo(name, parentEntry.groupIndex)
                    else:
                        parent.getChildInfo(name, parentEntry.groupIndex).repeatable = True
                elemStack.append(_StackEntry(elemType))

            elif event == 'end':
                entry = elemStack.pop()
                elemType = entry.elemType
                # elem.text is this element's own leading text.
                ownText = elem.text
                if ownText is not None and not ownText.isspace():
                    elemType._hasCharacterContent = True
                # elem.tail is the text that follows this element's end tag,
                # so it is content of the enclosing element (now on top of
                # the stack), not of this element.
                # NOTE(review): the tail may not have been parsed yet when
                # the end event is delivered - confirm against iterparse.
                trailingText = elem.tail
                if (elemStack and trailingText is not None
                        and not trailingText.isspace()):
                    elemStack[-1].elemType._hasCharacterContent = True
                # check that all expected children are accounted for.
                # If the number of child element groups in this parent element
                # is less than the number in previous elements, then the
                # absent children are marked as optional
                for c in elemType.iterChildInfo(entry.groupIndex + 1, None):
                    c.optional = True
                elem.clear()