def handle_data(self, data):
    """Forward a chunk of character data to the content handler.

    Nothing is reported while the element stack is empty.  Otherwise the
    chunk goes to ``ignorableWhitespace`` when the HTML_DTD entry of the
    innermost open element does not list '#PCDATA' and ``strip(data)`` is
    empty; any other chunk is converted with ``to_xml_string`` using
    ``self._encoding`` and reported through ``characters``.
    """
    # NOTE(review): the original layout of this method was lost; the
    # ``else`` branch is assumed to pair with the whitespace test, so data
    # seen outside any element is dropped -- confirm.
    if not self.stack:
        return

    content_model = HTML_DTD.get(self.stack[-1], [])
    if '#PCDATA' in content_model or strip(data):
        self._cont_handler.characters(to_xml_string(data, self._encoding))
    else:
        # this is probably ignorable whitespace
        self._cont_handler.ignorableWhitespace(data)
def handle_data(self, data):
    """Report character data found inside the currently open element.

    Data is ignored when no element is open.  Blank data (as judged by
    ``strip``) inside an element whose HTML_DTD entry lacks '#PCDATA' is
    sent to ``ignorableWhitespace``; everything else is passed through
    ``to_xml_string`` with ``self._encoding`` and sent to ``characters``.
    """
    # NOTE(review): the source formatting was flattened; the ``else`` is
    # taken to belong to the inner whitespace test -- confirm.
    if not self.stack:
        return

    parent = self.stack[-1]
    pcdata_allowed = '#PCDATA' in HTML_DTD.get(parent, [])
    if not pcdata_allowed and not strip(data):
        # Text is not allowed here and the chunk is blank: most likely
        # ignorable whitespace between tags.
        self._cont_handler.ignorableWhitespace(data)
        return

    self._cont_handler.characters(to_xml_string(data, self._encoding))
def finish_starttag(self, tag, attrs):
    """Report a start tag, generating events for omitted end tags first.

    While the innermost open element is listed (upper-cased) in
    HTML_OPT_END and its HTML_DTD entry does not contain ``tag``, an end
    event is emitted for it and it is removed from the stack.

    Returns 0 when ``tag`` is listed in HTML_FORBIDDEN_END (its end event
    is emitted immediately and it is not pushed on the stack), else 1.
    """
    # Guess omitted close tags.
    while self.stack:
        innermost = self.stack[-1]
        if (upper(innermost) not in HTML_OPT_END
                or tag in HTML_DTD.get(innermost, [])):
            break
        self.unknown_endtag(innermost)
        del self.stack[-1]

    if (self.stack
            and tag not in HTML_DTD.get(self.stack[-1], [])
            and self.verbose):
        print('Warning : trying to add %s as a child of %s'
              % (tag, self.stack[-1]))

    self.unknown_starttag(tag, attrs)

    if upper(tag) in HTML_FORBIDDEN_END:
        # No end tag will ever arrive for this element: close it right away.
        self.unknown_endtag(tag)
        return 0

    self.stack.append(tag)
    return 1
def finish_starttag(self, tag, attrs):
    """Emit the start event for ``tag``, closing implied elements first.

    Open elements whose end tag is optional (HTML_OPT_END, checked on the
    upper-cased name) and whose HTML_DTD entry does not contain ``tag``
    are ended and removed from the stack before the new element starts.

    Returns 0 if ``tag`` is in HTML_FORBIDDEN_END, in which case its end
    event is emitted at once and the stack is left alone; returns 1 after
    pushing ``tag`` on the stack otherwise.
    """
    # Guess omitted close tags.
    while (self.stack
           and upper(self.stack[-1]) in HTML_OPT_END
           and tag not in HTML_DTD.get(self.stack[-1], [])):
        implicitly_closed = self.stack[-1]
        self.unknown_endtag(implicitly_closed)
        del self.stack[-1]

    if self.stack:
        would_be_parent = self.stack[-1]
        if tag not in HTML_DTD.get(would_be_parent, []) and self.verbose:
            print('Warning : trying to add %s as a child of %s'
                  % (tag, self.stack[-1]))

    self.unknown_starttag(tag, attrs)

    if upper(tag) not in HTML_FORBIDDEN_END:
        self.stack.append(tag)
        return 1

    # Close immediately tags for which we won't get an end.
    self.unknown_endtag(tag)
    return 0
def finish_starttag(self, tagname, attrs):
    """Create an element for a start tag and attach it to the tree.

    Tags whose lower-cased name is not a key of HTML_DTD are skipped.
    For a META tag whose attributes include http-equiv="content-type",
    ``self._charset`` is replaced by the 'charset' group that
    ``g_reCharset`` finds in the (lower-cased) content attribute.

    The element is appended to the nearest entry of ``self._stack``
    (the bottom entry is not considered) whose HTML_DTD entry contains
    the tag; entries above that parent are dropped from the stack.  An
    'html' element with no such parent is appended to ``self._stack[0]``
    the first time one is seen.  Always returns None.
    """
    tag_unicode = unicode(tagname, self._charset)
    tag_lower = string.lower(tag_unicode)

    if tag_lower not in HTML_DTD:
        # Skip any tags not defined in HTML 4.01
        return

    element = self._ownerDoc.createElementNS(EMPTY_NAMESPACE, tag_unicode)

    # Checked on every META tag, so a document may contain several of them.
    if tag_lower == 'meta':
        lowered = [(string.lower(name), string.lower(value))
                   for (name, value) in attrs]
        if ('http-equiv', 'content-type') in lowered:
            for (name, value) in lowered:
                if name != 'content':
                    continue
                match = g_reCharset.search(value)
                if match:
                    self._charset = match.group('charset')

    # Add any attributes to the tag
    for (name, value) in attrs:
        element.setAttributeNS(EMPTY_NAMESPACE,
                               unicode(name, self._charset),
                               unicode(value, self._charset))

    # Look for its parent, innermost candidate first.
    for depth in range(1, len(self._stack)):
        candidate = self._stack[-depth]
        if tag_lower not in HTML_DTD[string.lower(candidate.tagName)]:
            continue
        candidate.appendChild(element)
        if depth > 1:
            # Drop the entries above the parent that was found.
            self._stack = self._stack[:-depth + 1]
        # Only tags with a non-empty HTML_DTD entry are pushed; presumably
        # an empty entry marks an element that cannot hold children.
        if HTML_DTD[tag_lower]:
            self._stack.append(element)
        return

    # no parent found
    if not self._hasHtml and tag_lower == 'html':
        self._stack[0].appendChild(element)
        self._stack.append(element)
        self._hasHtml = 1
    return
def finish_starttag(self, tagname, attrs):
    """Build the DOM element for ``tagname`` and insert it under a parent.

    Returns None in every case.  Nothing is created for tags absent from
    HTML_DTD.  A META tag with http-equiv="content-type" updates
    ``self._charset`` from the 'charset' group matched by ``g_reCharset``
    in its content attribute, before the attributes are decoded.

    The parent is the topmost entry of ``self._stack`` (excluding the
    bottom one) whose HTML_DTD entry lists the tag; the stack is cut back
    to that parent.  Without such a parent, only the first 'html' element
    is kept, as a child of ``self._stack[0]``.
    """
    unicode_name = unicode(tagname, self._charset)
    lower_name = string.lower(unicode_name)

    # Skip any tags not defined in HTML 4.01
    if lower_name not in HTML_DTD:
        return

    new_element = self._ownerDoc.createElementNS(EMPTY_NAMESPACE,
                                                 unicode_name)

    # Allows for multiple META tags in a document
    if lower_name == 'meta':
        lowered = []
        for (attr_name, attr_value) in attrs:
            lowered.append((string.lower(attr_name),
                            string.lower(attr_value)))
        if ('http-equiv', 'content-type') in lowered:
            for (attr_name, attr_value) in lowered:
                if attr_name == 'content':
                    found = g_reCharset.search(attr_value)
                    if found:
                        self._charset = found.group('charset')

    # Add any attributes to the tag
    for (attr_name, attr_value) in attrs:
        decoded_name = unicode(attr_name, self._charset)
        decoded_value = unicode(attr_value, self._charset)
        new_element.setAttributeNS(EMPTY_NAMESPACE, decoded_name,
                                   decoded_value)

    # Look for its parent
    for i in range(1, len(self._stack)):
        parent = self._stack[-i]
        allowed_children = HTML_DTD[string.lower(parent.tagName)]
        if lower_name in allowed_children:
            parent.appendChild(new_element)
            if i > 1:
                # Discard the stack entries sitting above the parent.
                self._stack = self._stack[:-i + 1]
            # Tags with an empty HTML_DTD entry are not pushed (presumably
            # they cannot have children -- verify against HTML_DTD).
            if HTML_DTD[lower_name]:
                self._stack.append(new_element)
            return

    # no parent found
    if not self._hasHtml and lower_name == 'html':
        self._stack[0].appendChild(new_element)
        self._stack.append(new_element)
        self._hasHtml = 1
    return
def _4dom_createHTMLElement(self, tagName):
    """Instantiate the element class that corresponds to ``tagName``.

    Tags listed (lower-cased) in NoClassTags get a plain ``HTMLElement``.
    For any other tag the class is named 'HTML<Name>Element', where
    <Name> is the capitalized tag name, optionally remapped through
    HTMLTagMap.  It is looked up in ``self._html`` under that name, after
    importing 'xml.dom.html.<class name>' if the name is not yet there.

    Raises TypeError when the lower-cased tag name is not in HTML_DTD.
    """
    lowered = string.lower(tagName)
    if lowered not in HTML_DTD:
        raise TypeError('Unknown HTML Element: %s' % tagName)

    if lowered in NoClassTags:
        from HTMLElement import HTMLElement
        return HTMLElement(self, tagName)

    # FIXME: capitalize() is broken with unicode in Python 2.0, so the
    # capitalized form is assembled by hand instead of calling
    # string.capitalize(tagName).
    capitalized = string.upper(tagName[0]) + lowered[1:]
    class_name = 'HTML%sElement' % HTMLTagMap.get(capitalized, capitalized)

    if class_name not in self._html:
        # Try to import it (should never fail)
        __import__('xml.dom.html.%s' % class_name)

    # Class and module have the same name
    element_class = getattr(self._html[class_name], class_name)
    return element_class(self, tagName)