def characters(self, string):
    """Accumulate character data and report suspicious characters in it.

    Calls ``self.textOK()`` when ``string`` contains anything other than
    whitespace, logs one ``BadCharacters`` event per suspicious character,
    and finally appends ``string`` to ``self.value``.

    Two situations are reported:

    * a character in 0x80-0xBF that directly follows 0xC2 or 0xC3, provided
      the whole chunk, re-encoded as ISO-8859-1, is valid UTF-8 (Latin text
      that was encoded as UTF-8 twice);
    * a character in 0x80-0x9F, or U+FFFD.

    Every event is logged with ``offset=(line, column)``, both counted from
    the start of ``string``; the double-encoding report points one column
    back (but never below column 1).
    """
    if string.strip():
        self.textOK()

    line = column = 0
    previous = ' '
    # Whether the chunk re-decodes as UTF-8 does not depend on the position
    # inside it, so the probe is run at most once and only when needed.
    redecodes_as_utf8 = None
    for c in string:

        # latin characters double encoded as utf-8
        if 0x80 <= ord(c) <= 0xBF and 0xC2 <= ord(previous) <= 0xC3:
            if redecodes_as_utf8 is None:
                # Only an encoding failure means "not double encoded";
                # errors raised while importing or logging must surface.
                try:
                    string.encode('iso-8859-1').decode('utf-8')
                except UnicodeError:
                    redecodes_as_utf8 = False
                else:
                    redecodes_as_utf8 = True
            if redecodes_as_utf8:
                from validators import BadCharacters
                self.log(
                    BadCharacters({"parent": self.parent.name,
                                   "element": self.name}),
                    offset=(line, max(1, column - 1)))
        previous = c

        # win1252
        if 0x80 <= ord(c) <= 0x9F or c == u'\ufffd':
            from validators import BadCharacters
            self.log(
                BadCharacters({"parent": self.parent.name,
                               "element": self.name}),
                offset=(line, column))

        column = column + 1
        # Both LF and CR start a new line and reset the column.
        if ord(c) in (10, 13):
            column = 0
            line = line + 1

    self.value = self.value + string
def startElementNS(self, name, qname, attrs):
    """Check a child element's start tag, then hand the child to an eater.

    Logs ``NotInANamespace`` for an unqualified child when the feed type is
    ``TYPE_RSS2`` and this element's own name contains an underscore, logs
    ``MissingNamespace`` for an element or attribute name that contains a
    colon (attributes only when they have no namespace), and logs
    ``BadCharacters`` for every attribute-value character in 0x80-0x9F or
    equal to U+FFFD.  A fresh ``eater()`` is always pushed for the child.
    """
    # RSS 2.0 arbitrary restriction on extensions
    feed_kind = self.getFeedType()
    if not qname:
        if feed_kind and feed_kind == TYPE_RSS2 and '_' in self.name:
            from logging import NotInANamespace
            self.log(NotInANamespace({
                "parent": self.name,
                "element": name,
                "namespace": '""',
            }))

    # ensure element is "namespace well formed"
    if ':' in name:
        from logging import MissingNamespace
        self.log(MissingNamespace({"parent": self.name, "element": name}))

    # ensure all attribute namespaces are properly defined
    for (namespace, attr) in attrs.keys():
        if ':' in attr and not namespace:
            from logging import MissingNamespace
            self.log(MissingNamespace({"parent": self.name, "element": attr}))

        attr_value = attrs.get((namespace, attr))
        for char in attr_value:
            if c_is_suspect(char) if False else (0x80 <= ord(char) <= 0x9F or char == u'\ufffd'):
                from validators import BadCharacters
                self.log(BadCharacters({"parent": name, "element": attr}))

    # eat children
    child_handler = eater()
    self.push(child_handler, name, attrs)
def characters(self, string):
    """Screen character data, then pass it to ``validatorBase.characters``.

    Logs ``BadCharacters`` for each character in 0x80-0x9F or equal to
    U+FFFD.  When ``self.type`` is ``'xhtml'``, also logs ``MissingXhtmlDiv``
    if this chunk has non-whitespace text while ``self.value`` is still
    empty or whitespace-only.
    """
    for char in string:
        suspect = 0x80 <= ord(char) <= 0x9F or char == u'\ufffd'
        if suspect:
            from validators import BadCharacters
            self.log(BadCharacters({"parent":self.parent.name, "element":self.name}))

    if self.type == 'xhtml':
        # Text arriving before any value has been collected for this element.
        if string.strip() and not self.value.strip():
            self.log(MissingXhtmlDiv({"parent":self.parent.name, "element":self.name}))

    validatorBase.characters(self, string)
def characters(self, string):
    """Log ``BadCharacters`` for each suspicious character in ``string``.

    A character is reported when its code point lies in 0x80-0x9F or when
    it is U+FFFD.  One event is logged per offending character, naming this
    element and its parent.  Nothing is stored and nothing is returned.
    """
    for char in string:
        in_c1_range = 0x80 <= ord(char) <= 0x9F
        if not (in_c1_range or char == u'\ufffd'):
            continue
        from validators import BadCharacters
        event = BadCharacters({
            "parent": self.parent.name,
            "element": self.name
        })
        self.log(event)
def handle_charref(self, name):
    """Report a numeric character reference that denotes a bad character.

    ``name`` is the reference without its ``&#`` prefix and ``;`` suffix:
    decimal digits, or an ``x``/``X`` followed by hexadecimal digits.
    A ``BadCharacters`` event (including the original reference text under
    ``"value"``) is logged when the referenced code point lies in
    0x80-0x9F or is 0xFFFD.

    Raises ``ValueError`` if ``name`` is not a valid number in its base.
    """
    # The hex marker may be written in either case ("&#x9F;" or "&#X9F;");
    # treating "X9F" as decimal would raise ValueError.
    if name[:1] in ('x', 'X'):
        value = int(name[1:], 16)
    else:
        value = int(name)
    if 0x80 <= value <= 0x9F or value == 0xfffd:
        self.log(
            BadCharacters({
                "parent": self.element.parent.name,
                "element": self.element.name,
                "value": "&#" + name + ";"
            }))
def startElementNS(self, name, qname, attrs):
    """Validate a child element's start tag and push a handler for it.

    Steps, in order:

    1. If an ``xml:lang`` attribute is present, store it in ``self.xmlLang``
       and, when non-empty, run ``iso639_validate`` on it.
    2. For an element with no namespace in a feed whose type is known and is
       not ``TYPE_RSS2``, log ``UndeterminableVocabulary`` and continue with
       the placeholder namespace ``"null"``.
    3. Clear ``qname`` when it is one of the dispatcher's default
       namespaces; when ``near_miss(qname)`` is a key of
       ``nearly_namespaces``, clear ``qname`` and prefix ``name`` with the
       mapped prefix (switching on iTunes mode when that prefix is
       ``itunes`` and neither this element nor its parent has it set).
    4. Log ``MissingNamespace`` for un-namespaced attributes whose name
       contains a colon, ``ObsoleteNamespace`` for the
       ``http://purl.org/atom/ns#`` namespace, and ``BadCharacters`` for
       attribute-value characters in 0x80-0x9F or equal to U+FFFD.
    5. Pick the handler: ``self.unknown_starttag`` when a namespace is still
       set, otherwise the matching ``do_<name>`` method, with fallbacks that
       log and use an ``eater()`` (or a ``Questionable`` handler).
    6. Push the handler and append ``self.child`` to ``self.children``.
    """
    xml_lang_key = (u'http://www.w3.org/XML/1998/namespace', u'lang')
    # "in" replaces the deprecated has_key(); the rest of this method
    # already relies on the "in" operator.
    if xml_lang_key in attrs:
        self.xmlLang = attrs.getValue(xml_lang_key)
        if self.xmlLang:
            from validators import iso639_validate
            iso639_validate(self.log, self.xmlLang, "xml:lang", name)

    from validators import eater
    feedtype = self.getFeedType()
    if (not qname) and feedtype and (feedtype != TYPE_RSS2):
        from logging import UndeterminableVocabulary
        self.log(
            UndeterminableVocabulary({
                "parent": self.name,
                "element": name,
                "namespace": '""'
            }))
        # Truthy placeholder so the element is routed as "unknown" below.
        qname = "null"
    if qname in self.dispatcher.defaultNamespaces:
        qname = None

    nm_qname = near_miss(qname)
    if nm_qname in nearly_namespaces:
        prefix = nearly_namespaces[nm_qname]
        qname, name = None, prefix + "_" + name
        if prefix == 'itunes' and not self.itunes and not self.parent.itunes:
            if hasattr(self, 'setItunes'):
                self.setItunes(True)

    # ensure all attribute namespaces are properly defined
    for (namespace, attr) in attrs.keys():
        if ':' in attr and not namespace:
            from logging import MissingNamespace
            self.log(
                MissingNamespace({
                    "parent": self.name,
                    "element": attr
                }))

    if qname == 'http://purl.org/atom/ns#':
        from logging import ObsoleteNamespace
        self.log(ObsoleteNamespace({"element": "feed"}))

    for key, string in attrs.items():
        for c in string:
            if 0x80 <= ord(c) <= 0x9F or c == u'\ufffd':
                from validators import BadCharacters
                self.log(
                    BadCharacters({
                        "parent": name,
                        "element": key[-1]
                    }))

    if qname:
        handler = self.unknown_starttag(name, qname, attrs)
        name = "unknown_" + name
        self.child = name
    else:
        try:
            self.child = name
            if name.startswith('dc_'):
                # handle "Qualified" Dublin Core
                handler = getattr(
                    self, "do_" + name.replace("-", "_").split('.')[0])()
            else:
                handler = getattr(self, "do_" + name.replace("-", "_"))()
        except AttributeError:
            if name.find(':') != -1:
                from logging import MissingNamespace
                self.log(
                    MissingNamespace({
                        "parent": self.name,
                        "element": name
                    }))
                handler = eater()
            elif name.startswith('xhtml_'):
                from logging import MisplacedXHTMLContent
                self.log(
                    MisplacedXHTMLContent({
                        "parent": ':'.join(self.name.split("_", 1)),
                        "element": name
                    }))
                handler = eater()
            else:
                try:
                    from extension import Questionable

                    # requalify the name with the default namespace
                    qname = name
                    from logging import TYPE_APP_CATEGORIES, TYPE_APP_SERVICE
                    if self.getFeedType() in [
                            TYPE_APP_CATEGORIES, TYPE_APP_SERVICE
                    ]:
                        if qname.startswith('app_'):
                            qname = qname[4:]

                    if name.find('_') < 0 and self.name.find('_') >= 0:
                        if 'http://www.w3.org/2005/Atom' in self.dispatcher.defaultNamespaces:
                            qname = 'atom_' + qname

                    # is this element questionable?
                    handler = getattr(Questionable(),
                                      "do_" + qname.replace("-", "_"))()
                    from logging import QuestionableUsage
                    self.log(
                        QuestionableUsage({
                            "parent": ':'.join(self.name.split("_", 1)),
                            "element": qname
                        }))

                except AttributeError:
                    from logging import UndefinedElement
                    self.log(
                        UndefinedElement({
                            "parent": ':'.join(self.name.split("_", 1)),
                            "element": name
                        }))
                    handler = eater()

    self.push(handler, name, attrs)

    # MAP - always append name, even if already exists (we need this to
    # check for too many hour elements in skipHours, and it doesn't
    # hurt anything else)
    self.children.append(self.child)
def startElementNS(self, name, qname, attrs):
    """Validate a child element's start tag and push a handler for it.

    Steps, in order:

    1. If an ``xml:lang`` attribute is present, store it in ``self.xmlLang``
       and, when non-empty, run ``iso639_validate`` on it.
    2. For an element with no namespace in a feed whose type is known and is
       not ``TYPE_RSS2``, log ``UndeterminableVocabulary`` and continue with
       the placeholder namespace ``"null"``.
    3. Clear ``qname`` when it is one of ``self.defaultNamespaces``; when
       ``near_miss(qname)`` is a key of ``nearly_namespaces``, clear
       ``qname`` and prefix ``name`` with the mapped prefix (switching on
       iTunes mode when that prefix is ``itunes`` and neither this element
       nor its parent has it set).
    4. Log ``MissingNamespace`` for un-namespaced attributes whose name
       contains a colon and ``BadCharacters`` for attribute-value
       characters in 0x80-0x9F or equal to U+FFFD.
    5. Pick the handler: ``self.unknown_starttag`` when a namespace is still
       set, otherwise the matching ``do_<name>`` method; if there is none,
       log ``MissingNamespace`` or ``UndefinedElement`` and use ``eater()``.
    6. Push the handler and append the (possibly rewritten) name to
       ``self.children``.
    """
    xml_lang_key = (u'http://www.w3.org/XML/1998/namespace', u'lang')
    # "in" replaces the deprecated has_key(); the rest of this method
    # already relies on the "in" operator.
    if xml_lang_key in attrs:
        self.xmlLang = attrs.getValue(xml_lang_key)
        if self.xmlLang:
            from validators import iso639_validate
            iso639_validate(self.log, self.xmlLang, "xml:lang", name)

    from validators import eater
    feedtype = self.getFeedType()
    if (not qname) and feedtype and (feedtype != TYPE_RSS2):
        from logging import UndeterminableVocabulary
        self.log(
            UndeterminableVocabulary({
                "parent": self.name,
                "element": name,
                "namespace": '""'
            }))
        # Truthy placeholder so the element is routed as "unknown" below.
        qname = "null"
    if qname in self.defaultNamespaces:
        qname = None

    nm_qname = near_miss(qname)
    if nm_qname in nearly_namespaces:
        prefix = nearly_namespaces[nm_qname]
        qname, name = None, prefix + "_" + name
        if prefix == 'itunes' and not self.itunes and not self.parent.itunes:
            if hasattr(self, 'setItunes'):
                self.setItunes(True)

    # ensure all attribute namespaces are properly defined
    for (namespace, attr) in attrs.keys():
        if ':' in attr and not namespace:
            from logging import MissingNamespace
            self.log(
                MissingNamespace({
                    "parent": self.name,
                    "element": attr
                }))

    for key, string in attrs.items():
        for c in string:
            # U+FFFD is reported alongside 0x80-0x9F, matching the
            # character-data checks.
            if 0x80 <= ord(c) <= 0x9F or c == u'\ufffd':
                from validators import BadCharacters
                self.log(
                    BadCharacters({
                        "parent": name,
                        "element": key[-1]
                    }))

    if qname:
        handler = self.unknown_starttag(name, qname, attrs)
        name = "unknown_" + name
    else:
        try:
            self.child = name
            handler = getattr(self, "do_" + name.replace("-", "_"))()
        except AttributeError:
            # qname is always falsy on this path, so the only outcomes are
            # a missing-namespace report or an undefined-element report.
            if name.find(':') != -1:
                from logging import MissingNamespace
                self.log(
                    MissingNamespace({
                        "parent": self.name,
                        "element": name
                    }))
                handler = eater()
            else:
                from logging import UndefinedElement
                self.log(
                    UndefinedElement({
                        "parent": ':'.join(self.name.split("_", 1)),
                        "element": name
                    }))
                handler = eater()

    self.push(handler, name, attrs)

    # MAP - always append name, even if already exists (we need this to
    # check for too many hour elements in skipHours, and it doesn't
    # hurt anything else)
    self.children.append(name)