def _decodeDeclaration(sig, dec, permitted, loggedEvents):
    """Read the encoding declaration from `sig` and check it against `permitted`.

    `sig` is decoded with `dec` (only the first element of its result is
    used), newlines are normalised, and the declaration is looked up with
    `_encodingFromDecl`.

    Events logged:
      * no declaration found -> a UnicodeError event at position (1, 1)
        saying that a declaration is required;
      * a declaration whose upper-cased name is not in `permitted`, whose
        codec is known, and whose codec is not an alias of any permitted
        encoding -> a UnicodeError event at `eo[1]`.

    Returns whatever `_encodingFromDecl` returned (presumably an
    (encoding, position) pair or a false value -- confirm against that
    helper).
    """
    text = _normaliseNewlines(dec(sig)[0])
    eo = _encodingFromDecl(text)

    if not eo:
        _logEvent(
            loggedEvents,
            logging.UnicodeError({
                'exception':
                'This XML file (apparently ' + permitted[0] +
                ') requires an encoding declaration'
            }), (1, 1))
        return eo

    # Nothing to compare against, or the declared name is directly permitted.
    if not permitted:
        return eo
    declared = eo[0]
    if declared.upper() in permitted:
        return eo

    # Unknown codecs are not reported here.
    if not _hasCodec(declared):
        return eo

    # The declared name may be an alias of a permitted encoding: compare the
    # last element of the codec lookup results.
    codec = codecs.lookup(declared)
    is_alias = any(
        _hasCodec(candidate) and codecs.lookup(candidate)[-1] == codec[-1]
        for candidate in permitted)
    if not is_alias:
        _logEvent(
            loggedEvents,
            logging.UnicodeError({
                'exception':
                'This XML file claims an encoding of ' + declared +
                ', but looks more like ' + permitted[0]
            }), eo[1])
    return eo
Exemple #2
0
def _decodePostBOMDeclaration(sig, dec, permitted, loggedEvents, fallback=None):
  """Check the encoding declaration of a document that began with a BOM.

  `sig` is decoded with `dec`, newline-normalised and passed to
  `_encodingFromDecl`.

  Returns:
    * (fallback, None) when no declaration was found;
    * the declaration result when its upper-cased encoding name is in
      `permitted`;
    * None, after logging a UnicodeError event at `eo[1]`, when the
      declared encoding is not in `permitted`.
  """
  declaration = _encodingFromDecl(_normaliseNewlines(dec(sig)[0]))

  # No declaration at all: fall back to the caller-supplied encoding.
  if not declaration:
    return (fallback, None)

  # Declaration agrees with the BOM.
  if declaration[0].upper() in permitted:
    return declaration

  # Declaration contradicts the BOM: report it and give up.
  _logEvent(loggedEvents,
    logging.UnicodeError({'exception': 'Document starts with ' + permitted[0] + ' BOM marker but has incompatible declaration of ' + declaration[0]}), declaration[1])
  return None
Exemple #3
0
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None):
  """Validate a feed held in a string and return the validator object.

  Arguments:
    aString             -- the document text (expected to be Unicode by now).
    firstOccurrenceOnly -- forwarded to validator.setFirstOccurrenceOnly().
    loggedEvents        -- events collected so far; appended to the
                           validator's own loggedEvents list.
    base                -- passed to SAXDispatcher; also used as the only
                           self URI when `selfURIs` is empty or None.
    encoding            -- passed to SAXDispatcher.
    selfURIs            -- optional list of URIs; defaults to [base].

  Returns the SAXDispatcher instance after the document has been parsed.
  SAXException raised during parsing is swallowed; a UnicodeError raised
  during parsing is logged on the validator.
  """
  from xml.sax import make_parser, handler
  from base import SAXDispatcher
  # Imported explicitly from the builtin exceptions module.
  # NOTE(review): presumably because a module-level star import rebinds
  # the name UnicodeError -- confirm.
  from exceptions import UnicodeError
  from cStringIO import StringIO

  # By now, aString should be Unicode
  source = InputSource()
  source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

  validator = SAXDispatcher(base, selfURIs or [base], encoding)
  validator.setFirstOccurrenceOnly(firstOccurrenceOnly)

  validator.loggedEvents += loggedEvents

  # experimental RSS-Profile draft 1.06 support
  validator.setLiterals(re.findall(r'&(\w+);', aString))

  # Report any XML declaration whose version is not exactly 1.0.
  xmlver = re.match(
    r'''^<\?\s*xml\s+version\s*=\s*['"]([-a-zA-Z0-9_.:]*)['"]''', aString)
  if xmlver and xmlver.group(1) != '1.0':
    validator.log(logging.BadXmlVersion({"version": xmlver.group(1)}))

  # Prefer an expat parser that enables UseForeignDTD on every reset; fall
  # back to the default SAX parser when that is not available.
  try:
    from xml.sax.expatreader import ExpatParser
    class fake_dtd_parser(ExpatParser):
      def reset(self):
        ExpatParser.reset(self)
        self._parser.UseForeignDTD(1)
    parser = fake_dtd_parser()
  except Exception:
    parser = make_parser()

  parser.setFeature(handler.feature_namespaces, 1)
  parser.setContentHandler(validator)
  parser.setErrorHandler(validator)
  parser.setEntityResolver(validator)
  if hasattr(parser, '_ns_stack'):
    # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
    # PyXML doesn't have this problem, and it doesn't have _ns_stack either
    parser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})

  def xmlvalidate(log):
    """Run a validating libxml2 reader over aString and pass messages to log()."""
    import libxml2
    from StringIO import StringIO
    from random import random

    # Random marker used as the reader's URL so that only messages
    # mentioning this document are reported below.
    prefix = "...%s..." % str(random()).replace('0.', '')
    errors = []
    libxml2.registerErrorHandler(lambda ctx, text: ctx.append(text), errors)

    buf = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
    reader = buf.newTextReader(prefix)
    reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
    ret = reader.Read()
    while ret == 1:
      ret = reader.Read()

    for line in ''.join(errors).splitlines():
      if line.startswith(prefix):
        log(line.split(':', 4)[-1].strip())
  validator.xmlvalidator = xmlvalidate

  try:
    parser.parse(source)
  except SAXException:
    pass
  except UnicodeError as value:
    validator.log(logging.UnicodeError({"exception": value}))

  if validator.getFeedType() == TYPE_RSS1:
    # Second pass for RSS 1.0: feed the same bytes through rdflib's RDF/XML
    # handler.  This is best effort -- rdflib may be missing or the parse
    # may fail -- so ordinary errors are ignored, but interpreter-exit
    # exceptions (KeyboardInterrupt, SystemExit) are no longer swallowed.
    try:
      from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler

      class Handler(RDFXMLHandler):
        ns_prefix_map = {}
        prefix_ns_map = {}
        def add(self, triple): pass
        def __init__(self, dispatcher):
          # The handler is passed to the base class as its own first argument.
          RDFXMLHandler.__init__(self, self)
          self.dispatcher = dispatcher
        def error(self, message):
          self.dispatcher.log(InvalidRDF({"message": message}))

      source.getByteStream().reset()
      parser.reset()
      parser.setContentHandler(Handler(parser.getContentHandler()))
      parser.setErrorHandler(handler.ErrorHandler())
      parser.parse(source)
    except Exception:
      pass

  return validator
def decode(mediaType, charset, bs, loggedEvents, fallback=None):
    """Decode the byte string `bs`, logging encoding problems on the way.

    Arguments:
      mediaType    -- media type string or None.  A ``text/*`` type with no
                      charset logs TextXml and triggers the US-ASCII check.
      charset      -- charset parameter or None; when set it takes
                      precedence over the detected encoding.
      bs           -- the raw bytes to decode.
      loggedEvents -- list that events are appended to.
      fallback     -- encoding used when no encoding is known, or when the
                      chosen encoding has no codec.

    Returns a pair ``(enc, text)``.  ``text`` is None when no encoding
    could be determined at all.  When strict decoding fails, a UnicodeError
    event is logged and ``text`` is decoded with the 'replace' handler.
    """
    # NOTE(review): `fallback` is not forwarded to _detect.  This looks
    # deliberate, so that a missing encoding is still reported through
    # MissingEncoding below -- confirm before changing.
    eo = _detect(bs, loggedEvents, fallback=None)

    # Check declared encodings
    if eo and eo[1] and _hasCodec(eo[0]):
        if not isCommon(eo[0]):
            _logEvent(loggedEvents, ObscureEncoding({"encoding": eo[0]}),
                      eo[1])
        elif not isStandard(eo[0]):
            _logEvent(loggedEvents, NonstdEncoding({"encoding": eo[0]}), eo[1])

    if eo:
        encoding = eo[0]
    else:
        encoding = None

    if charset and encoding and charset.lower() != encoding.lower():
        # RFC 3023 requires us to use 'charset', but a number of aggregators
        # ignore this recommendation, so we should warn.
        loggedEvents.append(
            logging.EncodingMismatch({
                "charset": charset,
                "encoding": encoding
            }))

    if mediaType and mediaType.startswith("text/") and charset is None:
        loggedEvents.append(logging.TextXml({}))

        # RFC 3023 requires text/* to default to US-ASCII.  Issue a warning
        # if this occurs, but continue validation using the detected encoding
        try:
            bs.decode("US-ASCII")
        except UnicodeError:
            if not encoding:
                # Try the fallback; a missing (None), unknown or unsuitable
                # fallback simply leaves `encoding` unset.
                try:
                    bs.decode(fallback)
                    encoding = fallback
                except (LookupError, TypeError, ValueError):
                    pass
            if encoding and encoding.lower() != 'us-ascii':
                loggedEvents.append(
                    logging.EncodingMismatch({
                        "charset": "US-ASCII",
                        "encoding": encoding
                    }))

    enc = charset or encoding

    if enc is None:
        loggedEvents.append(logging.MissingEncoding({}))
        enc = fallback
    elif not _hasCodec(enc):
        if eo:
            _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}),
                      eo[1])
        else:
            _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}))
        enc = fallback

    if enc is None:
        return enc, None

    dec = getdecoder(enc)
    try:
        return enc, dec(bs)[0]
    except UnicodeError as ue:
        salvage = dec(bs, 'replace')[0]
        # `start` is an attribute of the exception object but is not stored
        # in its instance __dict__, so look it up with getattr.  Plain
        # UnicodeError instances may not carry it at all.
        start = getattr(ue, 'start', None)
        if start is not None:
            # XXX 'start' is in bytes, not characters. This is wrong for multibyte
            #  encodings
            pos = _position(salvage, start)
        else:
            pos = None

        _logEvent(loggedEvents, logging.UnicodeError({"exception": ue}), pos)

        return enc, salvage