Example #1
0
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None):
  """Validate a feed held in a string and return the validator object.

  Arguments:
    aString             -- the document text; it is re-encoded with
                           xmlEncoding.asUTF8() before being parsed
    firstOccurrenceOnly -- forwarded to validator.setFirstOccurrenceOnly()
    loggedEvents        -- events collected so far by the caller; they are
                           appended to validator.loggedEvents
    base                -- passed to SAXDispatcher; also used as the only
                           self URI when selfURIs is empty or None
    encoding            -- passed to SAXDispatcher
    selfURIs            -- optional list passed to SAXDispatcher

  Returns the SAXDispatcher instance that received the parse events.
  """
  from xml.sax import make_parser, handler
  from base import SAXDispatcher
  # NOTE(review): UnicodeError is also a built-in.  The project's logging
  # module has a class of the same name (used below), so this explicit import
  # presumably protects against the name being shadowed at module level --
  # confirm before removing it.
  from exceptions import UnicodeError
  from cStringIO import StringIO

  # By now, aString should be Unicode
  source = InputSource()
  source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

  validator = SAXDispatcher(base, selfURIs or [base], encoding)
  validator.setFirstOccurrenceOnly(firstOccurrenceOnly)

  validator.loggedEvents += loggedEvents

  # experimental RSS-Profile draft 1.06 support: hand the validator the name
  # of every named entity reference (&name;) that occurs in the text
  validator.setLiterals(re.findall(r'&(\w+);', aString))

  # Report an XML declaration that announces anything other than version 1.0
  xmlver = re.match(r"^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
  if xmlver and xmlver.group(1) != '1.0':
    validator.log(logging.BadXmlVersion({"version":xmlver.group(1)}))

  try:
    from xml.sax.expatreader import ExpatParser
    class fake_dtd_parser(ExpatParser):
      # Turn on expat's UseForeignDTD every time the parser is (re)initialised
      def reset(self):
        ExpatParser.reset(self)
        self._parser.UseForeignDTD(1)
    parser = fake_dtd_parser()
  except Exception:
    # expat reader unavailable or unusable: fall back to the default parser
    parser = make_parser()

  parser.setFeature(handler.feature_namespaces, 1)
  parser.setContentHandler(validator)
  parser.setErrorHandler(validator)
  parser.setEntityResolver(validator)
  if hasattr(parser, '_ns_stack'):
    # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
    # PyXML doesn't have this problem, and it doesn't have _ns_stack either
    parser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})

  def xmlvalidate(log):
    """Run libxml2's validating reader over aString and report via log().

    log is called once for every collected message line that starts with
    the random prefix given to the reader.
    """
    import libxml2
    from StringIO import StringIO
    from random import random

    # The prefix is handed to newTextReader() and then used to pick the
    # relevant lines out of everything the error handler collected.
    prefix="...%s..." % str(random()).replace('0.','')
    chunks=[]
    libxml2.registerErrorHandler(lambda ctx,text: ctx.append(text), chunks)

    buf = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
    reader = buf.newTextReader(prefix)
    reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
    ret = reader.Read()
    while ret == 1: ret = reader.Read()

    for line in ''.join(chunks).splitlines():
      # keep only the text after the (at most) fourth colon
      if line.startswith(prefix): log(line.split(':',4)[-1].strip())
  validator.xmlvalidator=xmlvalidate

  try:
    parser.parse(source)
  except SAXException:
    # presumably already reported through the validator, which is also the
    # parser's error handler
    pass
  except UnicodeError:
    import sys
    validator.log(logging.UnicodeError({"exception":sys.exc_info()[1]}))

  if validator.getFeedType() == TYPE_RSS1:
    # Second pass for RSS 1.0: parse again with rdflib's RDF/XML handler and
    # log whatever it reports as InvalidRDF.  Strictly best effort.
    try:
      from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler

      class Handler(RDFXMLHandler):
        ns_prefix_map = {}
        prefix_ns_map = {}
        def add(self, triple): pass   # triples are discarded
        def __init__(self, dispatcher):
          # the handler is passed to the base class as its own first argument
          RDFXMLHandler.__init__(self, self)
          self.dispatcher=dispatcher
        def error(self, message):
          self.dispatcher.log(InvalidRDF({"message": message}))

      # rewind the buffer so the same document can be parsed a second time
      source.getByteStream().reset()
      parser.reset()
      parser.setContentHandler(Handler(parser.getContentHandler()))
      parser.setErrorHandler(handler.ErrorHandler())
      parser.parse(source)
    except Exception:
      # rdflib missing or the RDF pass failed: deliberately ignored, but
      # KeyboardInterrupt/SystemExit are no longer swallowed
      pass

  return validator
class DataImporter(Digester):
    """Apply the events of one bzip2-compressed XML packet to a database.

    The packet is parsed through the Digester rules registered in
    _add_rules(); every <event> element becomes one INSERT, UPDATE or DELETE
    statement executed on a cursor of ictx['conn'].connection.  Nothing is
    committed until close() is called after a successful load().
    """

    def __init__(self, ictx, file):
        """Open the packet and prepare a database cursor.

        ictx -- mapping; this class reads 'conn', 'schema_seq' and
                'replication_seq' from it
        file -- object whose .name is the path of the bzip2 packet; it is
                closed by close()
        """
        Digester.__init__(self)
        self._ictx = ictx
        self._file = file
        self._input = InputSource(file.name)
        self._input.setByteStream(BZ2File(file.name, 'r'))
        self._conn = ictx['conn'].connection
        self._cursor = self._conn.cursor()
        self.success = self._closed = False
        self._add_rules()

    def _add_rules(self):
        """Register the Digester callbacks for the packet structure."""
        self.addOnBegin('packet', self._check_packet)
        self.addOnBeginAndEnd('packet/transaction/event',
                              self._on_event, self._on_event_end)
        self.addOnBody('packet/transaction/event/keys/column',
                       self._on_key_column)
        self.addOnBody('packet/transaction/event/values/column',
                       self._on_value_column)
        self.addOnFinish(self._on_finish)

    def _check_packet(self, tag, attrs):
        """Raise Exception if the <packet> sequence numbers are unexpected.

        schema_seq is checked first, then replication_seq; each attribute is
        compared (as int) with the entry of the same name in the context.
        """
        for name in ('schema_seq', 'replication_seq'):
            actual = attrs.getValue(name)
            expected = self._ictx[name]
            if expected != int(actual):
                # Format the message here: Exception() does not interpolate
                # extra arguments into a '{0}' template by itself.
                raise Exception(
                    '<packet> {0}: {1} not matched the expected seq number {2}'
                    .format(name, actual, expected))

    def _on_key_column(self, tag, attrs, val):
        """Record a key column's text on the event currently being built."""
        event = self.peek()
        event['keys'][attrs.getValue('name')] = val

    def _on_value_column(self, tag, attrs, val):
        """Record a value column; a column with null="yes" is stored as None."""
        event = self.peek()
        is_null = attrs.get('null') == 'yes'
        event['values'][attrs.getValue('name')] = None if is_null else val

    def _on_event(self, tag, attrs):
        """Start a new event record and push it on the Digester stack."""
        event = {
            'op': attrs.getValue('op'),
            'table': attrs.getValue('table'),
            'keys': OrderedDict(),    # column name -> column value
            'values': OrderedDict(),  # column name -> column value
        }
        self.push(event)

    def _on_event_end(self, tag):
        """Turn the finished event into SQL and execute it.

        op 'I' inserts the value columns, 'U' updates them and 'D' deletes;
        'U' and 'D' are restricted by the key columns.  Any other op raises
        Exception.
        """
        event = self.pop()
        op = event['op']
        table = event['table']
        keys = event['keys']
        values = event['values']

        # NOTE(review): table and column names are interpolated straight from
        # the packet (identifiers cannot be bound as parameters); this assumes
        # packets come from a trusted source -- confirm.
        if op == 'I':
            sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
                table, ', '.join(values), ', '.join(['%s'] * len(values)))
            # list(): the parameters must be a real, extendable list
            params = list(values.values())
        elif op == 'U':
            assignments = ', '.join('%s=%%s' % col for col in values)
            sql = 'UPDATE %s SET %s' % (table, assignments)
            params = list(values.values())
        elif op == 'D':
            sql = 'DELETE FROM %s' % table
            params = []
        else:
            raise Exception('Invalid <event> op: %s' % op)

        if op in ('U', 'D'):
            # a None key is compared with IS, everything else with =
            conditions = ' AND '.join(
                '%s%s%%s' % (col, ' IS ' if keys[col] is None else '=')
                for col in keys)
            sql += ' WHERE ' + conditions
            params.extend(keys.values())

        self._cursor.execute(sql, params)

    def _on_finish(self):
        """Digester end-of-document hook; nothing to do."""
        pass

    def load(self):
        """Parse the packet, executing its events, then flag success.

        self.success stays False if parsing raises.
        """
        logger.warning('Saving dataset....')
        self.parse(self._input)
        self.success = True

    def recover(self):
        """Dirty hack to remove weird characters present in some replication
        files, using the external ``tidy`` tool.

        The packet is decompressed to a temporary file, run through
        ``tidy -xml``, and the importer is re-initialised on tidy's output and
        loaded again.  Both temporary files are removed afterwards.
        """
        logger.warning('Trying to recover invalid XML...')
        originalXML = None
        fixedXML = None
        try:
            # decompressed copy of the packet
            originalXML = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
            # output file for tidy; only its name is needed for now
            fixedXML = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
            fixedXML.close()

            # Fetch uncompressed file data to recover
            bzf = self._input.getByteStream()
            bzf.seek(0)
            shutil.copyfileobj(bzf, originalXML)
            originalXML.close()

            cmd = ['tidy', '-xml', '-o', fixedXML.name, originalXML.name]
            logger.warning('Running: %s', ' '.join(cmd))
            ret = subprocess.call(cmd)
            if ret:
                # Not fatal: the output is tried regardless, but leave a trace
                logger.warning('tidy exited with status %s', ret)

            # ready to load: close() ends the failed attempt, then every
            # piece of state is rebuilt on top of the fixed file
            self.close()
            self._file = open(fixedXML.name, 'r')
            self._input = InputSource(fixedXML.name)
            self._input.setByteStream(self._file)
            self._cursor = self._conn.cursor()
            self.success = self._closed = False
            self.reset()
            self._add_rules()
            self.load()
        finally:
            for f in [originalXML, fixedXML]:
                if f and not f.closed:
                    f.close()
                if f and os.path.exists(f.name):
                    os.unlink(f.name)

    def close(self):
        """Commit if load() succeeded, otherwise roll back, then release the
        cursor, the byte stream and the file.  Later calls are no-ops.
        """
        if self._closed:
            return
        try:
            if self.success:
                self._conn.commit()
                logger.warning('Done')
            else:
                logger.warning('Rolling back transaction. Seq number: {0}'.format(
                    self._ictx['replication_seq']))
                self._conn.rollback()
            self._cursor.close()
        finally:
            self._closed = True
            try:
                self._input.getByteStream().close()
            finally:
                # still close the file if closing the stream raised
                self._file.close()
Example #3
0
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None, mediaType=None):
  """Validate a feed held in a string and return the validator object.

  Arguments:
    aString             -- the document text; it is re-encoded with
                           xmlEncoding.asUTF8() before being parsed
    firstOccurrenceOnly -- forwarded to validator.setFirstOccurrenceOnly()
    loggedEvents        -- events collected so far by the caller; they are
                           appended to validator.loggedEvents (and may get a
                           WPBlankLine event appended in place)
    base                -- passed to SAXDispatcher; also used as the only
                           self URI when selfURIs is empty or None
    encoding            -- passed to SAXDispatcher
    selfURIs            -- optional list passed to SAXDispatcher
    mediaType           -- optional; 'application/atomsvc+xml' and
                           'application/atomcat+xml' preset the feed type

  Returns the SAXDispatcher instance that received the parse events.
  """
  from xml.sax import make_parser, handler
  from .base import SAXDispatcher
  # NOTE(review): UnicodeError is also a built-in.  The project's logging
  # module has a class of the same name (used below), so this explicit import
  # presumably protects against the name being shadowed at module level --
  # confirm before removing it.
  from exceptions import UnicodeError
  from cStringIO import StringIO

  # Leading whitespace before the XML declaration in a feed whose generator
  # mentions wordpress: log it once and repair the text.
  if re.match(r"^\s+<\?xml", aString) and re.search(r"<generator.*wordpress.*</generator>", aString):
    lt = aString.find('<'); gt = aString.find('>')
    if lt > 0 and gt > 0 and lt < gt:
      loggedEvents.append(logging.WPBlankLine({'line':1,'column':1}))
      # rearrange so that other errors can be found: the first tag is moved
      # in front of the text that preceded it
      aString = aString[lt:gt+1]+aString[0:lt]+aString[gt+1:]

  # By now, aString should be Unicode
  source = InputSource()
  source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

  validator = SAXDispatcher(base, selfURIs or [base], encoding)
  validator.setFirstOccurrenceOnly(firstOccurrenceOnly)

  if mediaType == 'application/atomsvc+xml':
    validator.setFeedType(TYPE_APP_SERVICE)
  elif mediaType == 'application/atomcat+xml':
    validator.setFeedType(TYPE_APP_CATEGORIES)

  validator.loggedEvents += loggedEvents

  # experimental RSS-Profile support: one flag per input line telling whether
  # that line contains the text '&#x'
  validator.rssCharData = ['&#x' in s for s in aString.split('\n')]

  # Report an XML declaration that announces anything other than version 1.0
  xmlver = re.match(r"^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
  if xmlver and xmlver.group(1) != '1.0':
    validator.log(logging.BadXmlVersion({"version":xmlver.group(1)}))

  try:
    from xml.sax.expatreader import ExpatParser
    class fake_dtd_parser(ExpatParser):
      # Turn on expat's UseForeignDTD every time the parser is (re)initialised
      def reset(self):
        ExpatParser.reset(self)
        self._parser.UseForeignDTD(1)
    parser = fake_dtd_parser()
  except Exception:
    # expat reader unavailable or unusable: fall back to the default parser
    parser = make_parser()

  parser.setFeature(handler.feature_namespaces, 1)
  parser.setContentHandler(validator)
  parser.setErrorHandler(validator)
  parser.setEntityResolver(validator)
  if hasattr(parser, '_ns_stack'):
    # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
    # PyXML doesn't have this problem, and it doesn't have _ns_stack either
    parser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})

  def xmlvalidate(log):
    """Run libxml2's validating reader over aString and report via log().

    log is called once for every collected message line that starts with
    the random prefix given to the reader.
    """
    import libxml2
    from StringIO import StringIO
    from random import random

    # The prefix is handed to newTextReader() and then used to pick the
    # relevant lines out of everything the error handler collected.
    prefix="...%s..." % str(random()).replace('0.','')
    chunks=[]
    libxml2.registerErrorHandler(lambda ctx,text: ctx.append(text), chunks)

    buf = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
    reader = buf.newTextReader(prefix)
    reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
    ret = reader.Read()
    while ret == 1: ret = reader.Read()

    for line in ''.join(chunks).splitlines():
      # keep only the text after the (at most) fourth colon
      if line.startswith(prefix): log(line.split(':',4)[-1].strip())
  validator.xmlvalidator=xmlvalidate

  try:
    parser.parse(source)
  except SAXException:
    # presumably already reported through the validator, which is also the
    # parser's error handler
    pass
  except UnicodeError:
    import sys
    validator.log(logging.UnicodeError({"exception":sys.exc_info()[1]}))

  if validator.getFeedType() == TYPE_RSS1:
    # Second pass for RSS 1.0: parse again with rdflib's RDF/XML handler and
    # log whatever it reports as InvalidRDF.  Strictly best effort.
    try:
      from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler

      class Handler(RDFXMLHandler):
        ns_prefix_map = {}
        prefix_ns_map = {}
        def add(self, triple): pass   # triples are discarded
        def __init__(self, dispatcher):
          # the handler is passed to the base class as its own first argument
          RDFXMLHandler.__init__(self, self)
          self.dispatcher=dispatcher
        def error(self, message):
          self.dispatcher.log(InvalidRDF({"message": message}))

      # rewind the buffer so the same document can be parsed a second time
      source.getByteStream().reset()
      parser.reset()
      parser.setContentHandler(Handler(parser.getContentHandler()))
      parser.setErrorHandler(handler.ErrorHandler())
      parser.parse(source)
    except Exception:
      # rdflib missing or the RDF pass failed: deliberately ignored, but
      # KeyboardInterrupt/SystemExit are no longer swallowed
      pass

  return validator
Example #4
0
class DataImporter(Digester):
    """Apply the events of one bzip2-compressed XML packet to a database.

    The packet is parsed through the Digester rules registered in
    _add_rules(); every <event> element becomes one INSERT, UPDATE or DELETE
    statement executed on a cursor of ictx['conn'].connection.  Nothing is
    committed until close() is called after a successful load().
    """

    def __init__(self, ictx, file):
        """Open the packet and prepare a database cursor.

        ictx -- mapping; this class reads 'conn', 'schema_seq' and
                'replication_seq' from it
        file -- object whose .name is the path of the bzip2 packet; it is
                closed by close()
        """
        Digester.__init__(self)
        self._ictx = ictx
        self._file = file
        self._input = InputSource(file.name)
        self._input.setByteStream(BZ2File(file.name, 'r'))
        self._conn = ictx['conn'].connection
        self._cursor = self._conn.cursor()
        self.success = self._closed = False
        self._add_rules()

    def _add_rules(self):
        """Register the Digester callbacks for the packet structure."""
        self.addOnBegin('packet', self._check_packet)
        self.addOnBeginAndEnd('packet/transaction/event', self._on_event,
                              self._on_event_end)
        self.addOnBody('packet/transaction/event/keys/column',
                       self._on_key_column)
        self.addOnBody('packet/transaction/event/values/column',
                       self._on_value_column)
        self.addOnFinish(self._on_finish)

    def _check_packet(self, tag, attrs):
        """Raise Exception if the <packet> sequence numbers are unexpected.

        schema_seq is checked first, then replication_seq; each attribute is
        compared (as int) with the entry of the same name in the context.
        """
        for name in ('schema_seq', 'replication_seq'):
            actual = attrs.getValue(name)
            expected = self._ictx[name]
            if expected != int(actual):
                # Format the message here: Exception() does not interpolate
                # extra arguments into a '{0}' template by itself.
                raise Exception(
                    '<packet> {0}: {1} not matched the expected seq number {2}'
                    .format(name, actual, expected))

    def _on_key_column(self, tag, attrs, val):
        """Record a key column's text on the event currently being built."""
        event = self.peek()
        event['keys'][attrs.getValue('name')] = val

    def _on_value_column(self, tag, attrs, val):
        """Record a value column; a column with null="yes" is stored as None."""
        event = self.peek()
        is_null = attrs.get('null') == 'yes'
        event['values'][attrs.getValue('name')] = None if is_null else val

    def _on_event(self, tag, attrs):
        """Start a new event record and push it on the Digester stack."""
        event = {
            'op': attrs.getValue('op'),
            'table': attrs.getValue('table'),
            'keys': OrderedDict(),  # column name -> column value
            'values': OrderedDict(),  # column name -> column value
        }
        self.push(event)

    def _on_event_end(self, tag):
        """Turn the finished event into SQL and execute it.

        op 'I' inserts the value columns, 'U' updates them and 'D' deletes;
        'U' and 'D' are restricted by the key columns.  Any other op raises
        Exception.
        """
        event = self.pop()
        op = event['op']
        table = event['table']
        keys = event['keys']
        values = event['values']

        # NOTE(review): table and column names are interpolated straight from
        # the packet (identifiers cannot be bound as parameters); this assumes
        # packets come from a trusted source -- confirm.
        if op == 'I':
            sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
                table, ', '.join(values), ', '.join(['%s'] * len(values)))
            # list(): the parameters must be a real, extendable list
            params = list(values.values())
        elif op == 'U':
            assignments = ', '.join('%s=%%s' % col for col in values)
            sql = 'UPDATE %s SET %s' % (table, assignments)
            params = list(values.values())
        elif op == 'D':
            sql = 'DELETE FROM %s' % table
            params = []
        else:
            raise Exception('Invalid <event> op: %s' % op)

        if op in ('U', 'D'):
            # a None key is compared with IS, everything else with =
            conditions = ' AND '.join(
                '%s%s%%s' % (col, ' IS ' if keys[col] is None else '=')
                for col in keys)
            sql += ' WHERE ' + conditions
            params.extend(keys.values())

        self._cursor.execute(sql, params)

    def _on_finish(self):
        """Digester end-of-document hook; nothing to do."""
        pass

    def load(self):
        """Parse the packet, executing its events, then flag success.

        self.success stays False if parsing raises.
        """
        logger.warning('Saving dataset....')
        self.parse(self._input)
        self.success = True

    def recover(self):
        """Dirty hack to remove weird characters present in some replication
        files, using the external ``tidy`` tool.

        The packet is decompressed to a temporary file, run through
        ``tidy -xml``, and the importer is re-initialised on tidy's output and
        loaded again.  Both temporary files are removed afterwards.
        """
        logger.warning('Trying to recover invalid XML...')
        originalXML = None
        fixedXML = None
        try:
            # decompressed copy of the packet
            originalXML = tempfile.NamedTemporaryFile(
                suffix='.xml', delete=False)
            # output file for tidy; only its name is needed for now
            fixedXML = tempfile.NamedTemporaryFile(
                suffix='.xml', delete=False)
            fixedXML.close()

            # Fetch uncompressed file data to recover
            bzf = self._input.getByteStream()
            bzf.seek(0)
            shutil.copyfileobj(bzf, originalXML)
            originalXML.close()

            cmd = ['tidy', '-xml', '-o', fixedXML.name, originalXML.name]
            logger.warning('Running: %s', ' '.join(cmd))
            ret = subprocess.call(cmd)
            if ret:
                # Not fatal: the output is tried regardless, but leave a trace
                logger.warning('tidy exited with status %s', ret)

            # ready to load: close() ends the failed attempt, then every
            # piece of state is rebuilt on top of the fixed file
            self.close()
            self._file = open(fixedXML.name, 'r')
            self._input = InputSource(fixedXML.name)
            self._input.setByteStream(self._file)
            self._cursor = self._conn.cursor()
            self.success = self._closed = False
            self.reset()
            self._add_rules()
            self.load()
        finally:
            for f in [originalXML, fixedXML]:
                if f and not f.closed:
                    f.close()
                if f and os.path.exists(f.name):
                    os.unlink(f.name)

    def close(self):
        """Commit if load() succeeded, otherwise roll back, then release the
        cursor, the byte stream and the file.  Later calls are no-ops.
        """
        if self._closed:
            return
        try:
            if self.success:
                self._conn.commit()
                logger.warning('Done')
            else:
                logger.warning(
                    'Rolling back transaction. Seq number: {0}'.format(
                        self._ictx['replication_seq']))
                self._conn.rollback()
            self._cursor.close()
        finally:
            self._closed = True
            try:
                self._input.getByteStream().close()
            finally:
                # still close the file if closing the stream raised
                self._file.close()
Example #5
0
def _validate(aString,
              firstOccurrenceOnly,
              loggedEvents,
              base,
              encoding,
              selfURIs=None,
              mediaType=None):
    """Validate a feed held in a string and return the validator object.

    Arguments:
      aString             -- the document text; it is re-encoded with
                             xmlEncoding.asUTF8() before being parsed
      firstOccurrenceOnly -- forwarded to validator.setFirstOccurrenceOnly()
      loggedEvents        -- events collected so far by the caller; they are
                             appended to validator.loggedEvents (and may get
                             a WPBlankLine event appended in place)
      base                -- passed to SAXDispatcher; also used as the only
                             self URI when selfURIs is empty or None
      encoding            -- passed to SAXDispatcher
      selfURIs            -- optional list passed to SAXDispatcher
      mediaType           -- optional; 'application/atomsvc+xml' and
                             'application/atomcat+xml' preset the feed type

    Returns the SAXDispatcher instance that received the parse events.
    """
    from xml.sax import make_parser, handler
    from .base import SAXDispatcher
    # NOTE(review): UnicodeError is also a built-in.  The project's logging
    # module has a class of the same name (used below), so this explicit
    # import presumably protects against the name being shadowed at module
    # level -- confirm before removing it.
    from exceptions import UnicodeError
    from cStringIO import StringIO

    # Leading whitespace before the XML declaration in a feed whose generator
    # mentions wordpress: log it once and repair the text.
    if re.match(r"^\s+<\?xml", aString) and re.search(
            r"<generator.*wordpress.*</generator>", aString):
        lt = aString.find('<')
        gt = aString.find('>')
        if lt > 0 and gt > 0 and lt < gt:
            loggedEvents.append(logging.WPBlankLine({'line': 1, 'column': 1}))
            # rearrange so that other errors can be found: the first tag is
            # moved in front of the text that preceded it
            aString = aString[lt:gt + 1] + aString[0:lt] + aString[gt + 1:]

    # By now, aString should be Unicode
    source = InputSource()
    source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

    validator = SAXDispatcher(base, selfURIs or [base], encoding)
    validator.setFirstOccurrenceOnly(firstOccurrenceOnly)

    if mediaType == 'application/atomsvc+xml':
        validator.setFeedType(TYPE_APP_SERVICE)
    elif mediaType == 'application/atomcat+xml':
        validator.setFeedType(TYPE_APP_CATEGORIES)

    validator.loggedEvents += loggedEvents

    # experimental RSS-Profile support: one flag per input line telling
    # whether that line contains the text '&#x'
    validator.rssCharData = ['&#x' in s for s in aString.split('\n')]

    # Report an XML declaration that announces anything other than version 1.0
    xmlver = re.match(
        r"^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
    if xmlver and xmlver.group(1) != '1.0':
        validator.log(logging.BadXmlVersion({"version": xmlver.group(1)}))

    try:
        from xml.sax.expatreader import ExpatParser

        class fake_dtd_parser(ExpatParser):
            # Turn on expat's UseForeignDTD every time the parser is
            # (re)initialised
            def reset(self):
                ExpatParser.reset(self)
                self._parser.UseForeignDTD(1)

        parser = fake_dtd_parser()
    except Exception:
        # expat reader unavailable or unusable: fall back to the default
        parser = make_parser()

    parser.setFeature(handler.feature_namespaces, 1)
    parser.setContentHandler(validator)
    parser.setErrorHandler(validator)
    parser.setEntityResolver(validator)
    if hasattr(parser, '_ns_stack'):
        # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
        # PyXML doesn't have this problem, and it doesn't have _ns_stack either
        parser._ns_stack.append(
            {'http://www.w3.org/XML/1998/namespace': 'xml'})

    def xmlvalidate(log):
        """Run libxml2's validating reader over aString and report via log().

        log is called once for every collected message line that starts with
        the random prefix given to the reader.
        """
        import libxml2
        from StringIO import StringIO
        from random import random

        # The prefix is handed to newTextReader() and then used to pick the
        # relevant lines out of everything the error handler collected.
        prefix = "...%s..." % str(random()).replace('0.', '')
        chunks = []
        libxml2.registerErrorHandler(lambda ctx, text: ctx.append(text),
                                     chunks)

        buf = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
        reader = buf.newTextReader(prefix)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
        ret = reader.Read()
        while ret == 1:
            ret = reader.Read()

        for line in ''.join(chunks).splitlines():
            if line.startswith(prefix):
                # keep only the text after the (at most) fourth colon
                log(line.split(':', 4)[-1].strip())

    validator.xmlvalidator = xmlvalidate

    try:
        parser.parse(source)
    except SAXException:
        # presumably already reported through the validator, which is also
        # the parser's error handler
        pass
    except UnicodeError:
        import sys
        validator.log(logging.UnicodeError({"exception": sys.exc_info()[1]}))

    if validator.getFeedType() == TYPE_RSS1:
        # Second pass for RSS 1.0: parse again with rdflib's RDF/XML handler
        # and log whatever it reports as InvalidRDF.  Strictly best effort.
        try:
            from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler

            class Handler(RDFXMLHandler):
                ns_prefix_map = {}
                prefix_ns_map = {}

                def add(self, triple):
                    # triples are discarded
                    pass

                def __init__(self, dispatcher):
                    # the handler is passed to the base class as its own
                    # first argument
                    RDFXMLHandler.__init__(self, self)
                    self.dispatcher = dispatcher

                def error(self, message):
                    self.dispatcher.log(InvalidRDF({"message": message}))

            # rewind the buffer so the document can be parsed a second time
            source.getByteStream().reset()
            parser.reset()
            parser.setContentHandler(Handler(parser.getContentHandler()))
            parser.setErrorHandler(handler.ErrorHandler())
            parser.parse(source)
        except Exception:
            # rdflib missing or the RDF pass failed: deliberately ignored,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            pass

    return validator