Esempio n. 1
0
class MWXMLDump2CDB(MWXMLDumpFilter):
    """Dump filter that records pages and revisions in a CDB file.

    Everything is written through a ``CDBMaker`` using these keys:

    * ``<pageid>:title``        -- the page title, UTF-8 encoded
    * ``<pageid>:revs``         -- the page's revision ids, space separated
    * ``<pageid>/<revid>:text`` -- the bytes produced by the gzip stream
                                   that open_file() returned

    NOTE(review): the order in which the base class invokes these hooks is
    not visible here; the methods assume start_page() .. end_page() bracket
    the revisions of one page and open_file() / close_file() bracket one
    revision text.
    """

    def __init__(self, path):
        """Initialise the base filter and create a ``CDBMaker`` for *path*."""
        MWXMLDumpFilter.__init__(self)
        self._maker = CDBMaker(path)
        # Key and in-memory buffer of the text currently being written;
        # both are None while no file is open.
        self._key = self._value = None
        # Revision ids collected for the current page.  Created here as well
        # as in start_page() so that start_revision() / end_page() cannot
        # fail with AttributeError when they run before any start_page().
        self._revs = []

    def close(self):
        """Close the base filter, then finalise the CDB with ``finish()``."""
        MWXMLDumpFilter.close(self)
        self._maker.finish()

    def start_page(self, pageid, title):
        """Store the page title and start a fresh revision list."""
        MWXMLDumpFilter.start_page(self, pageid, title)
        self._maker.add('%s:title' % pageid, title.encode('utf-8'))
        self._revs = []

    def start_revision(self, pageid, title, revid, timestamp):
        """Remember *revid* as belonging to the current page."""
        MWXMLDumpFilter.start_revision(self, pageid, title, revid, timestamp)
        self._revs.append(revid)

    def end_page(self, pageid, title):
        """Store the collected revision ids under ``<pageid>:revs``."""
        MWXMLDumpFilter.end_page(self, pageid, title)
        revs = ' '.join(str(revid) for revid in self._revs)
        self._maker.add('%s:revs' % pageid, revs)

    def open_file(self, pageid, title, revid, timestamp):
        """Return a writable gzip stream for the text of one revision.

        The stream compresses into an in-memory buffer; close_file() later
        stores that buffer under the key ``<pageid>/<revid>:text``.
        """
        # Progress trace on stderr.
        print >>sys.stderr, (pageid, title, revid)
        self._key = '%s/%s:text' % (pageid, revid)
        self._value = StringIO()
        return GzipFile(mode='w', fileobj=self._value)

    def close_file(self, fp):
        """Close *fp* and write the buffered data to the CDB."""
        # The stream must be closed before the buffer is read so that the
        # gzip trailer is part of the stored value.
        fp.close()
        self._maker.add(self._key, self._value.getvalue())
        self._key = self._value = None

    def write_file(self, fp, text):
        """Write *text* to *fp* as UTF-8 encoded bytes."""
        fp.write(text.encode('utf-8'))
Esempio n. 2
0
class MWCDB2Text(object):
    """Run gzip-compressed revision records of one CDB through a text parser.

    Each ``<pageid>/<revision>`` record of the source CDB is decompressed,
    fed to a parser obtained from ``factory`` and the parser's output is
    stored, gzip-compressed, under the same key in the destination CDB.
    """

    def __init__(self, srcpath, dstpath, factory):
        """Set up the converter.

        srcpath -- passed to ``CDBReader`` (source database)
        dstpath -- passed to ``CDBMaker`` (destination database)
        factory -- callable invoked as ``factory('utf-8')``; the object it
                   returns must offer feed_text(), close() and convert()
        """
        self.reader = CDBReader(srcpath)
        self.writer = CDBMaker(dstpath)
        self.factory = factory

    def close(self):
        """Finalise the destination CDB by calling ``finish()`` on it."""
        self.writer.finish()

    def convert(self, pageid, revision=0):
        """Convert one revision and copy the title record of its page.

        The record ``<pageid>/<revision>`` is decompressed and decoded as
        UTF-8, handed to a new parser, and whatever the parser writes in
        convert() is stored gzip-compressed under the same key.  Afterwards
        the ``<pageid>:title`` record is copied unchanged.

        Nothing is written to the destination when the parser raises; both
        gzip streams are closed in that case as well.
        """
        key = '%d/%d' % (pageid, revision)
        src = GzipFile(mode='r', fileobj=StringIO(self.reader[key]))
        dstbuf = StringIO()
        dst = GzipFile(mode='w', fileobj=dstbuf)
        try:
            textparser = self.factory('utf-8')
            textparser.feed_text(src.read().decode('utf-8'))
            textparser.close()
            textparser.convert(dst)
        finally:
            # Always release both streams.  Closing dst also flushes the
            # gzip trailer into dstbuf, so it has to happen before the
            # buffer is read below.
            src.close()
            dst.close()
        self.writer.add(key, dstbuf.getvalue())
        title_key = '%d:title' % pageid
        self.writer.add(title_key, self.reader[title_key])

    def convert_all(self):
        """Convert every ``<pageid>/<revision>`` record of the source CDB.

        Keys whose parts around the last '/' are not both integers (for
        example ``<pageid>:title``) are skipped.
        """
        for key in self.reader:
            try:
                # Unpacking raises ValueError when the key has no '/',
                # int() raises it for non-numeric parts.
                head, tail = key.rsplit('/', 1)
                pageid = int(head)
                revision = int(tail)
            except ValueError:
                continue
            # Progress trace on stderr.
            print >>sys.stderr, (pageid,revision)
            self.convert(pageid, revision)
Esempio n. 3
0
 def __init__(self, srcpath, dstpath, factory):
     """Create a CDBReader for srcpath and a CDBMaker for dstpath.

     factory is stored unchanged on the instance.
     """
     self.factory = factory
     # The reader is still constructed before the writer.
     self.reader = CDBReader(srcpath)
     self.writer = CDBMaker(dstpath)
Esempio n. 4
0
 def __init__(self, path):
     """Initialise the base filter and create a CDBMaker for path."""
     MWXMLDumpFilter.__init__(self)
     self._maker = CDBMaker(path)
     # No key/value pair is pending yet.
     self._key = None
     self._value = None