Ejemplo n.º 1
0
 def assertCMSIdx(self, cms, fname, keys):
   """Assert that the decoded keys of an index file equal `keys`.

   The file is opened with CDBReader from <cms.basedir>/idx/<fname>, and
   every key it yields is decoded according to its first byte:

     0x00        whole key unpacked with '>xll' into a 2-tuple of ints
     0xfd, 0xfe  skipped
     0xff        skipped only when the key is exactly that single byte
     0x10..0x13  rest of the key decoded as UTF-8
     0x20        each remaining byte b mapped to code point 0x3000 + b
     0xf0        remaining 2, 3 or 4 bytes unpacked and rendered as
                 NNNN, NNNN/NN or NNNN/NN/NN
                 (NOTE(review): looks like year/month/day -- confirm)
     otherwise   rest of the key kept unchanged

   The decoded values, in iteration order, are compared with `keys`
   through self.assertEqual.
   """
   index_dir = os.path.join(cms.basedir, 'idx')
   reader = CDBReader(os.path.join(index_dir, fname))
   # For 0xf0 keys: payload length -> (struct layout, text template).
   numeric_layouts = {
     2: ('>h', '%04d'),
     3: ('>hb', '%04d/%02d'),
     4: ('>hbb', '%04d/%02d/%02d'),
   }
   decoded = []
   for raw in reader.iterkeys():
     tag = raw[0]
     if tag == '\x00':
       # The two unpacked integers are recorded together as one tuple.
       decoded.append(struct.unpack('>xll', raw))
       continue
     if tag in ('\xfd', '\xfe') or raw == '\xff':
       continue
     payload = raw[1:]
     # Fallback for unknown tags and for 0xf0 payloads of other lengths.
     word = payload
     if '\x10' <= tag <= '\x13':
       word = unicode(payload, 'utf-8')
     elif tag == '\x20':
       word = u''.join(unichr(0x3000 + ord(ch)) for ch in payload)
     elif tag == '\xf0' and len(payload) in numeric_layouts:
       (layout, template) = numeric_layouts[len(payload)]
       word = template % struct.unpack(layout, payload)
     decoded.append(word)
   self.assertEqual(decoded, keys)
Ejemplo n.º 2
0
 def assertCMSIdx(self, cms, fname, keys):
     """Check the keys of the index file `fname` against `keys`.

     Opens <cms.basedir>/idx/<fname> with CDBReader, decodes each key it
     yields by looking at the leading byte, and asserts (self.assertEqual)
     that the resulting list equals `keys`.

     Decoding rules, by leading byte:
       0x00        the whole key is unpacked with '>xll' -> (int, int)
       0xfd, 0xfe  key is ignored
       0xff        key is ignored only if it is exactly this one byte
       0x10..0x13  remainder is decoded as UTF-8
       0x20        every remaining byte b becomes code point 0x3000 + b
       0xf0        remainder of length 2/3/4 is unpacked as '>h', '>hb'
                   or '>hbb' and formatted NNNN, NNNN/NN or NNNN/NN/NN
                   (presumably a date; verify against the index writer)
       other       remainder is used as-is
     """
     idx_path = os.path.join(os.path.join(cms.basedir, 'idx'), fname)
     skipped_tags = ('\xfd', '\xfe')
     # Struct layout and output template for 0xf0 keys, by payload size.
     by_size = {
         2: ('>h', '%04d'),
         3: ('>hb', '%04d/%02d'),
         4: ('>hbb', '%04d/%02d/%02d'),
     }
     found = []
     for key in CDBReader(idx_path).iterkeys():
         lead = key[0]
         if lead == '\x00':
             found.append(struct.unpack('>xll', key))
         elif lead in skipped_tags or key == '\xff':
             continue
         else:
             rest = key[1:]
             if '\x10' <= lead <= '\x13':
                 value = unicode(rest, 'utf-8')
             elif lead == '\x20':
                 value = u''.join(unichr(0x3000 + ord(b)) for b in rest)
             elif lead == '\xf0' and len(rest) in by_size:
                 (packing, pattern) = by_size[len(rest)]
                 value = pattern % struct.unpack(packing, rest)
             else:
                 # Unknown tag, or a 0xf0 payload of unexpected length.
                 value = rest
             found.append(value)
     self.assertEqual(found, keys)
Ejemplo n.º 3
0
class WikiDBReader(object):
    """Read pages and their revisions out of a CDB file.

    Keys looked up by this class:
      '<pageid>:title'               title bytes, decoded with `codec`
      '<pageid>:revs'                revision ids separated by one space
      '<pageid>/<revid>:wiki<ext>'   value run through decompress() and
      '<pageid>/<revid>:text<ext>'   then decoded with `codec`

    Iterating over an instance yields page ids; indexing an instance with
    a page id returns the same thing as get_page().
    """

    def __init__(self, path, ext='', codec='utf-8'):
        """Open the CDB file at `path`.

        `ext` is appended to every wiki/text key; `codec` is the encoding
        used to decode stored values (undecodable bytes are ignored).
        """
        self._reader = CDBReader(path)
        (self.ext, self.codec) = (ext, codec)

    def __iter__(self):
        """Iterate over the ids of all pages (see get_pageids)."""
        return self.get_pageids()

    def __getitem__(self, pageid):
        """Return (title, revids) for `pageid` (see get_page)."""
        return self.get_page(pageid)

    def _get_data(self, key):
        """Fetch `key`, pass it through decompress() and decode the result."""
        payload = decompress(key, self._reader[key])
        return payload.decode(self.codec, 'ignore')

    def get_pageids(self):
        """Generate the integer page id of every key ending in ':title'."""
        for name in self._reader.iterkeys():
            if not name.endswith(':title'):
                continue
            # The page id is everything before the first colon.
            yield int(name.partition(':')[0])

    def get_page(self, pageid):
        """Return a (title, revids) tuple for `pageid`.

        `revids` is the stored ':revs' value split on single spaces.
        """
        store = self._reader
        raw_title = store['%s:title' % pageid]
        title = raw_title.decode(self.codec, 'ignore')
        revids = store['%s:revs' % pageid].split(' ')
        return (title, revids)

    def get_wiki(self, pageid, revid):
        """Return the decoded ':wiki' entry of revision `revid`."""
        return self._get_data(('%s/%s:wiki' % (pageid, revid)) + self.ext)

    def get_text(self, pageid, revid):
        """Return the decoded ':text' entry of revision `revid`."""
        return self._get_data(('%s/%s:text' % (pageid, revid)) + self.ext)
Ejemplo n.º 4
0
Archivo: mwcdb.py Proyecto: euske/pymwp
class WikiDBReader(object):
    """Accessor for wiki pages and revisions kept in a CDB file.

    The underlying reader is queried with these keys:
      "<pageid>:title"               title bytes, decoded with `codec`
      "<pageid>:revs"                revision ids separated by one space
      "<pageid>/<revid>:wiki<ext>"   value run through decompress() and
      "<pageid>/<revid>:text<ext>"   then decoded with `codec`

    An instance is iterable (yielding page ids) and subscriptable by page
    id (returning what get_page() returns).
    """

    def __init__(self, path, ext="", codec="utf-8"):
        """Open the CDB file at `path`.

        `ext` is the suffix added to wiki/text keys; `codec` is the
        encoding used when decoding stored values, with errors ignored.
        """
        self._reader = CDBReader(path)
        (self.ext, self.codec) = (ext, codec)

    def __iter__(self):
        """Iterate over every page id in the database."""
        return self.get_pageids()

    def __getitem__(self, pageid):
        """Look up the (title, revids) pair of `pageid`."""
        return self.get_page(pageid)

    def _get_data(self, key):
        """Read `key`, run the value through decompress() and decode it."""
        stored = self._reader[key]
        return decompress(key, stored).decode(self.codec, "ignore")

    def get_pageids(self):
        """Yield, as int, the page id of each key that ends in ":title"."""
        for entry in self._reader.iterkeys():
            if not entry.endswith(":title"):
                continue
            # Text before the first colon is the page id.
            (head, _, _) = entry.partition(":")
            yield int(head)

    def get_page(self, pageid):
        """Return (title, revids); revids is the ":revs" value split on " "."""
        title_bytes = self._reader["%s:title" % pageid]
        title = title_bytes.decode(self.codec, "ignore")
        revs_value = self._reader["%s:revs" % pageid]
        return (title, revs_value.split(" "))

    def get_wiki(self, pageid, revid):
        """Return the decoded ":wiki" entry for the given page revision."""
        return self._get_data(("%s/%s:wiki" % (pageid, revid)) + self.ext)

    def get_text(self, pageid, revid):
        """Return the decoded ":text" entry for the given page revision."""
        return self._get_data(("%s/%s:text" % (pageid, revid)) + self.ext)
Ejemplo n.º 5
0
class WikiDBReader(object):
    """Look up wiki pages and revisions stored in a CDB file.

    Key scheme used for lookups:
      '<pageid>:title'               title bytes, decoded with `codec`
      '<pageid>:revs'                revision ids separated by one space
      '<pageid>/<revid>:wiki<ext>'   value run through decompress() and
      '<pageid>/<revid>:text<ext>'   then decoded with `codec`

    Supports iteration (over page ids) and subscripting by page id.
    """

    def __init__(self, path, ext='', codec='utf-8'):
        """Open the CDB file at `path`; keep `ext` and `codec` for lookups."""
        self._reader = CDBReader(path)
        (self.ext, self.codec) = (ext, codec)

    def __iter__(self):
        """Return an iterator over all page ids."""
        return self.get_pageids()

    def __getitem__(self, pageid):
        """Return the (title, revids) tuple of `pageid`."""
        return self.get_page(pageid)

    def _get_data(self, key):
        """Return the entry at `key` after decompress() and decoding.

        Bytes that cannot be decoded with self.codec are dropped.
        """
        blob = decompress(key, self._reader[key])
        return blob.decode(self.codec, 'ignore')

    def get_pageids(self):
        """Yield an int page id for every key that ends in ':title'."""
        title_suffix = ':title'
        for dbkey in self._reader.iterkeys():
            if dbkey.endswith(title_suffix):
                # Only the part before the first colon is the id.
                yield int(dbkey.partition(':')[0])

    def get_page(self, pageid):
        """Return (title, revids) for `pageid`.

        The title is decoded with self.codec; revids is the list obtained
        by splitting the ':revs' value on single spaces.
        """
        lookup = self._reader
        title = lookup['%s:title' % pageid].decode(self.codec, 'ignore')
        revids = lookup['%s:revs' % pageid].split(' ')
        return (title, revids)

    def get_wiki(self, pageid, revid):
        """Return the decoded ':wiki' data of one revision of a page."""
        wiki_key = '%s/%s:wiki' % (pageid, revid)
        return self._get_data(wiki_key + self.ext)

    def get_text(self, pageid, revid):
        """Return the decoded ':text' data of one revision of a page."""
        text_key = '%s/%s:text' % (pageid, revid)
        return self._get_data(text_key + self.ext)
Ejemplo n.º 6
0
Archivo: mwcdb.py Proyecto: euske/pymwp
 def __init__(self, path, ext="", codec="utf-8"):
     """Open a CDBReader on `path` and store `ext` and `codec` unchanged.

     The reader is kept in self._reader; `ext` defaults to the empty
     string and `codec` to "utf-8".
     """
     self._reader = CDBReader(path)
     (self.ext, self.codec) = (ext, codec)
Ejemplo n.º 7
0
 def __init__(self, path, ext='', codec='utf-8'):
     """Create the reader for `path` and record the given options.

     self._reader holds the CDBReader built from `path`; `ext` and
     `codec` are saved on the instance exactly as passed.
     """
     self._reader = CDBReader(path)
     (self.ext, self.codec) = (ext, codec)
Ejemplo n.º 8
0
 def __init__(self, path, ext='', codec='utf-8'):
     """Set up the instance from a CDB file path.

     Builds a CDBReader for `path` (stored as self._reader) and keeps
     `ext` and `codec` as attributes of the same names.
     """
     self._reader = CDBReader(path)
     (self.ext, self.codec) = (ext, codec)