def __init__(self, dirname, verbose=False):
    """Set up the corpus state for the directory `dirname`.

    Builds a TarDB handle for `<dirname>/tar`, a LabelDB handle for
    `<dirname>/label`, and initializes the Corpus base class with
    `<dirname>/idx` and the prefix 'idx'.  `mode` and the pending
    unindexed location both start out as None.
    """
    # Resolve the three sub-paths first; the handles are built below in
    # the same order as before (tar, label, then the Corpus base).
    tar_dir = os.path.join(dirname, 'tar')
    label_dir = os.path.join(dirname, 'label')
    index_dir = os.path.join(dirname, 'idx')

    self.dirname = dirname
    self.verbose = verbose
    self.mode = None
    self._last_unindexed_loc = None

    self._db = TarDB(tar_dir)
    self._labeldb = LabelDB(label_dir)
    Corpus.__init__(self, index_dir, 'idx')
def create(dirname):
    """Create the directory layout of a corpus beneath `dirname`.

    Makes the 'tar', 'idx' and 'label' subdirectories (in that order) with
    os.mkdir, then calls TarDB.create on the 'tar' subdirectory.
    """
    tar_dir = os.path.join(dirname, 'tar')
    idx_dir = os.path.join(dirname, 'idx')
    label_dir = os.path.join(dirname, 'label')
    for subdir in (tar_dir, idx_dir, label_dir):
        os.mkdir(subdir)
    TarDB.create(tar_dir)
def __init__(self, basedir):
    """Remember `basedir` and build the two backing stores beneath it.

    A TarDB is constructed for `<basedir>/tar` and a TextDB for
    `<basedir>/text`.
    """
    self.basedir = basedir
    tar_path = os.path.join(basedir, 'tar')
    self._tar = TarDB(tar_path)
    text_path = os.path.join(basedir, 'text')
    self._text = TextDB(text_path)
class MailCorpus(Corpus):
    """Corpus of mail messages kept in a TarDB, with labels in a LabelDB.

    Each message is stored gzip-compressed as one TarDB record.  The
    record's name has the form '%08x.<labels>': the record number in hex
    followed by the sorted label characters.  Labels are also written to
    the LabelDB.  A location ("loc") is a record number, accepted in any
    form that int() can convert.
    """

    class MailCorpusError(Exception):
        """Base class for errors raised by MailCorpus."""
        pass

    class DatabaseLocked(MailCorpusError):
        """Raised by open() when the TarDB reports a lock error."""
        pass

    # Values passed to Merger as max_docs_threshold by merge().
    SMALL_MERGE = 20
    LARGE_MERGE = 2000

    # Callable installed by register_singleton_handler(); _get_singleton()
    # calls it with this corpus' dirname.
    singleton_handler = None

    @classmethod
    def register_singleton_handler(klass, handler):
        """Install `handler` as the class-wide singleton handler."""
        klass.singleton_handler = handler
        return

    def _get_singleton(self):
        """Return whatever the registered handler yields for self.dirname."""
        return MailCorpus.singleton_handler(self.dirname)

    def __getstate__(self):
        """Return the Corpus state minus the per-process database fields."""
        odict = Corpus.__getstate__(self)
        # These odict values are never treated seriously.
        del odict['mode']
        del odict['_db']
        del odict['_labeldb']
        del odict['_last_unindexed_loc']
        return odict

    def __init__(self, dirname, verbose=False):
        """Set up the corpus state for the directory `dirname`.

        Builds the TarDB ('tar') and LabelDB ('label') handles and
        initializes the Corpus base class with the 'idx' subdirectory.
        """
        self.verbose = verbose
        self.dirname = dirname
        self.mode = None
        self._last_unindexed_loc = None
        self._db = TarDB(os.path.join(dirname, 'tar'))
        self._labeldb = LabelDB(os.path.join(dirname, 'label'))
        Corpus.__init__(self, os.path.join(dirname, 'idx'), 'idx')
        return

    def __len__(self):
        """Return the number of records in the TarDB."""
        return len(self._db)

    def __repr__(self):
        return '<MailCorpus: dirname=%r, db=%r, last_unindexed_loc=%r>' % \
               (self.dirname, self._db, self._last_unindexed_loc)

    @staticmethod
    def create(dirname):
        """Create the 'tar', 'idx' and 'label' subdirectories and the TarDB."""
        os.mkdir(os.path.join(dirname, 'tar'))
        os.mkdir(os.path.join(dirname, 'idx'))
        os.mkdir(os.path.join(dirname, 'label'))
        TarDB.create(os.path.join(dirname, 'tar'))
        return

    def set_writable(self):
        """Make sure the corpus is open in 'r+' mode.

        If it is currently open read-only it is closed first.  When the
        database is locked, the corpus is reopened in 'r' mode and
        DatabaseLocked is re-raised.
        """
        if self.mode == 'r+':
            return
        if self.mode == 'r':
            self.close()
        try:
            self.open('r+')
        except MailCorpus.DatabaseLocked:
            # Fall back to read-only so the corpus stays usable, then let
            # the caller know the upgrade failed.
            self.open('r')
            raise
        return

    def get_labeldb(self):
        """Return the LabelDB handle."""
        return self._labeldb

    def open(self, mode='r'):
        """Open the TarDB in `mode` and record the mode.

        Raises:
            MailCorpus.DatabaseLocked: if the TarDB raises TarDB.LockError.
        """
        try:
            self._db.open(mode)
        except TarDB.LockError:
            raise MailCorpus.DatabaseLocked('Database locked.')
        self._last_unindexed_loc = None
        self.mode = mode
        return

    def merge(self, large=False):
        """Run the index Merger with the small or the large threshold."""
        from fooling.merger import Merger
        docs_threshold = self.SMALL_MERGE
        if large:
            docs_threshold = self.LARGE_MERGE
        Merger(self, max_docs_threshold=docs_threshold).run(True)
        return

    def flush(self, notice=None, force=False):
        """Index every message added since the last indexed location.

        Args:
            notice: optional callable that receives the number of docs
                being indexed.
            force: when true, index up to the last record of the TarDB
                even if no message was added through add_message(), and
                merge with the large threshold.
        """
        from fooling.indexer import Indexer
        if force:
            self._last_unindexed_loc = len(self)-1
        # Test against None rather than truthiness: a forced flush on a
        # corpus with exactly one message sets the location to integer 0,
        # which is falsy and would otherwise skip indexing altogether.
        if self._last_unindexed_loc is not None:
            indexer = Indexer(self, verbose=self.verbose)
            prevloc = int(self.index_lastloc() or '-1')
            lastloc = int(self._last_unindexed_loc)
            # notice is a function that receives the number of docs being
            # indexed.
            if notice:
                notice(lastloc - prevloc)
            for i in xrange(prevloc+1, lastloc+1):
                indexer.index_doc(str(i), indexyomi=config.INDEX_YOMI)
            indexer.finish()
            self.merge(force)
            self._last_unindexed_loc = None
        return

    def close(self, notice=None):
        """Flush pending messages, then close the TarDB and the LabelDB."""
        self.flush(notice)
        self.mode = None
        self._db.close()
        self._labeldb.close()
        return

    def get_message(self, loc):
        """Return the decompressed data of the message at `loc`."""
        (_, data) = self._db.get_record(int(loc))
        fp = gzip.GzipFile(fileobj=StringIO.StringIO(data))
        try:
            return fp.read()
        finally:
            # Release the gzip reader even when decompression fails.
            fp.close()

    def add_message(self, data, labels, mtime=0):
        """Store `data` gzip-compressed as a new record carrying `labels`.

        Args:
            data: message contents to compress and store.
            labels: iterable of label characters (see _labels2name).
            mtime: modification time for the record; the current time is
                used when it is 0 or otherwise false.

        Returns:
            The new record number as a string; it is also remembered as
            the location that flush() must index up to.
        """
        import time
        info = TarInfo(self._labels2name(len(self._db), labels))
        info.mtime = mtime or int(time.time())
        fp = StringIO.StringIO()
        gz = gzip.GzipFile(mode='w', fileobj=fp)
        gz.write(data)
        gz.close()
        recno = self._db.add_record(info, fp.getvalue())
        self._labeldb.add_label(recno, labels)
        self._last_unindexed_loc = str(recno)
        return self._last_unindexed_loc

    # Internal routine to access TarDB.
    def _labels2name(self, recno, labels):
        """Build the record name '%08x.<sorted labels>' for `recno`.

        Raises:
            AssertionError: if the joined labels are non-empty and not
                alphanumeric.
        """
        labels = ''.join(sorted(labels))
        if labels and not labels.isalnum():
            raise AssertionError('Invalid labels: %r' % labels)
        return '%08x.%s' % (recno, labels)

    # Record names: eight lowercase hex digits, a dot, then the labels.
    FILENAME_PAT = re.compile(r'[0-9a-f]{8}\.(.*)')

    def _name2labels(self, name):
        """Return the set of label characters encoded in a record name.

        Raises:
            AssertionError: if `name` does not match FILENAME_PAT.
        """
        m = self.FILENAME_PAT.match(name)
        if not m:
            raise AssertionError('Invalid file name: %r' % name)
        return set(m.group(1))

    def get_message_labels(self, loc):
        """Return the set of label characters of the message at `loc`."""
        info = self._db.get_info(int(loc))
        return self._name2labels(info.name)

    def add_message_label(self, loc, labels):
        """Add `labels` to the message at `loc` in both databases."""
        recno = int(loc)
        info = self._db.get_info(recno)
        labels1 = self._name2labels(info.name).union(set(labels))
        info.name = self._labels2name(recno, labels1)
        self._db.set_info(recno, info)
        self._labeldb.add_label(recno, labels)
        return

    def del_message_label(self, loc, labels):
        """Remove `labels` from the message at `loc` in both databases."""
        recno = int(loc)
        info = self._db.get_info(recno)
        labels1 = self._name2labels(info.name).difference(set(labels))
        info.name = self._labels2name(recno, labels1)
        self._db.set_info(recno, info)
        self._labeldb.del_label(recno, labels)
        return

    def mark_deleted(self, loc):
        """Attach the config.LABEL4DELETED label to the message at `loc`."""
        self.add_message_label(loc, config.LABEL4DELETED)
        return

    # Corpus methods
    def loc_exists(self, loc):
        """Return True when `loc` is a valid record number of the TarDB."""
        recno = int(loc)
        return 0 <= recno < len(self._db)

    def loc_fp(self, loc):
        """Return a file-like object over the decompressed message."""
        return StringIO.StringIO(self.get_message(loc))

    def loc_mtime(self, loc):
        """Return the mtime recorded for the message at `loc`."""
        info = self._db.get_info(int(loc))
        return info.mtime

    def loc_size(self, loc):
        """Return the length of the decompressed message at `loc`."""
        return len(self.get_message(loc))

    def get_doc(self, loc):
        """Return an EMailDocumentWithLabel for the message at `loc`."""
        info = self._db.get_info(int(loc))
        return EMailDocumentWithLabel(self, loc, info.mtime)
class MessageDB:
    """Message store backed by a TarDB ('tar') and a TextDB ('text').

    add_file() writes the message data to the TarDB and its extracted
    text and tags to the TextDB under the same record number.  The search
    methods return the stored text of records matching every given term.
    """

    # Size limit passed to cutoff() for the text stored per message.
    MAX_TEXT_SIZE = 100000

    def __init__(self, basedir):
        """Remember `basedir` and build the two backing stores beneath it."""
        self.basedir = basedir
        self._tar = TarDB(os.path.join(basedir, 'tar'))
        self._text = TextDB(os.path.join(basedir, 'text'))
        return

    def create(self):
        """Create `basedir` (with os.makedirs) and both backing stores."""
        os.makedirs(self.basedir)
        self._tar.create()
        self._text.create()
        return

    def open(self):
        """Open both backing stores."""
        self._tar.open()
        self._text.open()
        return

    def close(self):
        """Close both backing stores."""
        self._tar.close()
        self._text.close()
        return

    def flush(self):
        """Flush both backing stores."""
        self._tar.flush()
        self._text.flush()
        return

    def add_file(self, data):
        """Store one message and index its text and tags.

        Args:
            data: the message as bytes (it is parsed with
                message_from_bytes).

        Returns:
            The record number under which the message was stored.
        """
        recno = self._tar.next_recno()
        info = TarInfo('%08d' % recno)
        # NOTE(review): bytes2gzip presumably gzip-compresses the payload;
        # confirm against its definition.
        self._tar.add_record(info, bytes2gzip(data))
        msg = message_from_bytes(data)
        text = cutoff(msg2str(msg), self.MAX_TEXT_SIZE)
        self._text.add_text(recno, text)
        for tag in msg2tags(msg):
            self._text.add_tag(recno, tag)
        return recno

    def _matching_recnos(self, lookup, terms):
        """Return the record numbers matched by every term, newest first.

        `lookup` is called once per term and must return an iterable of
        record numbers.  An empty `terms` gives an empty list.
        """
        matched = None
        for term in terms:
            recs = set(lookup(term))
            if matched is None:
                matched = recs
            else:
                # set has no update_intersection(); intersection_update()
                # is the in-place intersection.
                matched.intersection_update(recs)
        return sorted(matched or (), reverse=True)

    def search_tag(self, tags):
        """Yield the text of each record carrying all of `tags`.

        Records are yielded in descending record-number order.  Nothing
        is yielded when `tags` is empty.
        """
        for recno in self._matching_recnos(self._text.search_tag, tags):
            yield self._text.get_text(recno)

    def search_text(self, qs):
        """Yield the text of each record matching all queries in `qs`.

        Records are yielded in descending record-number order.  Nothing
        is yielded when `qs` is empty.
        """
        for recno in self._matching_recnos(self._text.search_text, qs):
            yield self._text.get_text(recno)