def flush(self, notice=None, force=False):
    """Index the documents that are still pending, then merge.

    Documents at locations ``index_lastloc()+1`` .. ``_last_unindexed_loc``
    (inclusive) are handed to an ``Indexer`` one by one, after which
    ``self.merge(force)`` is called and the pending marker is cleared.

    notice: optional callable; it receives the number of docs being
        indexed before indexing starts.
    force: when true, every location up to ``len(self)-1`` is treated as
        pending, and the flag is also forwarded to ``merge()``.
    """
    if force:
        self._last_unindexed_loc = len(self) - 1
    # Compare against None explicitly: location 0 is a legitimate pending
    # location, but a plain truthiness test would silently skip it.
    if self._last_unindexed_loc is not None:
        # Imported here (not at module level) and only when there is
        # actually something to index.
        from fooling.indexer import Indexer
        indexer = Indexer(self, verbose=self.verbose)
        # index_lastloc() may be empty when nothing has been indexed yet;
        # '-1' makes the range below start at location 0 in that case.
        prevloc = int(self.index_lastloc() or '-1')
        lastloc = int(self._last_unindexed_loc)
        # notice is a function that receives the number of docs being indexed.
        if notice:
            notice(lastloc - prevloc)
        for loc in xrange(prevloc + 1, lastloc + 1):
            indexer.index_doc(str(loc), indexyomi=config.INDEX_YOMI)
        indexer.finish()
        # NOTE(review): merge() is run only after an indexing pass here;
        # confirm that it is not expected on a flush with nothing pending.
        self.merge(force)
    self._last_unindexed_loc = None
    return
def index(argv):
    """Command-line entry point: index corpus locations into an index db.

    argv: full argument vector (argv[0] is the program name).

    Options:
      -v           increase verbosity
      -F           force: index every existing location (default)
      -U           update only: skip already-indexed locations whose
                   mtime is older than the index mtime
      -N           new document only: skip already-indexed locations
      -R           reset the index db first, then behave like -F
      -Y           use the 'yomi' index style instead of 'normal'
      -b basedir   base directory passed to the corpus
      -p prefix    index file prefix (must be 3 characters, default 'idx')
      -c type      corpus type name
      -t type      document type name
      -e encoding  encoding passed to the corpus
      -D maxdocs   max docs threshold passed to the Indexer
      -T maxterms  max terms threshold passed to the Indexer

    The first positional argument is the index directory; the remaining
    ones are the locations to index. When none are given, locations are
    read from stdin, one per line.

    Exits with status 2 (after printing usage) on bad options or when the
    index directory is missing.
    """
    import getopt
    import locale

    def usage():
        sys.stdout.write(('usage: %s [-v] [-F|-U|-N|-R] [-Y] [-b basedir] [-p prefix] [-c corpustype] [-t doctype] [-e encoding] [-D maxdocs] [-T maxterms] idxdir [file ...]' % argv[0]) + '\n')
        sys.exit(2)

    try:
        (opts, args) = getopt.getopt(argv[1:], 'vFURNYb:p:c:t:e:D:T:')
    except getopt.GetoptError:
        usage()

    # Mode numbers: 0 = force, 1 = update only, 2 = new document only,
    # 3 = reset.
    mode_flags = {'-F': 0, '-U': 1, '-N': 2, '-R': 3}

    verbose = 1
    mode = 0
    basedir = ''
    prefix = 'idx'
    corpustype = corpus.FilesystemCorpus
    doctype = document.PlainTextDocument
    encoding = locale.getpreferredencoding()
    maxdocs = 1000
    maxterms = 50000
    indexstyle = 'normal'
    for (k, v) in opts:
        # The flag is '-v' (as advertised in usage and accepted by getopt);
        # testing for '-d' here made the option a silent no-op.
        if k == '-v':
            verbose += 1
        elif k in mode_flags:
            mode = mode_flags[k]
        elif k == '-Y':
            indexstyle = 'yomi'
        elif k == '-b':
            basedir = v
        elif k == '-p':
            prefix = v
        elif k == '-c':
            corpustype = corpus.get_corpustype(v)
        elif k == '-t':
            doctype = document.get_doctype(v)
        elif k == '-e':
            encoding = v
        elif k == '-D':
            maxdocs = int(v)
        elif k == '-T':
            maxterms = int(v)
    if not args:
        usage()
    assert len(prefix) == 3, 'prefix must be exactly 3 characters: %r' % (prefix,)

    idxdir = args[0]
    cps = corpustype(basedir, doctype, encoding, indexstyle)
    cps.open()
    try:
        indexdb = IndexDB(idxdir, prefix)
        try:
            indexdb.create()
        except IndexDB.IndexDBError:
            # Deliberately ignored; presumably the index db already
            # exists -- TODO confirm against IndexDB.create().
            pass
        indexdb.open()
        if mode == 3:
            # A reset is followed by a full (forced) indexing pass.
            indexdb.reset()
            mode = 0
        indexer = Indexer(indexdb, cps, maxdocs, maxterms, verbose=verbose)
        sys.stderr.write(
            ('Index: basedir=%r, idxdir=%r, max_docs_threshold=%d, max_terms_threshold=%d ' %
             (basedir, idxdir, maxdocs, maxterms)) + '\n')
        files = args[1:]
        lastmod = indexdb.index_mtime()
        if not files:
            # No locations on the command line: read them from stdin.
            files = sys.stdin
        for fname in files:
            fname = fname.strip()
            if not cps.loc_exists(fname):
                continue
            if indexdb.loc_indexed(fname):
                # Already indexed: skip in "new only" mode, and in
                # "update only" mode unless it changed since the last index.
                if mode == 2 or (mode == 1 and cps.loc_mtime(fname) < lastmod):
                    continue
            indexer.index_loc(fname)
        indexer.finish()
    finally:
        # Close the corpus even when indexing fails part-way.
        cps.close()
    sys.stderr.write('Done.' + '\n')
    return