def merge(argv): import getopt def usage(): print 'usage: %s [-v] [-p prefix] [-D maxdocs] [-T maxterms] idxdir' % argv[0] sys.exit(2) try: (opts, args) = getopt.getopt(argv[1:], 'vp:D:T:') except getopt.GetoptError: usage() (verbose, prefix, max_docs_threshold, max_terms_threshold) = (1, 'idx', 2000, 50000) for (k, v) in opts: if k == '-v': verbose += 1 elif k == '-p': prefix = v elif k == '-D': max_docs_threshold = int(v) elif k == '-T': max_terms_threshold = int(v) if not args: usage() assert len(prefix) == 3 idxdir = args[0] indexdb = IndexDB(idxdir, prefix) indexdb.open() Merger(indexdb, max_docs_threshold, max_terms_threshold, verbose).run() return
def index(argv): import getopt, locale def usage(): print 'usage: %s [-v] [-F|-U|-N|-R] [-Y] [-b basedir] [-p prefix] [-c corpustype] [-t doctype] [-e encoding] [-D maxdocs] [-T maxterms] idxdir [file ...]' % argv[0] sys.exit(2) try: (opts, args) = getopt.getopt(argv[1:], 'vFURNYb:p:c:t:e:D:T:') except getopt.GetoptError: usage() verbose = 1 mode = 0 basedir = '' prefix = 'idx' corpustype = corpus.FilesystemCorpus doctype = document.PlainTextDocument encoding = locale.getpreferredencoding() maxdocs = 1000 maxterms = 50000 indexstyle = 'normal' for (k, v) in opts: if k == '-d': verbose += 1 elif k == '-F': mode = 0 # force elif k == '-U': mode = 1 # update only elif k == '-N': mode = 2 # new document only elif k == '-R': mode = 3 # reset elif k == '-Y': indexstyle = 'yomi' elif k == '-b': basedir = v elif k == '-p': prefix = v elif k == '-c': corpustype = corpus.get_corpustype(v) elif k == '-t': doctype = document.get_doctype(v) elif k == '-e': encoding = v elif k == '-D': maxdocs = int(v) elif k == '-T': maxterms = int(v) if not args: usage() assert len(prefix) == 3 idxdir = args[0] cps = corpustype(basedir, doctype, encoding, indexstyle) cps.open() indexdb = IndexDB(idxdir, prefix) try: indexdb.create() except IndexDB.IndexDBError: pass indexdb.open() if mode == 3: indexdb.reset() mode = 0 indexer = Indexer(indexdb, cps, maxdocs, maxterms, verbose=verbose) print >>sys.stderr, \ 'Index: basedir=%r, idxdir=%r, max_docs_threshold=%d, max_terms_threshold=%d ' % \ (basedir, idxdir, maxdocs, maxterms) files = args[1:] lastmod = indexdb.index_mtime() if not files: files = sys.stdin for fname in files: fname = fname.strip() if not cps.loc_exists(fname): continue if indexdb.loc_indexed(fname): if mode == 2 or ((mode == 1) and cps.loc_mtime(fname) < lastmod): continue indexer.index_loc(fname) indexer.finish() cps.close() print >>sys.stderr, 'Done.' return
def search(argv): import getopt, locale, time def usage(): print ('usage: %s [-d] [-T timeout] [-s|-Y] [-D] [-a] ' '[-c savefile] [-b basedir] [-p prefix] [-t doctype] ' '[-e encoding] [-n results] idxdir [keyword ...]') % argv[0] sys.exit(2) try: (opts, args) = getopt.getopt(argv[1:], 'dT:sYDac:b:p:t:e:n:') except getopt.GetoptError: usage() debug = 0 timeout = 0 stat = False disjunctive = False savefile = '' basedir = '' prefix = '' doctype = document.PlainTextDocument predtype = KeywordPredicate encoding = locale.getpreferredencoding() n = 10 for (k, v) in opts: if k == '-d': debug += 1 elif k == '-T': timeout = int(v) elif k == '-D': disjunctive = True elif k == '-a': stat = True elif k == '-Y': predtype = YomiKeywordPredicate elif k == '-s': predtype = StrictKeywordPredicate elif k == '-c': savefile = v elif k == '-b': basedir = v elif k == '-p': prefix = v elif k == '-t': doctype = document.get_doctype(v) elif k == '-e': encoding = v elif k == '-n': n = int(v) if doctype == document.EMailDocument: predtype = EMailPredicate t0 = time.time() if args: idxdir = args[0] keywords = args[1:] indexdb = IndexDB(idxdir, prefix) indexdb.open() preds = [ predtype(unicode(kw, encoding)) for kw in keywords ] selection = Selection(indexdb, preds, disjunctive=disjunctive) selection.set_timeout(timeout) try: show_results(selection, n, encoding) except SearchTimeout: print 'SearchTimeout.' elif savefile: selection = load_selection(savefile) selection.set_timeout(timeout) try: show_results(selection, n, encoding) except SearchTimeout: print 'SearchTimeout.' else: usage() if savefile: save_selection(savefile, selection) if stat: print '%.2f sec, %d/%d hit' % (time.time()-t0, len(selection.found_docs), selection.narrowed) return