Ejemplo n.º 1
0
def merge(argv):
  """Command-line entry point that runs a Merger over an index database.

  Usage: prog [-v] [-p prefix] [-D maxdocs] [-T maxterms] idxdir

  Options:
    -v           increase verbosity (starts at 1; may be repeated)
    -p prefix    prefix passed to IndexDB; must be exactly 3 characters
                 (default 'idx')
    -D maxdocs   integer passed to Merger as its max-docs threshold
                 (default 2000)
    -T maxterms  integer passed to Merger as its max-terms threshold
                 (default 50000)

  argv is the full argument vector with the program name in argv[0].

  Prints the usage line and exits with status 2 (SystemExit) on an unknown
  option, a non-integer -D/-T value, a prefix of the wrong length, or a
  missing idxdir argument.
  """
  import getopt
  def usage():
    print('usage: %s [-v] [-p prefix] [-D maxdocs] [-T maxterms] idxdir' % argv[0])
    sys.exit(2)
  def intarg(opt, value):
    # Report a malformed number as a usage error instead of a traceback.
    try:
      return int(value)
    except ValueError:
      sys.stderr.write('%s: option %s requires an integer, got %r\n' %
                       (argv[0], opt, value))
      usage()
  try:
    (opts, args) = getopt.getopt(argv[1:], 'vp:D:T:')
  except getopt.GetoptError:
    usage()
  (verbose, prefix, max_docs_threshold, max_terms_threshold) = (1, 'idx', 2000, 50000)
  for (k, v) in opts:
    if k == '-v': verbose += 1
    elif k == '-p': prefix = v
    elif k == '-D': max_docs_threshold = intarg(k, v)
    elif k == '-T': max_terms_threshold = intarg(k, v)
  if not args: usage()
  if len(prefix) != 3:
    # Explicit check rather than assert: asserts vanish under "python -O",
    # which would let a bad prefix through to IndexDB.
    sys.stderr.write('%s: prefix must be exactly 3 characters, got %r\n' %
                     (argv[0], prefix))
    usage()
  idxdir = args[0]
  indexdb = IndexDB(idxdir, prefix)
  indexdb.open()
  Merger(indexdb, max_docs_threshold, max_terms_threshold, verbose).run()
  return
Ejemplo n.º 2
0
def index(argv):
  """Command-line entry point that indexes corpus locations into an IndexDB.

  Usage: prog [-v] [-F|-U|-N|-R] [-Y] [-b basedir] [-p prefix]
              [-c corpustype] [-t doctype] [-e encoding]
              [-D maxdocs] [-T maxterms] idxdir [file ...]

  Options:
    -v             increase verbosity (starts at 1; may be repeated)
    -F             force: index every existing location (default)
    -U             update only: skip already-indexed locations whose corpus
                   mtime is older than the index mtime
    -N             new documents only: skip already-indexed locations
    -R             reset: call indexdb.reset(), then behave like -F
    -Y             use the 'yomi' index style instead of 'normal'
    -b basedir     base directory passed to the corpus (default '')
    -p prefix      prefix passed to IndexDB; must be exactly 3 characters
                   (default 'idx')
    -c corpustype  corpus type name, resolved by corpus.get_corpustype
                   (default corpus.FilesystemCorpus)
    -t doctype     document type name, resolved by document.get_doctype
                   (default document.PlainTextDocument)
    -e encoding    encoding passed to the corpus (default: the locale's
                   preferred encoding)
    -D maxdocs     integer passed to Indexer (default 1000)
    -T maxterms    integer passed to Indexer (default 50000)

  argv is the full argument vector with the program name in argv[0].
  Locations to index are taken from the arguments after idxdir; when there
  are none they are read from sys.stdin, one per line.  Locations for which
  the corpus reports loc_exists() false are skipped.

  Prints the usage line and exits with status 2 (SystemExit) on an unknown
  option, a non-integer -D/-T value, a prefix of the wrong length, or a
  missing idxdir argument.
  """
  import getopt, locale
  (MODE_FORCE, MODE_UPDATE, MODE_NEW, MODE_RESET) = (0, 1, 2, 3)
  def usage():
    print('usage: %s [-v] [-F|-U|-N|-R] [-Y] [-b basedir] [-p prefix] [-c corpustype] [-t doctype] [-e encoding] [-D maxdocs] [-T maxterms] idxdir [file ...]' % argv[0])
    sys.exit(2)
  def intarg(opt, value):
    # Report a malformed number as a usage error instead of a traceback.
    try:
      return int(value)
    except ValueError:
      sys.stderr.write('%s: option %s requires an integer, got %r\n' %
                       (argv[0], opt, value))
      usage()
  try:
    (opts, args) = getopt.getopt(argv[1:], 'vFURNYb:p:c:t:e:D:T:')
  except getopt.GetoptError:
    usage()
  verbose = 1
  mode = MODE_FORCE
  basedir = ''
  prefix = 'idx'
  corpustype = corpus.FilesystemCorpus
  doctype = document.PlainTextDocument
  encoding = locale.getpreferredencoding()
  maxdocs = 1000
  maxterms = 50000
  indexstyle = 'normal'
  for (k, v) in opts:
    # '-v' matches the getopt spec and the usage text; the previous test for
    # '-d' could never succeed, so verbosity could not be raised.
    if k == '-v': verbose += 1
    elif k == '-F': mode = MODE_FORCE
    elif k == '-U': mode = MODE_UPDATE
    elif k == '-N': mode = MODE_NEW
    elif k == '-R': mode = MODE_RESET
    elif k == '-Y': indexstyle = 'yomi'
    elif k == '-b': basedir = v
    elif k == '-p': prefix = v
    elif k == '-c': corpustype = corpus.get_corpustype(v)
    elif k == '-t': doctype = document.get_doctype(v)
    elif k == '-e': encoding = v
    elif k == '-D': maxdocs = intarg(k, v)
    elif k == '-T': maxterms = intarg(k, v)
  if not args: usage()
  if len(prefix) != 3:
    # Explicit check rather than assert: asserts vanish under "python -O".
    sys.stderr.write('%s: prefix must be exactly 3 characters, got %r\n' %
                     (argv[0], prefix))
    usage()
  idxdir = args[0]
  cps = corpustype(basedir, doctype, encoding, indexstyle)
  cps.open()
  try:
    indexdb = IndexDB(idxdir, prefix)
    try:
      indexdb.create()
    except IndexDB.IndexDBError:
      # Deliberately ignored -- presumably raised when the index already
      # exists; confirm against IndexDB.create().
      pass
    indexdb.open()
    if mode == MODE_RESET:
      indexdb.reset()
      mode = MODE_FORCE
    indexer = Indexer(indexdb, cps, maxdocs, maxterms, verbose=verbose)
    sys.stderr.write(
      'Index: basedir=%r, idxdir=%r, max_docs_threshold=%d, max_terms_threshold=%d ' %
      (basedir, idxdir, maxdocs, maxterms) + '\n')

    files = args[1:]
    lastmod = indexdb.index_mtime()
    if not files:
      # No locations on the command line: read them from standard input.
      files = sys.stdin
    for fname in files:
      fname = fname.strip()
      if not cps.loc_exists(fname): continue
      if indexdb.loc_indexed(fname):
        if mode == MODE_NEW: continue
        if mode == MODE_UPDATE and cps.loc_mtime(fname) < lastmod: continue
      indexer.index_loc(fname)

    indexer.finish()
  finally:
    # Close the corpus even when indexing fails part-way through.
    cps.close()
  sys.stderr.write('Done.\n')
  return
Ejemplo n.º 3
0
def search(argv):
  """Command-line entry point that runs a keyword search and shows results.

  Usage: prog [-d] [-T timeout] [-s|-Y] [-D] [-a] [-c savefile] [-b basedir]
              [-p prefix] [-t doctype] [-e encoding] [-n results]
              idxdir [keyword ...]

  Options:
    -d           increase the debug counter
    -T timeout   integer passed to selection.set_timeout (default 0)
    -s           build predicates with StrictKeywordPredicate
    -Y           build predicates with YomiKeywordPredicate
    -D           create the Selection with disjunctive=True
    -a           print elapsed time and hit counts after the search
    -c savefile  file the selection is saved to afterwards; when no
                 positional arguments are given, the selection is loaded
                 from it instead of being built from keywords
    -b basedir   accepted and stored
    -p prefix    prefix passed to IndexDB (default '')
    -t doctype   document type name, resolved by document.get_doctype;
                 document.EMailDocument forces EMailPredicate
    -e encoding  encoding used to decode the keywords and passed to
                 show_results (default: the locale's preferred encoding)
    -n results   integer passed to show_results (default 10)

  argv is the full argument vector with the program name in argv[0].
  Prints the usage line and exits with status 2 on an unknown option or
  when neither positional arguments nor a savefile are supplied.
  """
  import getopt, locale, time

  def usage():
    usage_text = ('usage: %s [-d] [-T timeout] [-s|-Y] [-D] [-a] '
                  '[-c savefile] [-b basedir] [-p prefix] [-t doctype] '
                  '[-e encoding] [-n results] idxdir [keyword ...]')
    print(usage_text % argv[0])
    sys.exit(2)

  try:
    (opts, args) = getopt.getopt(argv[1:], 'dT:sYDac:b:p:t:e:n:')
  except getopt.GetoptError:
    usage()

  # Defaults, overridden by the command-line options below.
  debug_level = 0
  timeout = 0
  show_stats = False
  disjunctive = False
  savefile = ''
  basedir = ''
  prefix = ''
  doctype = document.PlainTextDocument
  predtype = KeywordPredicate
  encoding = locale.getpreferredencoding()
  max_results = 10

  for (flag, value) in opts:
    if flag == '-d':
      debug_level += 1
    elif flag == '-T':
      timeout = int(value)
    elif flag == '-D':
      disjunctive = True
    elif flag == '-a':
      show_stats = True
    elif flag == '-Y':
      predtype = YomiKeywordPredicate
    elif flag == '-s':
      predtype = StrictKeywordPredicate
    elif flag == '-c':
      savefile = value
    elif flag == '-b':
      basedir = value
    elif flag == '-p':
      prefix = value
    elif flag == '-t':
      doctype = document.get_doctype(value)
    elif flag == '-e':
      encoding = value
    elif flag == '-n':
      max_results = int(value)

  # E-mail documents always use their own predicate type, whatever -s/-Y say.
  if doctype == document.EMailDocument:
    predtype = EMailPredicate

  started_at = time.time()

  # Obtain the selection: a fresh one from keywords, or a saved one.
  if args:
    idxdir = args[0]
    indexdb = IndexDB(idxdir, prefix)
    indexdb.open()
    predicates = []
    for word in args[1:]:
      predicates.append(predtype(unicode(word, encoding)))
    selection = Selection(indexdb, predicates, disjunctive=disjunctive)
  elif savefile:
    selection = load_selection(savefile)
  else:
    usage()

  # Both sources share the same timeout/display handling.
  selection.set_timeout(timeout)
  try:
    show_results(selection, max_results, encoding)
  except SearchTimeout:
    print('SearchTimeout.')

  if savefile:
    save_selection(savefile, selection)

  if show_stats:
    elapsed = time.time() - started_at
    print('%.2f sec, %d/%d hit' % (elapsed, len(selection.found_docs), selection.narrowed))
  return