Beispiel #1
0
def main():
  """Command-line entry point: dispatch on the first command-line argument.

  Supported commands are ``help``, ``list``, ``index``, ``stats`` and
  ``categories``. A missing command, an unknown command, or ``categories``
  without a language logs an error and exits with status -1.
  """
  logging.basicConfig(level=logging.DEBUG)
  if len(sys.argv) < 2:
    logger.error("Must specify a command! Try 'help'")
    sys.exit(-1)

  command = sys.argv[1]

  if command == 'help':
    # Display a help message covering every command handled below.
    logger.info("The following commands are supported:")
    logger.info("    list              : list available dumps")
    logger.info("    index             : builds indexes - warning: can take many hours")
    logger.info("    stats             : print statistics in a comma-delimited CSV format")
    logger.info("    categories [lang] : list categories for 'lang', and the number of documents in each")

  elif command == 'list':
    # List available dumps
    dumps = find_dumps()
    # TODO: Print if an index exists
    for key in sorted(dumps):
      # Single-argument parenthesised print behaves the same on Python 2 and 3.
      print("  %-20s%s" % (key, dumps[key]))

  elif command == 'index':
    # Build indices
    load_dumps(build_index=True)

  elif command == 'stats':
    # Silence everything below ERROR so log output does not mix with the CSV
    # rows on stdout. setLevel() is the supported way to change the level.
    logging.getLogger().setLevel(logging.ERROR)

    # Open each dump exactly once and keep only the numbers that are reported.
    paths = find_dumps()
    rows = []
    for lang, path in paths.items():
      dump = Dump(path)
      rows.append(dict(
        lang=lang,
        filename=os.path.basename(path),
        pages=len(dump),
        categories=len(dump.categories),
      ))

    # Largest dumps first.
    rows.sort(key=lambda row: row['pages'], reverse=True)

    fields = ['lang', 'filename', 'pages', 'categories']
    outfile = csv.DictWriter(sys.stdout, fields)
    for row in rows:
      outfile.writerow(row)

  elif command == 'categories':
    # Dump category distribution
    parser = optparse.OptionParser()
    parser.add_option("-l", "--language", dest="lang", help="Relevant language prefix")
    options, args = parser.parse_args(sys.argv[2:])

    # Accept the language either via -l/--language or as the first positional
    # argument, which is the form the help message documents.
    lang = options.lang
    if lang is None and args:
      lang = args[0]
    if lang is None:
      logger.error("Must specify a language for 'categories'")
      sys.exit(-1)

    dump = load_dumps([lang], build_index=True)[lang]
    cats = dump.categories

    # Most populated categories first.
    for c in sorted(cats, key=lambda x: len(cats[x]), reverse=True):
      print("%-4d %s" % (len(cats[c]), c))

  else:
    # Report through the same logger as the other branches and signal failure
    # to the calling shell, matching the missing-command case above.
    logger.error("Unknown command: %s", command)
    logger.info("Try the 'help' command.")
    sys.exit(-1)
Beispiel #2
0
def lang_dist(langs):
  """Return the distribution of documents over the given set of languages.

  Loads the dumps for ``langs`` via ``utils.load_dumps`` and returns a dict
  mapping each dump's ``get_dumpfile_prefix()`` to its ``metadata['size']``.
  """
  distribution = {}
  for dump in utils.load_dumps(langs).values():
    prefix = dump.get_dumpfile_prefix()
    distribution[prefix] = dump.metadata['size']
  return distribution
Beispiel #3
0
def lang_dist(langs):
  """Build the document distribution for a given set of languages.

  The result maps the dumpfile prefix of every dump loaded for ``langs`` to
  the ``'size'`` entry of that dump's metadata.
  """
  loaded = utils.load_dumps(langs)
  # Collect (prefix, size) pairs first, then turn them into the mapping.
  pairs = [
    (entry.get_dumpfile_prefix(), entry.metadata['size'])
    for entry in loaded.values()
  ]
  return dict(pairs)
Beispiel #4
0
def c_categories(args):
    """Print the categories of one language's dump, most populated first.

    The language is read from ``args.language``, falling back to ``args.lang``
    when that attribute is absent or None. Each output line holds the number
    of entries in a category followed by the category itself.
    """
    # The original loaded with args.language but indexed the result with
    # args.lang; resolve the language once so both uses always agree.
    lang = getattr(args, 'language', None)
    if lang is None:
        lang = getattr(args, 'lang', None)

    dump = load_dumps([lang], build_index=True)[lang]
    cats = dump.categories

    for c in sorted(cats, key=lambda x: len(cats[x]), reverse=True):
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print("%-4d %s" % (len(cats[c]), c))
Beispiel #5
0
def c_categories(args):
    """List categories for the requested language with their entry counts.

    Uses ``args.language`` when it is set and ``args.lang`` otherwise, loads
    that language's dump with ``build_index=True``, and prints one line per
    category in descending order of category size.
    """
    # Loading and indexing must use the same key; the original mixed
    # args.language and args.lang, which fails when only one of them exists.
    lang = getattr(args, 'language', None)
    if lang is None:
        lang = getattr(args, 'lang', None)

    cats = load_dumps([lang], build_index=True)[lang].categories

    ordered = sorted(cats, key=lambda name: len(cats[name]), reverse=True)
    for name in ordered:
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print("%-4d %s" % (len(cats[name]), name))