Esempio n. 1
0
def overall_collection_stats():
    """Print document-length statistics for the whole collection.

    Two groups of figures are written to stdout:

    * mean, standard deviation and median over the lengths of every
      document from every prefix in ``all_prefixes``, pooled together;
    * the averages, across prefixes, of the per-prefix mean, standard
      deviation and median returned by ``length_stats``.
    """
    doc_lengths = []
    for prefix in all_prefixes:
        # extend() accepts any iterable, so this works whether .values()
        # yields a list or a view object.
        doc_lengths.extend(raw_doc_lengths(prefix).values())

    # Single-argument print(...) emits the same text whether it is parsed
    # as a Python 2 print statement or a Python 3 function call.
    print("All documents length mean: %.2f" % n.mean(doc_lengths))
    print("All documents length std: %.2f" % n.std(doc_lengths))
    print("All documents length median: %.2f" % n.median(doc_lengths))

    # length_stats yields (median, mean, std) per prefix; zip(*...)
    # transposes those triples into three per-statistic sequences.
    per_prefix = [length_stats(prefix) for prefix in all_prefixes]
    medlist, meanlist, stdlist = zip(*per_prefix)
    print("Mean of means: %.2f" % n.mean(meanlist))
    print("Mean of std: %.2f" % n.mean(stdlist))
    print("Mean of median: %.2f" % n.mean(medlist))
Esempio n. 2
0
def overall_collection_stats():
  """Print document-length statistics for the whole collection.

  First the mean, standard deviation and median over the pooled lengths
  of all documents from every prefix in ``all_prefixes`` are printed,
  then the across-prefix averages of the per-prefix mean, standard
  deviation and median returned by ``length_stats``.
  """
  doc_lengths = []
  for prefix in all_prefixes:
    # extend() accepts any iterable, so this works whether .values()
    # yields a list or a view object.
    doc_lengths.extend(raw_doc_lengths(prefix).values())

  # Single-argument print(...) emits the same text whether it is parsed
  # as a Python 2 print statement or a Python 3 function call.
  print("All documents length mean: %.2f" % n.mean(doc_lengths))
  print("All documents length std: %.2f" % n.std(doc_lengths))
  print("All documents length median: %.2f" % n.median(doc_lengths))

  # length_stats yields (median, mean, std) per prefix; zip(*...)
  # transposes those triples into three per-statistic sequences.
  per_prefix = [length_stats(prefix) for prefix in all_prefixes]
  medlist, meanlist, stdlist = zip(*per_prefix)
  print("Mean of means: %.2f" % n.mean(meanlist))
  print("Mean of std: %.2f" % n.mean(stdlist))
  print("Mean of median: %.2f" % n.mean(medlist))
Esempio n. 3
0
def doclength_histogram(path, prefix, num_bins=1000):
  """Save a histogram of the document lengths for ``prefix`` to ``path``.

  The bin edges are ``num_bins`` evenly spaced multiples of
  ``upper_limit / num_bins``, where ``upper_limit`` is the mean length
  plus three standard deviations; the last edge equals ``upper_limit``.
  The upper limit is also printed to stdout.

  Args:
    path: Destination handed to ``p.savefig``.
    prefix: Passed to ``raw_doc_lengths``; also shown in the plot title.
    num_bins: Number of bin edges to generate. Defaults to 1000, the
      value that was previously hard-coded.
  """
  # list() makes sure the array is built from a real sequence even when
  # .values() returns a view rather than a list.
  values = p.array(list(raw_doc_lengths(prefix).values()))
  bin_upper_limit = p.mean(values) + 3 * p.std(values)
  print("UL: " + str(bin_upper_limit))
  # float() keeps the edge spacing a true division for integer limits.
  bins = p.array(range(1, num_bins + 1)) * (bin_upper_limit / float(num_bins))
  try:
    p.hist(values, bins)
    p.xlabel('Document size (unicode codepoints)')
    p.ylabel('Number of documents')
    p.title('Document Size Histogram for %s' % prefix)
    p.savefig(path, dpi=72)
  finally:
    # Always release the figure so a failed plot or save does not leave
    # its contents on the current figure for the next call.
    p.close()
Esempio n. 4
0
def doclength_histogram(path, prefix, num_bins=1000):
    """Save a histogram of the document lengths for ``prefix`` to ``path``.

    The bin edges are ``num_bins`` evenly spaced multiples of
    ``upper_limit / num_bins``, where ``upper_limit`` is the mean length
    plus three standard deviations; the last edge equals ``upper_limit``.
    The upper limit is also printed to stdout.

    Args:
        path: Destination handed to ``p.savefig``.
        prefix: Passed to ``raw_doc_lengths``; also shown in the plot
            title.
        num_bins: Number of bin edges to generate. Defaults to 1000, the
            value that was previously hard-coded.
    """
    # list() makes sure the array is built from a real sequence even when
    # .values() returns a view rather than a list.
    values = p.array(list(raw_doc_lengths(prefix).values()))
    bin_upper_limit = p.mean(values) + 3 * p.std(values)
    print("UL: " + str(bin_upper_limit))
    # float() keeps the edge spacing a true division for integer limits.
    edge_step = bin_upper_limit / float(num_bins)
    bins = p.array(range(1, num_bins + 1)) * edge_step
    try:
        p.hist(values, bins)
        p.xlabel('Document size (unicode codepoints)')
        p.ylabel('Number of documents')
        p.title('Document Size Histogram for %s' % prefix)
        p.savefig(path, dpi=72)
    finally:
        # Always release the figure so a failed plot or save does not
        # leave its contents on the current figure for the next call.
        p.close()
Esempio n. 5
0
def length_stats(prefix):
  """Return ``(median, mean, std)`` of the document lengths for ``prefix``.

  The lengths are the values of the mapping returned by
  ``raw_doc_lengths(prefix)``.
  """
  # Materialize the values: a dict view (Python 3) is not a sequence, so
  # handing it straight to the statistics functions would not be treated
  # as a 1-D collection of numbers.
  doc_lengths = list(raw_doc_lengths(prefix).values())
  median = n.median(doc_lengths)
  mean = n.mean(doc_lengths)
  std = n.std(doc_lengths)
  return median, mean, std
Esempio n. 6
0
def docs_under_thresh(prefix, thresh):
    """Return the number of documents for ``prefix`` shorter than ``thresh``.

    A document counts when its length, taken from the values of
    ``raw_doc_lengths(prefix)``, is strictly less than ``thresh``.
    """
    doc_lengths = raw_doc_lengths(prefix).values()
    # Count lazily instead of len(filter(...)): filter() returns an
    # iterator on Python 3 (len() would raise TypeError), and this also
    # avoids building a throwaway list on Python 2.
    return sum(1 for length in doc_lengths if length < thresh)
Esempio n. 7
0
def indices_under_thresh(prefix, thresh):
    """Return the keys of documents for ``prefix`` shorter than ``thresh``.

    Iterates the (key, length) pairs of ``raw_doc_lengths(prefix)`` and
    keeps every key whose length is strictly less than ``thresh``.
    """
    # items() exists on both Python 2 and 3 mappings (iteritems() is
    # Python 2 only). The loop names avoid shadowing the builtins ``id``
    # and ``len``.
    lengths_by_doc = raw_doc_lengths(prefix)
    return [
        doc_id
        for doc_id, length in lengths_by_doc.items()
        if length < thresh
    ]
Esempio n. 8
0
def length_stats(prefix):
    """Return ``(median, mean, std)`` of the document lengths for ``prefix``.

    The lengths are the values of the mapping returned by
    ``raw_doc_lengths(prefix)``.
    """
    # Materialize the values: a dict view (Python 3) is not a sequence,
    # so handing it straight to the statistics functions would not be
    # treated as a 1-D collection of numbers.
    doc_lengths = list(raw_doc_lengths(prefix).values())
    median = n.median(doc_lengths)
    mean = n.mean(doc_lengths)
    std = n.std(doc_lengths)
    return median, mean, std