def overall_collection_stats():
    """Print length statistics for the whole collection.

    Two groups of figures are written to stdout:

    * mean, standard deviation and median of every document length,
      pooled across all entries of the module-level ``all_prefixes``;
    * the average, over prefixes, of each prefix's own mean, standard
      deviation and median as reported by ``length_stats``.

    Returns nothing. ``n`` is the numeric module imported at file level
    (presumably numpy -- confirm against the file's imports).
    """
    # Pool the lengths of every document of every prefix into one flat list.
    pooled_lengths = []
    for prefix in all_prefixes:
        pooled_lengths.extend(raw_doc_lengths(prefix).values())

    print("All documents length mean: %.2f" % n.mean(pooled_lengths))
    print("All documents length std: %.2f" % n.std(pooled_lengths))
    print("All documents length median: %.2f" % n.median(pooled_lengths))

    # length_stats returns (median, mean, std) for one prefix; transposing
    # the per-prefix triples yields one sequence per statistic.
    per_prefix_stats = [length_stats(prefix) for prefix in all_prefixes]
    medians, means, stds = zip(*per_prefix_stats)

    print("Mean of means: %.2f" % n.mean(means))
    print("Mean of std: %.2f" % n.mean(stds))
    print("Mean of median: %.2f" % n.mean(medians))
def overall_collection_stats():
    """Print length statistics for the whole collection.

    Two groups of figures are written to stdout:

    * mean, standard deviation and median of every document length,
      pooled across all entries of the module-level ``all_prefixes``;
    * the average, over prefixes, of each prefix's own mean, standard
      deviation and median as reported by ``length_stats``.

    Returns nothing. ``n`` is the numeric module imported at file level
    (presumably numpy -- confirm against the file's imports).
    """
    # Pool the lengths of every document of every prefix into one flat list.
    pooled_lengths = []
    for prefix in all_prefixes:
        pooled_lengths.extend(raw_doc_lengths(prefix).values())

    print("All documents length mean: %.2f" % n.mean(pooled_lengths))
    print("All documents length std: %.2f" % n.std(pooled_lengths))
    print("All documents length median: %.2f" % n.median(pooled_lengths))

    # length_stats returns (median, mean, std) for one prefix; transposing
    # the per-prefix triples yields one sequence per statistic.
    per_prefix_stats = [length_stats(prefix) for prefix in all_prefixes]
    medians, means, stds = zip(*per_prefix_stats)

    print("Mean of means: %.2f" % n.mean(means))
    print("Mean of std: %.2f" % n.mean(stds))
    print("Mean of median: %.2f" % n.mean(medians))
def doclength_histogram(path, prefix, num_bins=1000):
    """Save a histogram of the document lengths of ``prefix`` to ``path``.

    Args:
        path: Destination handed to ``p.savefig``.
        prefix: Collection prefix passed to ``raw_doc_lengths``; it also
            appears in the plot title.
        num_bins: Number of evenly spaced bin edges to generate. The
            default of 1000 reproduces the previously hard-coded layout.

    Side effects:
        Prints the upper bin limit to stdout, writes the figure at 72 dpi
        and closes it afterwards.

    ``p`` is the plotting module imported at file level (presumably
    pylab -- confirm against the file's imports).
    """
    # list() keeps this working when .values() returns a view, not a list.
    values = p.array(list(raw_doc_lengths(prefix).values()))

    # The last bin edge sits three standard deviations above the mean.
    bin_upper_limit = p.mean(values) + 3 * p.std(values)
    print("UL: " + str(bin_upper_limit))

    # Edges run from bin_upper_limit / num_bins up to bin_upper_limit.
    # NOTE(review): the first edge is not 0, so documents shorter than one
    # bin width (and those above the limit) presumably fall outside every
    # bin -- confirm that this is intended.
    bin_width = bin_upper_limit / float(num_bins)
    bins = p.array(range(1, num_bins + 1)) * bin_width

    p.hist(values, bins)
    p.xlabel('Document size (unicode codepoints)')
    p.ylabel('Number of documents')
    p.title('Document Size Histogram for %s' % prefix)
    p.savefig(path, dpi=72)
    p.close()
def doclength_histogram(path, prefix, num_bins=1000):
    """Save a histogram of the document lengths of ``prefix`` to ``path``.

    Args:
        path: Destination handed to ``p.savefig``.
        prefix: Collection prefix passed to ``raw_doc_lengths``; it also
            appears in the plot title.
        num_bins: Number of evenly spaced bin edges to generate. The
            default of 1000 reproduces the previously hard-coded layout.

    Side effects:
        Prints the upper bin limit to stdout, writes the figure at 72 dpi
        and closes it afterwards.

    ``p`` is the plotting module imported at file level (presumably
    pylab -- confirm against the file's imports).
    """
    # list() keeps this working when .values() returns a view, not a list.
    values = p.array(list(raw_doc_lengths(prefix).values()))

    # The last bin edge sits three standard deviations above the mean.
    bin_upper_limit = p.mean(values) + 3 * p.std(values)
    print("UL: " + str(bin_upper_limit))

    # Edges run from bin_upper_limit / num_bins up to bin_upper_limit.
    # NOTE(review): the first edge is not 0, so documents shorter than one
    # bin width (and those above the limit) presumably fall outside every
    # bin -- confirm that this is intended.
    bin_width = bin_upper_limit / float(num_bins)
    bins = p.array(range(1, num_bins + 1)) * bin_width

    p.hist(values, bins)
    p.xlabel('Document size (unicode codepoints)')
    p.ylabel('Number of documents')
    p.title('Document Size Histogram for %s' % prefix)
    p.savefig(path, dpi=72)
    p.close()
def length_stats(prefix):
    """Return ``(median, mean, std)`` of the document lengths of ``prefix``.

    The lengths are the values of the mapping returned by
    ``raw_doc_lengths(prefix)``. Note the order of the result: the median
    comes first, then the mean, then the standard deviation.
    """
    lengths = raw_doc_lengths(prefix).values()
    return n.median(lengths), n.mean(lengths), n.std(lengths)
def docs_under_thresh(prefix, thresh):
    """Count the documents of ``prefix`` that are shorter than ``thresh``.

    Args:
        prefix: Collection prefix passed to ``raw_doc_lengths``.
        thresh: Exclusive upper bound; a document is counted only when
            its length is strictly less than this value.

    Returns:
        The number of matching documents as an int (0 when none match).
    """
    # Count lazily instead of building a filtered list just to take its
    # length; this also works where filter() returns an iterator.
    return sum(
        1 for length in raw_doc_lengths(prefix).values() if length < thresh
    )
def indices_under_thresh(prefix, thresh):
    """Return the keys of the documents of ``prefix`` shorter than ``thresh``.

    Args:
        prefix: Collection prefix passed to ``raw_doc_lengths``.
        thresh: Exclusive upper bound; a document is selected only when
            its length is strictly less than this value.

    Returns:
        A list of the keys of ``raw_doc_lengths(prefix)`` (presumably
        document ids) whose length is below ``thresh``, in the mapping's
        iteration order.
    """
    # The loop variables are deliberately not called ``id`` / ``len``:
    # those names would shadow the builtins, and in Python 2 a list
    # comprehension leaks its variables into the enclosing function scope.
    return [
        doc_id
        for doc_id, length in raw_doc_lengths(prefix).items()
        if length < thresh
    ]