def order_the_hose(trigram, infile, outfile):
    """Write every scored item from infile to outfile in ascending order.

    Each item yielded by ``trigram.hose_filter`` is read for its ``'score'``
    and ``'text'`` entries. The ``(score, text)`` pairs are sorted (by score,
    ties broken by text) and written one per line as ``"<score> <text>"``.

    All rows are collected in memory before anything is written, because the
    sort needs the complete input.

    Args:
        trigram: object providing ``hose_filter(file)``.
        infile: input path, opened with ``open_maybe_gzip``
            (presumably gzip-aware -- confirm against that helper).
        outfile: output path, opened with ``open_maybe_gzip`` in ``'w'`` mode.
    """
    f = open_maybe_gzip(infile)
    try:
        fout = open_maybe_gzip(outfile, 'w')
        try:
            rows = [(d['score'], d['text']) for d in trigram.hose_filter(f)]
            rows.sort()
            for r in rows:
                fout.write("%5f %s\n" % r)
        finally:
            # Close even when filtering/sorting/writing fails, so the handle
            # (and any buffered output) is not leaked.
            fout.close()
    finally:
        f.close()
def dump_users(users, outfile):
    """Write all the users to a file with their scores.

    One line is written per user, in the form ``"<mean> <user>"``, where the
    mean is ``sum(scores) / len(scores)``.

    Args:
        users: mapping exposing ``iteritems()`` that yields
            ``(user, scores)`` pairs; ``scores`` must support ``sum`` and
            ``len``.
        outfile: output path, opened with ``open_maybe_gzip`` in ``'w'`` mode.

    Raises:
        ZeroDivisionError: if any user's score sequence is empty.
    """
    fout = open_maybe_gzip(outfile, 'w')
    try:
        for k, v in users.iteritems():
            mean = sum(v) / len(v)
            fout.write("%4f %s\n" % (mean, k))
    finally:
        # Guarantee the handle is released even if a row fails part-way
        # (e.g. an empty score list raising ZeroDivisionError).
        fout.close()
def partition_users(users, outfile, rejfile, threshold):
    """Split users into two files according to their mean score.

    Users whose mean score is ``>= threshold`` are written to ``outfile``;
    all others go to ``rejfile``. Each line has the form ``"<mean> <user>"``.

    Args:
        users: mapping exposing ``iteritems()`` that yields
            ``(user, scores)`` pairs.
        outfile: path for accepted users, or ``None`` to discard them
            (``os.devnull`` is used instead).
        rejfile: path for rejected users, or ``None`` to discard them.
        threshold: minimum mean score (inclusive) for acceptance.

    Raises:
        ZeroDivisionError: if any user's score sequence is empty.
    """
    if outfile is None:
        outfile = os.devnull
    if rejfile is None:
        rejfile = os.devnull

    fout = open_maybe_gzip(outfile, 'w')
    try:
        frej = open_maybe_gzip(rejfile, 'w')
        try:
            for k, v in users.iteritems():
                # A single score is used as-is rather than going through
                # sum()/len().
                if len(v) == 1:
                    mean = v[0]
                else:
                    mean = sum(v) / len(v)
                dest = fout if mean >= threshold else frej
                dest.write("%4f %s\n" % (mean, k))
        finally:
            # Release both handles even when a row fails part-way.
            frej.close()
    finally:
        fout.close()
def bisect_the_hose(trigram, infile, goodfile, rejectfile, threshold):
    """Split scored items from infile into "good" and "reject" files.

    Every item yielded by ``trigram.hose_filter`` is written as
    ``"<score> <text>"`` to ``goodfile`` when its ``'score'`` is
    ``>= threshold``, and to ``rejectfile`` otherwise.

    Args:
        trigram: object providing ``hose_filter(file)`` and
            ``probable_similarity(text)``.
        infile: input path, opened with ``open_maybe_gzip``.
        goodfile: path for items at or above the threshold, or ``None`` to
            discard them (``os.devnull`` is used instead).
        rejectfile: path for items below the threshold, or ``None`` to
            discard them.
        threshold: numeric cut-off, or a ``str`` which is converted to a
            score via ``trigram.probable_similarity`` before use.
    """
    line_format = "%(score)5f %(text)s\n"

    f = open_maybe_gzip(infile)
    try:
        if goodfile is None:
            goodfile = os.devnull
        if rejectfile is None:
            rejectfile = os.devnull

        fgood = open_maybe_gzip(goodfile, 'w')
        try:
            frej = open_maybe_gzip(rejectfile, 'w')
            try:
                # A string threshold is an example text; its own score
                # becomes the cut-off.
                if isinstance(threshold, str):
                    threshold = trigram.probable_similarity(threshold)
                    debug("threshold is", threshold)

                for d in trigram.hose_filter(f):
                    dest = fgood if d['score'] >= threshold else frej
                    dest.write(line_format % d)
            finally:
                # Nested try/finally so every handle that was opened is
                # closed, even if a later open or the filtering fails.
                frej.close()
        finally:
            fgood.close()
    finally:
        f.close()