Esempio n. 1
0
def order_the_hose(trigram, infile, outfile):
    """Write the filtered items of infile to outfile, sorted by score.

    Every item yielded by ``trigram.hose_filter`` is read for its 'score'
    and 'text' entries. The (score, text) pairs are sorted in ascending
    order (score first, text as tie-breaker) and written one per line in
    the form "<score> <text>".

    Both files are closed even when filtering, sorting or writing raises.
    """
    f = open_maybe_gzip(infile)
    try:
        fout = open_maybe_gzip(outfile, 'w')
        try:
            # Sorting needs the complete stream, so it is held in memory.
            rows = sorted((d['score'], d['text'])
                          for d in trigram.hose_filter(f))
            for row in rows:
                fout.write("%5f %s\n" % row)
        finally:
            fout.close()
    finally:
        f.close()
Esempio n. 2
0
def dump_users(users, outfile):
    """Write all the users to a file with their scores.

    ``users`` maps each user key to a non-empty sequence of scores. One
    line per user is written in the form "<mean score> <key>".

    The output file is closed even when computing a mean or writing
    raises (an empty score sequence raises ZeroDivisionError).
    """
    fout = open_maybe_gzip(outfile, 'w')
    try:
        for k, v in users.iteritems():
            mean = sum(v) / len(v)
            fout.write("%4f %s\n" % (mean, k))
    finally:
        fout.close()
Esempio n. 3
0
def partition_users(users, outfile, rejfile, threshold):
    """Split users into two files according to their mean score.

    ``users`` maps each user key to a non-empty sequence of scores. Users
    whose mean score is >= ``threshold`` are written to ``outfile``, all
    others to ``rejfile``, one line each in the form "<mean score> <key>".

    Passing None for ``outfile`` or ``rejfile`` sends that half to
    ``os.devnull``. Both files are closed even when the loop raises.
    """
    if outfile is None:
        outfile = os.devnull
    if rejfile is None:
        rejfile = os.devnull
    fout = open_maybe_gzip(outfile, 'w')
    try:
        frej = open_maybe_gzip(rejfile, 'w')
        try:
            for k, v in users.iteritems():
                # A single score is used as-is rather than averaged.
                if len(v) == 1:
                    mean = v[0]
                else:
                    mean = sum(v) / len(v)
                target = fout if mean >= threshold else frej
                target.write("%4f %s\n" % (mean, k))
        finally:
            frej.close()
    finally:
        fout.close()
Esempio n. 4
0
def bisect_the_hose(trigram, infile, goodfile, rejectfile, threshold):
    """Split the filtered items of infile into two files by score.

    Every item yielded by ``trigram.hose_filter`` whose 'score' is >=
    ``threshold`` is written to ``goodfile``; all others go to
    ``rejectfile``. Each line has the form "<score> <text>".

    Passing None for ``goodfile`` or ``rejectfile`` sends that half to
    ``os.devnull``. When ``threshold`` is a str it is first converted to a
    number with ``trigram.probable_similarity``.

    All three files are closed even when an error occurs part-way.
    """
    f = open_maybe_gzip(infile)
    try:
        if goodfile is None:
            goodfile = os.devnull
        if rejectfile is None:
            rejectfile = os.devnull
        fgood = open_maybe_gzip(goodfile, 'w')
        try:
            frej = open_maybe_gzip(rejectfile, 'w')
            try:
                if isinstance(threshold, str):
                    threshold = trigram.probable_similarity(threshold)
                    debug("threshold is", threshold)

                for d in trigram.hose_filter(f):
                    target = fgood if d['score'] >= threshold else frej
                    target.write("%(score)5f %(text)s\n" % d)
            finally:
                frej.close()
        finally:
            fgood.close()
    finally:
        f.close()