コード例 #1
0
ファイル: imperium2.py プロジェクト: alexsparrow/ml
def run(fname, recs, bag, test=False):
    punct_bag = dict([(b,a) for a, b in enumerate(["!", ".", ":", ";","?", ",", "'", '"'])])
    punct_names = {"!":"exclamation",
                   ".":"fullstop",
                   ":":"colon",
                   ";":"semicolon",
                   "?":"question",
                   ",":"comma",
                   "'":"invertedcomma",
                   '"':"quote"
                }
    print punct_bag
    out = open(fname, "w")
    if not test: out.write("Response,")
    bag_sorted = sorted(bag.items(), key=lambda x : x[1])
    punct_sorted = sorted(punct_bag.items(), key=lambda x: x[1])
    headers = ["WB_%s" % k for (k,v) in bag_sorted]
    headers += ["PUNCT_%s" % punct_names[k] for (k,v) in punct_sorted]
    headers += ["Hour", "Weekday"]
    out.write(",".join(headers))
    #out.write(",".join(["WB%d" % x for x in range(len(bag) + len(punct_bag))]))
    out.write("\n")
    for rec in recs:
        if not test: out.write({True:"1", False:"0"}[rec[0]] + ",") 
        out.write(",".join(["%d" % s for s in WordBag.vector(rec[2], bag)]))
        out.write(",")
        punct_vec = WordBag.vector(rec[3].strip('"').strip(), punct_bag)
        out.write(",".join(["%d" % s for s in punct_vec]))
        if len(rec[1].strip()):
            y = int(rec[1][:4])
            m = int(rec[1][4:6])
            d = int(rec[1][6:8])
            h = int(rec[1][8:10])
            dt = datetime.date(y, m,d)
            out.write(",%d" % int(rec[1][8:10]))
            out.write(",%d" % dt.weekday())
        else:
            out.write(",NA,NA")
        out.write("\n")
コード例 #2
0
ファイル: imperium2.py プロジェクト: alexsparrow/ml
def cmp(fname1, fname2):
    in1 = open(fname1)
    in2 = open(fname2)
    vals = [(x.split(",")[0], y.split(",")[0]) for x, y in zip(in1.readlines(), in2.readlines())]
    del vals[0]
    vals = [(float(x), float(y)) for x, y in vals]
    agree = 0
    disagree = 0
    for v1, v2 in vals:
        print v1, v2
        if (v1 > 0.5) == (v2 > 0.5): agree += 1
        else: disagree +=1
    print "Agree=%d, Disagree=%d" % (agree, disagree)

if __name__ == "__main__":
    wbag = WordBag()
    train = load_data("data/imperium.csv")
    good = {}
    good_tot = 0
    bad = {}
    bad_tot = 0

    # Bag the words and also tally the number of insults non-insults in training sets
    for rec in train:
        wbag.process(rec[2])
        for w in rec[2]:
            if rec[0]: 
                bad[w] = bad.setdefault(w, 0) + 1
                bad_tot += 1
            else: 
                good[w] = good.setdefault(w, 0) + 1
コード例 #3
0
 def get_wordbag_for_phrase(self, cursor, phrase):
     cursor.execute('SELECT word from %s where phrase = ?' % config.word_table, [phrase])
     results = cursor.fetchall()
     return WordBag(itertools.chain(*results))