def prep(type):
    """Load class labels and per-document word probabilities for a corpus.

    Reads ``<util.projdir>/corpus/<type>-class``; each line is split on a
    single space, the document id is the second ``-``-separated piece of the
    first field and the label is the second field (up to the newline).

    Module-level state written:
      * ``classifiers`` -- reset, then maps document id -> label.
      * ``probs``       -- reset, then maps document id -> {lower-cased word:
                           relative frequency of that word in the document}.
      * ``doclens``     -- updated in place: document id -> total word count.
      * ``truedocs``    -- appended to (not reset) with every document whose
                           label is "true", or every document when
                           ``type == "test"``.
      * ``r_n``         -- reset, then the number of documents labelled
                           "satire".

    ``type`` is the corpus name used to build the file paths (the name shadows
    the builtin but is kept so existing callers are unaffected).
    """
    global probs
    global classifiers
    global doclens
    global truedocs
    global r_n

    probs = {}
    classifiers = {}
    r_n = 0

    # Read the class labels; the context manager closes the file promptly.
    with open(util.projdir + "/corpus/" + type + "-class", "r") as class_file:
        for line in class_file:
            fields = line.split(' ')
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            classifiers[fields[0].split('-')[1]] = fields[1].split('\n')[0]

    # Read the word counts and turn them into relative frequencies.
    # .items() behaves the same on Python 2 and 3 (iteritems is 2-only).
    for docid, label in classifiers.items():
        if label == "satire":
            r_n += 1
        if label == "true" or type == "test":
            truedocs.append(docid)

        # presumably a {word: count} mapping -- confirm against util.read_counts
        counts = util.read_counts("/data/bag/" + type + "/" + type + "-" + docid)
        total = float(sum(counts.values()))
        doclens[docid] = total

        doc_probs = {}
        probs[docid] = doc_probs
        if total <= 0.0:
            # Empty (or all-zero) document: leave it without probabilities
            # rather than dividing by zero.
            continue
        for word, count in counts.items():
            key = word.lower()
            # Normalise the *count* (the old code divided a value looked up in
            # the still-empty probs dict, so every probability came out 0.0).
            # Case variants of one word fold onto the same key, so accumulate.
            doc_probs[key] = doc_probs.get(key, 0.0) + count / total
def prep(type):
    """Load class labels and per-document word probabilities for a corpus.

    Reads ``<util.projdir>/corpus/<type>-class``; each line is split on a
    single space, the document id is the second ``-``-separated piece of the
    first field and the label is the second field (up to the newline).

    Module-level state written:
      * ``classifiers`` -- reset, then maps document id -> label.
      * ``probs``       -- reset, then maps document id -> {lower-cased word:
                           relative frequency of that word in the document}.
      * ``doclens``     -- updated in place: document id -> total word count.
      * ``truedocs``    -- appended to (not reset) with every document whose
                           label is "true", or every document when
                           ``type == "test"``.
      * ``r_n``         -- reset, then the number of documents labelled
                           "satire".

    ``type`` is the corpus name used to build the file paths (the name shadows
    the builtin but is kept so existing callers are unaffected).
    """
    global probs
    global classifiers
    global doclens
    global truedocs
    global r_n

    probs = {}
    classifiers = {}
    r_n = 0

    # Read the class labels; the context manager closes the file promptly.
    with open(util.projdir + "/corpus/" + type + "-class", "r") as class_file:
        for line in class_file:
            fields = line.split(' ')
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            classifiers[fields[0].split('-')[1]] = fields[1].split('\n')[0]

    # Read the word counts and turn them into relative frequencies.
    # .items() behaves the same on Python 2 and 3 (iteritems is 2-only).
    for docid, label in classifiers.items():
        if label == "satire":
            r_n += 1
        if label == "true" or type == "test":
            truedocs.append(docid)

        # presumably a {word: count} mapping -- confirm against util.read_counts
        counts = util.read_counts("/data/bag/" + type + "/" + type + "-" + docid)
        total = float(sum(counts.values()))
        doclens[docid] = total

        doc_probs = {}
        probs[docid] = doc_probs
        if total <= 0.0:
            # Empty (or all-zero) document: leave it without probabilities
            # rather than dividing by zero.
            continue
        for word, count in counts.items():
            key = word.lower()
            # Normalise the *count* (the old code divided a value looked up in
            # the still-empty probs dict, so every probability came out 0.0).
            # Case variants of one word fold onto the same key, so accumulate.
            doc_probs[key] = doc_probs.get(key, 0.0) + count / total
# Grooms training data and produces initial probabilities.
# Keep this file in $proj.
#
# Document x^i for i=1,...,m is a sequence of words x_1^i,...,x_n^i.
# y^i for i=1,...,m is a hidden boolean; y=true --> x^i is classified as satire.
import util

# NOTE(review): `sum` and `type` below shadow Python builtins. The names are
# kept because code outside this view may refer to them -- confirm before
# renaming.
types = ["all", "satire", "true"]
counts = {}  # word counts
sum = {}  # total word count
prob = {}  # P(y)

for type in types:
    # The project directory lives on the util module; a bare `projdir` is not
    # defined in this script and raised NameError.
    counts[type] = util.read_counts(util.projdir + "/data/bag/training/count-" + type)
#    for w,c in counts[type].iteritems():
#        fout_word.write(w+' %f'%(c/sum[type])+'\n')

# P(y) forall y
#fin_type = open(util.projdir + "/data/bag/training/prob-type", "r")  # P(y) -- overall probability of satire categorization
#for line in fin_type:
#    sp = line.split(' ')
#    prob[sp[0]] = float(sp[1])
#for type in ["satire", "true", "all"]:
#    prob[type] = prob[type] / prob["all"]

# calculate P(y|x) forall words x in vocabulary
for type in ["satire", "true"]:  # for y=true,false
    # P(y|x) -- probability of satire categorization given word
    # NOTE(review): nothing in the visible code writes to or closes fout_doc;
    # the loop body may continue beyond this view, so the handle is left open
    # here rather than wrapped in `with` -- confirm and close it where the
    # writing ends.
    fout_doc = open(util.projdir + "/data/bag/training/prob-doc-" + type, "w")
#    const = sum[type] * prob[type] / sum["all"]