def get_results(testing, m):
    """Score top-``m`` predictions and a most-frequent baseline on ``testing``.

    For each test pair, three of its gold paraphrases are sampled as seeds.
    Every key of ``probs`` that is not a seed is scored by summing
    ``probs[candidate][seed]`` over the seeds, and the ``m`` best-scoring
    candidates are checked against the pair's gold paraphrases.  The baseline
    is the first ``m`` non-seed entries of ``totals``.

    NOTE(review): ``probs`` and ``totals`` are not built in this function;
    they are read from module scope -- confirm they are defined before this
    is called.

    Args:
        testing: sized iterable of pairs, each exposing ``paraphrases``.
        m: number of predictions / baseline entries scored per pair.

    Returns:
        ``[acc, baseacc]``: for predictions and baseline respectively, the
        fraction of the top ``m`` entries found among the gold paraphrases,
        averaged over ``len(testing)``.  Pairs skipped for having fewer than
        three gold paraphrases still count in the denominator.  Both values
        are 0.0 when ``testing`` is empty.
    """
    # Augmented assignment makes these names local to the function, so they
    # must be bound here before the first ``+=``.
    total = 0.0
    basetotal = 0.0
    errcount = 0
    nonerrcount = 0

    for pair in testing:
        gold_paras = list(pair.paraphrases)
        if len(gold_paras) <= 2:
            # Not enough gold paraphrases to draw three seeds from.
            errcount += 1
            print("List too short error.")
            continue
        subs = random.sample(gold_paras, 3)

        # Baseline: the leading entries of ``totals`` that are not seeds.
        base = []
        for entry in totals:
            candidate = Paraphrase(entry[0])
            if candidate not in subs:
                base.append(candidate)
            if len(base) == m:
                break

        # All candidate paraphrases, to be ordered by score for this pair.
        results = []
        for key in probs:
            candidate = Paraphrase(key.strip())
            candidate.score = 0.0
            # The seed paraphrases are not allowed in predictions.
            if candidate not in subs:
                results.append(candidate)

        for candidate in results:
            for seed in subs:
                try:
                    weight = probs[candidate.name][seed.name]
                except KeyError:
                    # No table entry for this (candidate, seed) combination.
                    errcount += 1
                else:
                    candidate.score += weight
                    nonerrcount += 1
        results.sort(key=lambda para: para.score, reverse=True)

        basescore = sum(1.0 for b in base[:m] if b in gold_paras)
        score = sum(1.0 for r in results[:m] if r in gold_paras)
        total += score / float(m)
        basetotal += basescore / float(m)

    n_pairs = len(testing)
    # Avoid ZeroDivisionError when there is nothing to evaluate.
    acc = total / n_pairs if n_pairs else 0.0
    baseacc = basetotal / n_pairs if n_pairs else 0.0

    print("predictions:")
    print(acc)
    print("")
    print("baseline:")
    print(baseacc)
    print(errcount)
    print(nonerrcount)
    return [acc, baseacc]
def get_results(training, testing, m,
                web1t_path="/media/Iomega HDD/web1T/clean/"):
    """Evaluate corpus-seeded paraphrase prediction against a baseline.

    Builds ``priors`` and the ``probs`` table from ``training``.  For each
    test pair, the patterns returned by the Web1T search for
    ``(pair.n2, pair.n1)`` that also occur in ``priors`` (as-is or prefixed
    with ``"be "``) become the seeds.  Every key of ``probs`` that is not a
    seed is scored by summing ``probs[candidate][seed]`` over the seeds; the
    top ``m`` are compared with the pair's gold paraphrases.  When no seed is
    found the predictions fall back to the most-frequent baseline.

    Args:
        training: data passed to ``make_priors`` and ``make_prob_table``.
        testing: sized iterable of pairs exposing ``n1``, ``n2`` and
            ``paraphrases``.
        m: number of predictions / baseline entries scored per pair.
        web1t_path: directory handed to ``Web1TSearch``; defaults to the
            location that used to be hard-coded.

    Returns:
        ``[acc, baseacc]``: for predictions and baseline respectively, the
        fraction of the top ``m`` entries found among the gold paraphrases,
        averaged over ``len(testing)`` (0.0 for both when ``testing`` is
        empty).
    """
    w = Web1TSearch(web1t_path)
    print("bulding probability table...")
    priors = make_priors(training)
    probs = make_prob_table(training, priors)
    count = 0
    print("done.")

    total = 0.0
    basetotal = 0.0
    errcount = 0
    nonerrcount = 0

    # Baseline of most frequent overall paraphrases.
    totals = sorted(priors.items(), key=lambda x: x[1], reverse=True)

    for pair in testing:
        count += 1
        print(count)
        print("\n\n*************************************\n\n")
        gold_paras = list(pair.paraphrases)

        print(pair.n2 + " " + pair.n1)
        patterns = w.getNgrams(pair.n2, pair.n1)
        patterns = w.reducePats(patterns, pair.n2, pair.n1)
        # Highest value first, ties broken by pattern text (descending).
        # NOTE(review): assumes reducePats returns a dict-like mapping of
        # pattern -> count; confirm.
        ranked = sorted(patterns.items(),
                        key=lambda kv: (kv[1], kv[0]), reverse=True)

        subs = []
        for pattern, _ in ranked:
            p = Paraphrase(pattern.replace('_', ' '))
            # Dict membership instead of scanning a fresh ``keys()`` list.
            if p.name in priors:
                subs.append(p)
                print(p.name)
            if "be " + p.name in priors:
                subs.append(Paraphrase("be " + p.name))
                # Prints the bare pattern name, as before.
                print(p.name)

        base = []
        for entry in totals:
            candidate = Paraphrase(entry[0])
            if candidate not in subs:
                base.append(candidate)
            if len(base) == m:
                break

        # All candidate paraphrases, to be ordered by score for this pair.
        results = []
        for key in probs:
            candidate = Paraphrase(key.strip())
            candidate.score = 0.0
            # The seed paraphrases are not allowed in predictions.
            if candidate not in subs:
                results.append(candidate)

        for candidate in results:
            for seed in subs:
                try:
                    weight = probs[candidate.name][seed.name]
                except KeyError:
                    errcount += 1
                else:
                    candidate.score += weight
                    nonerrcount += 1
        results.sort(key=lambda para: para.score, reverse=True)

        # Without seeds every score is 0.0, so use the baseline ranking.
        if not subs:
            results = list(base)

        print("")
        print("Gold:")
        for g in gold_paras:
            print(g.name)
        print("")
        print("Seeds")
        for s in subs:
            print(s.name)
        print("")
        print("Predictions: ")
        for p in results[:m]:
            print(p.name)
        print("")
        print("Baseline:")
        for b in base:
            print(b.name)
        print("")

        basescore = sum(1.0 for b in base[:m] if b in gold_paras)
        score = sum(1.0 for r in results[:m] if r in gold_paras)
        total += score / float(m)
        basetotal += basescore / float(m)

    n_pairs = len(testing)
    # Avoid ZeroDivisionError when there is nothing to evaluate.
    acc = total / n_pairs if n_pairs else 0.0
    baseacc = basetotal / n_pairs if n_pairs else 0.0

    print("predictions:")
    print(acc)
    print("")
    print("baseline:")
    print(baseacc)
    print(errcount)
    print(nonerrcount)
    return [acc, baseacc]
# Interactive fragment: reads one paraphrase from stdin and appends it to
# `seeds` when it is a key of `priors` (otherwise prints "not found").  Then,
# for each pair in `all_pairs`, it collects the pair's paraphrases whose `freq`
# is at least `n`, scores every non-seed key of `probs` by summing
# probs[candidate][seed] over the seeds, and sorts the candidates by score,
# highest first.
# NOTE(review): `seeds`, `n`, `all_pairs`, `priors` and `probs` are defined
# outside this view, and the final loop over the top ten results has no body
# here -- the fragment is truncated; confirm against the full file.
inp=raw_input() if inp in priors: seeds.append(Paraphrase(inp)) else: print "not found" print "working..." for pair in all_pairs: paras=[] for p in pair.paraphrases: if p.freq < n : continue paras.append(p) results=[] for p in probs.keys(): x=Paraphrase(p.strip()) x.score=0.0 #the seed paraphrases are not allowed in predictions if not x in seeds: results.append(x) for p in results: for s in seeds: try: p.score+=probs[p.name][s.name] #print "done" except KeyError: pass print "Key Error" results.sort(key= lambda para: para.score, reverse=True) for r in results[0:10]:
def get_results(training, testing, m):
    """Evaluate prior-weighted prediction against two baselines.

    Builds ``priors`` (via ``make_priors_freq``) and the ``probs`` table from
    ``training``.  For each test pair, three gold paraphrases are sampled as
    seeds.  Every key of ``probs`` that is not a seed starts with the score
    ``priors[name]`` and is multiplied by ``probs[name][seed]`` for each seed
    that has an entry; the top ``m`` are compared with the gold paraphrases.

    Two baselines are scored the same way: the first ``m`` non-seed entries
    of ``priors`` ordered by descending value, and ``m`` non-seed keys of
    ``priors`` drawn uniformly at random (with replacement).

    Args:
        training: data passed to ``make_priors_freq`` and ``make_prob_table``.
        testing: sized iterable of pairs, each exposing ``paraphrases``.
        m: number of predictions / baseline entries scored per pair.

    Returns:
        ``[acc, baseacc, rand_baseacc]``: for predictions, most-frequent
        baseline and random baseline, the fraction of the top ``m`` entries
        found among the gold paraphrases, averaged over ``len(testing)``.
        Pairs skipped for having fewer than three gold paraphrases still
        count in the denominator.  All three are 0.0 for empty ``testing``.
    """
    print("bulding probability table...")
    priors = make_priors_freq(training)
    probs = make_prob_table(training, priors)
    print("done.")

    total = 0.0
    basetotal = 0.0
    rand_basetotal = 0.0
    errcount = 0
    nonerrcount = 0

    # Baseline of most frequent overall paraphrases.
    totals = sorted(priors.items(), key=lambda x: x[1], reverse=True)

    for pair in testing:
        gold_paras = list(pair.paraphrases)
        if len(gold_paras) <= 2:
            # Not enough gold paraphrases to draw three seeds from.
            errcount += 1
            print("List too short error.")
            continue
        subs = random.sample(gold_paras, 3)

        # Built once: a second pass over ``totals`` would append entries
        # that are already in ``base``.
        base = []
        for entry in totals:
            candidate = Paraphrase(entry[0])
            if candidate not in subs:
                base.append(candidate)
            if len(base) == m:
                break

        # Random baseline: ``m`` draws so it matches the ``[:m]`` slice and
        # the ``/ m`` normalisation below.  Filtering first (rather than
        # rejecting seeds in a loop) cannot spin forever when every key of
        # ``priors`` is a seed.
        pool = [key for key in priors if Paraphrase(key) not in subs]
        rand_base = []
        if pool:
            rand_base = [Paraphrase(random.choice(pool)) for _ in range(m)]

        # All candidate paraphrases, to be ordered by score for this pair.
        results = []
        for key in probs:
            candidate = Paraphrase(key.strip())
            candidate.score = 0.0
            # The seed paraphrases are not allowed in predictions.
            if candidate not in subs:
                results.append(candidate)

        for candidate in results:
            candidate.score = priors[candidate.name]
            for seed in subs:
                try:
                    factor = probs[candidate.name][seed.name]
                except KeyError:
                    # Missing entries leave the running product unchanged.
                    errcount += 1
                else:
                    candidate.score = candidate.score * factor
                    nonerrcount += 1
        results.sort(key=lambda para: para.score, reverse=True)

        rand_basescore = sum(1.0 for p in rand_base[:m] if p in gold_paras)
        basescore = sum(1.0 for b in base[:m] if b in gold_paras)
        score = sum(1.0 for r in results[:m] if r in gold_paras)
        total += score / float(m)
        basetotal += basescore / float(m)
        rand_basetotal += rand_basescore / float(m)

    n_pairs = len(testing)
    # Avoid ZeroDivisionError when there is nothing to evaluate.
    acc = total / n_pairs if n_pairs else 0.0
    baseacc = basetotal / n_pairs if n_pairs else 0.0
    rand_baseacc = rand_basetotal / n_pairs if n_pairs else 0.0

    print("predictions:")
    print(acc)
    print("")
    print("most frequent baseline:")
    print(baseacc)
    print("random baseline:")
    print(rand_baseacc)
    print(errcount)
    print(nonerrcount)
    return [acc, baseacc, rand_baseacc]