def train(links): from math import exp,fabs,log fwords=most_frequent_words() classifiers=[PredicateClassifier(HasWordsPredicate([w])) for w in fwords] #classifiers.extend(PredicateClassifier(HasWordsPredicate(duo)) for duo in most_frequent_duos(fwords)) titles=[mash_post(l) for l in links] evaluations=[1. if l.evaluation else -1. for l in links] weights=[1./len(links) for l in links] trained=[] print "Training on %d features..." % len(classifiers) while True: print ".", min_error=1e6 ; best=None for c in classifiers: c.train(titles,weights,evaluations) error=sum(weights[n]*0.5*fabs(c.predict(t)-evaluations[n]) for n,t in enumerate(titles)) if error < min_error: best=c; min_error=error if min_error>=0.5: print min_error break Zt=sum(weights[n]*exp(-best.predict(t)*evaluations[n]) for n,t in enumerate(titles)) weights=[weights[n]*exp(-best.predict(t)*evaluations[n])/Zt for n,t in enumerate(titles)] alphat=0.5*log((1-min_error)/min_error) trained.append((best,alphat)) classifiers.remove(best) for c,alpha in trained: print c.predicate,c.wordgood,alpha import cPickle cPickle.dump(trained,open("adaboost.pck","wb"),-1)
def predict(link):
    """Classify *link* with the trained ensemble.

    Computes the alpha-weighted vote of every boosted classifier over the
    link's mashed post text and returns +1. when the vote is non-negative,
    -1. otherwise.
    """
    words = mash_post(link)
    score = 0.
    for classifier, alpha in trained:
        score += alpha * classifier.predict(words)
    return 1. if score >= 0 else -1.