def nearCrits(crit, critset, filmset, fracTrain = 0.7, learnRate = 0.024):
    """Print the critics whose lone 'fresh' vote shifts the model least and most.

    A labeled set is built with ``makeLabeledset(crit, filmset, critset)``,
    shuffled, and the first ``fracTrain`` share of it is used to train a
    logistic-regression classifier. Every critic in ``critset`` is then scored
    by the predicted probability of the 'fresh' label for the feature dict
    ``{critic: 'fresh'}`` minus the probability predicted for an empty
    feature dict.

    Args:
        crit: Forwarded to ``makeLabeledset`` (presumably the critic being
            modelled -- confirm against that helper).
        critset: Iterable of critic identifiers; each one is used as a
            feature key and is also forwarded to ``makeLabeledset``.
        filmset: Forwarded to ``makeLabeledset``.
        fracTrain: Fraction of the shuffled labeled set used for training.
            The remainder is not used by this function.
        learnRate: Despite its name, this is passed as ``C`` to
            ``LogisticRegression`` (scikit-learn's inverse regularization
            strength), not a learning rate.

    Returns:
        None. Results are printed as ``(critic, score)`` tuples: first the
        (up to) 10 lowest-scoring critics, then the (up to) 10
        highest-scoring ones, each group in descending score order. With
        fewer than 20 critics the two groups overlap.
    """
    import random
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    labeledset = makeLabeledset(crit, filmset, critset)
    random.shuffle(labeledset)
    trainset = labeledset[:int(len(labeledset) * fracTrain)]

    clf = SklearnClassifier(LogisticRegression(C=learnRate))
    clf.train(trainset)

    # Probability of 'fresh' when no critic has voted at all; each critic's
    # score is measured relative to this.
    baseline = clf.prob_classify({}).prob('fresh')

    # The loop variable is deliberately not called `crit`, so the `crit`
    # parameter is not shadowed.
    critdist = [
        (critic, clf.prob_classify({critic: 'fresh'}).prob('fresh') - baseline)
        for critic in critset
    ]
    critdist.sort(key=lambda pair: pair[1], reverse=True)

    # Slices instead of indices -10..9: same items in the same order when
    # there are at least 10 critics, but no IndexError when there are fewer.
    for entry in critdist[-10:]:
        print(entry)
    for entry in critdist[:10]:
        print(entry)
def suggestions(labeledset, featureset, num = 20):
    """Return the unseen films the trained model rates most likely 'fresh'.

    A logistic-regression classifier (``C=0.024``) is trained on
    ``labeledset``. Every feature dict in ``featureset`` whose ``'title'``
    does not occur among the titles in ``labeledset`` is then scored with the
    predicted probability of the 'fresh' label.

    Args:
        labeledset: Sequence of entries whose first element is a feature dict
            containing a ``'title'`` key; it is passed unchanged to
            ``SklearnClassifier.train``. It is iterated twice, so it must not
            be a one-shot iterator.
        featureset: Iterable of feature dicts, each with a ``'title'`` key.
        num: Maximum number of suggestions to return.

    Returns:
        A list of at most ``num`` ``(title, probability)`` tuples sorted by
        probability in descending order. Ties keep their ``featureset`` order.
    """
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    clf = SklearnClassifier(LogisticRegression(C=0.024))
    clf.train(labeledset)

    # A set makes each "already seen?" check O(1) instead of scanning a list
    # once per candidate film. Assumes titles are hashable (e.g. strings).
    seenTitles = {entry[0]['title'] for entry in labeledset}

    ranked = [
        (film['title'], clf.prob_classify(film).prob('fresh'))
        for film in featureset
        if film['title'] not in seenTitles
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:num]