import numpy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
# SimpleTimer, outputScores, and plot_learning_curve are project-local
# helpers defined elsewhere in this repo.

def runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    outFile = open('knnLog25.txt', 'a')
    print 'running mashable knn simulation'
    outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
    outFile.write('test==> %d, %d \n' % (test_M.shape[0], test_M.shape[1]))

    # Baseline: default KNN with distance weighting.
    with SimpleTimer('time to train', outFile):
        clf = KNeighborsClassifier(weights='distance').fit(train_M, dataTrain.target)
    baseScore = clf.score(test_M, dataTest.target)
    baseNeighbors = clf.get_params(True)['n_neighbors']
    plot_learning_curve(clf, 'knn with %d neighbors' % baseNeighbors,
                        train_M, dataTrain.target, cv=5, n_jobs=4)
    print 'baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors)
    outFile.write('baseline score %.3f base n_neighbors %d \n' % (baseScore, baseNeighbors))

    # Tune n_neighbors on the holdout set.
    res = []
    with SimpleTimer('time to fine tune number of neighbors', outFile):
        for neighbors in range(2, baseNeighbors * 10):
            clf = KNeighborsClassifier(n_neighbors=neighbors,
                                       weights='distance').fit(train_M, dataTrain.target)
            score = clf.score(hold_M, holdout.target)
            res.append((score, neighbors))
            outFile.write('%d %.3f \n' % (neighbors, score))
    res = sorted(res, key=lambda x: x[0], reverse=True)
    print res[:5]
    bestNeighbors = res[0][1]
    print 'best number of neighbors is %d' % bestNeighbors
    outFile.write('best number of neighbors is %d and score is %.3f \n' % (bestNeighbors, res[0][0]))

    # Retrain with the best n_neighbors and score on train and test.
    bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
    bestClf.fit(train_M, dataTrain.target)
    predicted = bestClf.predict(test_M)
    trainPredict = bestClf.predict(train_M)
    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)

    # Inspect a few misclassified test examples.
    results = predicted == dataTest.target
    print numpy.mean(results)
    wrong = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in wrong[:10]:
        print dataTest.data[i], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors,
                        train_M, dataTrain.target, cv=5, n_jobs=4)
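# --- Reference sketches of the shared helpers --------------------------------
# All four simulations rely on SimpleTimer and outputScores, which live
# elsewhere in this repo. Minimal sketches follow, assuming SimpleTimer is a
# timing context manager and outputScores logs standard classification
# metrics; the real implementations may differ.
import time
from sklearn import metrics

class SimpleTimer(object):
    """Context manager that prints and logs elapsed wall-clock time."""
    def __init__(self, label, outFile=None):
        self.label = label
        self.outFile = outFile
    def __enter__(self):
        self.start = time.time()
        return self
    def __exit__(self, excType, excValue, tb):
        elapsed = time.time() - self.start
        print '%s: %.2f seconds' % (self.label, elapsed)
        if self.outFile is not None:
            self.outFile.write('%s: %.2f seconds \n' % (self.label, elapsed))

def outputScores(target, predicted, outFile):
    """Print and log accuracy plus a per-class precision/recall/F1 report."""
    accuracy = metrics.accuracy_score(target, predicted)
    report = metrics.classification_report(target, predicted)
    print 'accuracy %.3f' % accuracy
    print report
    outFile.write('accuracy %.3f \n' % accuracy)
    outFile.write(report + '\n')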
def runBoosting(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    outFile = open('boostingLog.txt', 'a')
    print 'running boosting algo'
    outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
    outFile.write('test==> %d, %d \n' % (test_M.shape[0], test_M.shape[1]))

    # The full hyperparameter sweep takes a very long time to run, so the
    # best values it found are hard-coded here.
    # score, bestDepth, bestNum = tryVariousHyperParams(dataTrain, dataTest, train_M, test_M)
    bestDepth = 7
    bestNum = 10000

    with SimpleTimer('time to train', outFile):
        estimator = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf = AdaBoostClassifier(base_estimator=estimator, n_estimators=bestNum)
        bestClf.fit(train_M, dataTrain.target)
    bestScore = bestClf.score(test_M, dataTest.target)
    print 'the best score %.3f' % bestScore
    outFile.write('depth %d, num %d score %.3f \n' % (bestDepth, bestNum, bestScore))

    predicted = bestClf.predict(test_M)
    trainPredict = bestClf.predict(train_M)
    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)

    # Inspect a few misclassified test examples.
    results = predicted == dataTest.target
    wrong = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in wrong[:10]:
        print dataTest.data[i], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'boosting with %d trees' % bestNum,
                        train_M, dataTrain.target, cv=3, n_jobs=4)
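# runBoosting hard-codes bestDepth and bestNum because the full sweep is slow.
# A minimal sketch of what the commented-out tryVariousHyperParams helper
# might look like, assuming it returns the best (score, depth, n_estimators)
# found on the test set; the grids below are assumptions, not the grid the
# original helper actually searched.
def tryVariousHyperParams(dataTrain, dataTest, train_M, test_M):
    best = (0.0, None, None)
    for depth in [1, 3, 5, 7, 9]:          # assumed depth grid
        for num in [100, 1000, 10000]:     # assumed estimator-count grid
            estimator = DecisionTreeClassifier(max_depth=depth)
            clf = AdaBoostClassifier(base_estimator=estimator, n_estimators=num)
            clf.fit(train_M, dataTrain.target)
            score = clf.score(test_M, dataTest.target)
            if score > best[0]:
                best = (score, depth, num)
    return best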
def runSVMSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    kernel = 'linear'
    outFile = open('svmSarinLog%s.txt' % kernel, 'a')
    print 'running svm code'
    outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
    outFile.write('test==> %d, %d \n' % (test_M.shape[0], test_M.shape[1]))

    # Baseline: linear-kernel SVC with a small penalty.
    penalty = 0.025
    with SimpleTimer('time to train', outFile):
        # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=30, random_state=42)
        # clf = LinearSVC(C=1.0)
        clf = SVC(kernel=kernel, C=penalty, degree=1)
        clf.fit(train_M, dataTrain.target)
    baseScore = clf.score(test_M, dataTest.target)
    print 'baseline score %.3f base penalty %.3f' % (baseScore, penalty)
    outFile.write('baseline score %.3f base penalty %.3f \n' % (baseScore, penalty))

    # Tune the penalty C on the holdout set, logging the train score as well
    # to watch for overfitting.
    res = []
    with SimpleTimer('time to tune penalty', outFile):
        for pen in [1, 5, 10, 15, 20, 30]:
            print 'training for penalty %.3f' % pen
            clf = SVC(kernel=kernel, C=pen, degree=1)
            # clf = LinearSVC(loss='squared_hinge', C=1.0)
            clf.fit(train_M, dataTrain.target)
            score = clf.score(hold_M, holdout.target)
            res.append((score, pen))
            trainScore = clf.score(train_M, dataTrain.target)
            outFile.write('test %.3f %.3f \n' % (pen, score))
            outFile.write('train %.3f %.3f \n' % (pen, trainScore))
    res = sorted(res, key=lambda x: x[0], reverse=True)
    print res[:5]
    bestPen = res[0][1]
    print 'best penalty is %.3f' % bestPen

    # Retrain with the best penalty and score on train and test.
    bestClf = SVC(kernel=kernel, C=bestPen, degree=1)
    bestClf.fit(train_M, dataTrain.target)
    predicted = bestClf.predict(test_M)
    trainPredict = bestClf.predict(train_M)
    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)

    # Inspect a few misclassified test examples.
    results = predicted == dataTest.target
    wrong = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in wrong[:10]:
        print dataTest.data[i], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'svm with %s kernel & penalty %.3f' % (kernel, bestPen),
                        train_M, dataTrain.target, cv=5, n_jobs=4)
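# The manual C sweep above could equally be expressed with scikit-learn's
# GridSearchCV, which cross-validates each candidate instead of scoring a
# single holdout split. A sketch of that alternative, not the code this
# module actually uses (the import is sklearn.grid_search on older versions):
from sklearn.model_selection import GridSearchCV

def tunePenaltyWithGridSearch(train_M, trainTarget, kernel='linear'):
    search = GridSearchCV(SVC(kernel=kernel), {'C': [1, 5, 10, 15, 20, 30]},
                          cv=5, n_jobs=4)
    search.fit(train_M, trainTarget)
    return search.best_estimator_, search.best_params_['C']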
def runDecisionTreeSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    print 'running decision tree'
    outFile = open('decisionTreeLog30.txt', 'a')
    outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
    outFile.write('test==> %d, %d \n' % (test_M.shape[0], test_M.shape[1]))

    # Baseline: fully grown tree.
    with SimpleTimer('time to train', outFile):
        clf = DecisionTreeClassifier().fit(train_M, dataTrain.target)
    baseScore = clf.score(test_M, dataTest.target)
    initHeight = clf.tree_.max_depth
    print 'baseline score %.3f base height %d' % (baseScore, initHeight)
    outFile.write('baseline score %.3f base height %d \n' % (baseScore, initHeight))

    # "Prune" by retraining at decreasing max_depth, scoring on the holdout set.
    res = []
    with SimpleTimer('time to prune', outFile):
        for height in range(initHeight, 2, -1):
            clf = DecisionTreeClassifier(max_depth=height).fit(train_M, dataTrain.target)
            score = clf.score(hold_M, holdout.target)
            res.append((score, height))
            outFile.write('%d %.3f \n' % (height, score))
    res = sorted(res, key=lambda x: x[0], reverse=True)
    print res[:5]
    bestDepth = res[0][1]
    print 'best depth is %d' % bestDepth
    outFile.write('best depth is %d and score is %.3f \n' % (bestDepth, res[0][0]))

    # Retrain at the best depth and score on train and test.
    bestClf = DecisionTreeClassifier(max_depth=bestDepth)
    bestClf.fit(train_M, dataTrain.target)
    predicted = bestClf.predict(test_M)
    trainPredict = bestClf.predict(train_M)

    # Compare per-class counts between the training labels and predictions.
    for label in (0, 1, 2):
        print len(filter(lambda x: x == label, dataTrain.target)), \
              len(filter(lambda x: x == label, trainPredict))

    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)

    # Inspect a few misclassified test examples.
    results = predicted == dataTest.target
    wrong = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in wrong[:10]:
        print dataTest.data[i][0], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i][0], dataTest.target[i]))

    plot_learning_curve(bestClf, 'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth),
                        train_M, dataTrain.target, cv=5, n_jobs=4)
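# A sketch of how these simulations might be driven. loadData and
# buildFeatureMatrix are hypothetical names standing in for this repo's real
# loaders, assumed to return Bunch-like objects with .data/.target and the
# corresponding feature matrices for the train/holdout/test split.
if __name__ == '__main__':
    dataTrain, dataTest, holdout = loadData()    # hypothetical loader
    train_M = buildFeatureMatrix(dataTrain)      # hypothetical featurizer
    test_M = buildFeatureMatrix(dataTest)
    hold_M = buildFeatureMatrix(holdout)
    runDecisionTreeSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M)
    runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M)
    runBoosting(dataTrain, dataTest, holdout, train_M, test_M, hold_M)
    runSVMSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M)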