def runDecisionTreeSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    """Fit a decision tree, pick a depth limit on the hold-out split, and report.

    An unrestricted tree is trained first to obtain a baseline test score and
    the depth it grows to.  Smaller ``max_depth`` values (stepping down from
    that depth by 25 while staying above 40) are then scored on the hold-out
    split; the best one is refit and evaluated on the test and training
    splits.  Progress is printed and appended to ``decisionTreeLog.txt``, up
    to ten misclassified test documents are listed, and a learning curve is
    plotted for the chosen model.

    Args:
        dataTrain: training set exposing ``.target``.
        dataTest: test set exposing ``.target`` and ``.data``.
        dataHold: hold-out set exposing ``.target``.
        train_tfidf, test_tfidf, hold_tfidf: 2-D feature matrices for the
            matching splits (presumably TF-IDF, going by the names).
    """
    print('running decision tree')
    # The context manager guarantees the log is flushed and closed even when
    # training raises; the original never closed the file.
    with open('decisionTreeLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1]))

        # NOTE(review): the source's indentation was lost, so the extent of
        # this timed block (fit only) is inferred -- confirm.
        with SimpleTimer('time to train', outFile):
            clf = DecisionTreeClassifier().fit(train_tfidf, dataTrain.target)
        baseScore = clf.score(test_tfidf, dataTest.target)
        initHeight = clf.tree_.max_depth
        baseline = 'baseline score %.3f base height %d' % (baseScore, initHeight)
        print(baseline)
        outFile.write(baseline + ' \n')

        # A baseline depth of 40 or less yields an empty range, which used to
        # crash on res[0]; fall back to the baseline depth itself (at least 1,
        # the smallest depth limit that makes sense).
        depths = list(range(initHeight, 40, -25)) or [max(initHeight, 1)]
        res = []
        with SimpleTimer('time to prune', outFile):
            for height in depths:
                clf = DecisionTreeClassifier(max_depth=height).fit(train_tfidf, dataTrain.target)
                score = clf.score(hold_tfidf, dataHold.target)
                res.append((score, height))
                outFile.write('%d %.3f \n' % (height, score))

        # Stable sort: among equal scores the deeper (earlier) candidate wins.
        res.sort(key=lambda pair: pair[0], reverse=True)
        print(res[:5])
        bestScore, bestDepth = res[0]
        print('best height is %d' % bestDepth)
        outFile.write('best depth is %d and score is %.3f \n' % (bestDepth, bestScore))

        bestClf = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf.fit(train_tfidf, dataTrain.target)
        predicted = bestClf.predict(test_tfidf)
        train_predict = bestClf.predict(train_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # Was logged as 'testing score', making the two sections of the log
        # indistinguishable.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        results = predicted == dataTest.target
        wrong = [i for i, ok in enumerate(results) if not ok]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(
        bestClf,
        'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth),
        train_tfidf, dataTrain.target, cv=5, n_jobs=4)
def runBoosting(dataTrain, dataTest, holdOut, train_tfidf, test_tfidf, hold_tfidf):
    """Train an AdaBoost ensemble of shallow decision trees and report scores.

    The ensemble is fit on the training split, scored on the test split, and
    its test/training predictions are passed to ``outputScores``.  Progress is
    printed and appended to ``boostingLog.txt``; the fraction of correct test
    predictions is printed last.

    Args:
        dataTrain: training set exposing ``.target``.
        dataTest: test set exposing ``.target``.
        holdOut: unused; kept so all simulations share one signature.
        train_tfidf, test_tfidf: 2-D feature matrices for the matching splits
            (presumably TF-IDF, going by the names).
        hold_tfidf: unused; kept so all simulations share one signature.
    """
    # Fixed rather than searched: the tryVariousHyperParams search was
    # disabled in the source because it takes a very long time to run.
    bestDepth = 4
    bestNum = 5000

    # The context manager guarantees the log is flushed and closed even when
    # training raises; the original never closed the file.
    with open('boostingLog.txt', 'a') as outFile:
        print('running boosting')
        outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1]))

        # NOTE(review): the source's indentation was lost, so the extent of
        # this timed block is inferred -- confirm.
        with SimpleTimer('time to train', outFile):
            estimator = DecisionTreeClassifier(max_depth=bestDepth)
            bestClf = AdaBoostClassifier(base_estimator=estimator, n_estimators=bestNum)
            bestClf.fit(train_tfidf, dataTrain.target)
        bestScore = bestClf.score(test_tfidf, dataTest.target)
        print('the best score %.3f' % bestScore)
        outFile.write('depth %d, num %d score %.3f \n' % (bestDepth, bestNum, bestScore))

        # Reuse the model fitted above.  The original refit all estimators a
        # second time here, doubling the run time and (with no fixed random
        # state) possibly reporting predictions from a different model than
        # the one scored above.
        predicted = bestClf.predict(test_tfidf)
        train_predict = bestClf.predict(train_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # Was logged as 'testing score', making the two sections of the log
        # indistinguishable.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        results = predicted == dataTest.target
        print(numpy.mean(results))
def runSVMSimulation(dataTrain, dataTest, holdOut, train_tfidf, test_tfidf, hold_tfidf): kernel = 'poly' penalty = 1.0 outFile = open('svmLog%s.txt' % kernel, 'a') degree = 3 outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1])) outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1])) with SimpleTimer('time to train', outFile): # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=30, random_state=42) clf = SVC(kernel=kernel, C=penalty, degree=degree) clf.fit(train_tfidf, dataTrain.target) baseScore = clf.score(test_tfidf, dataTest.target) baseIter = 5 print 'baseline score %.3f penalty %d' % (baseScore, baseIter) outFile.write('baseline score %.3f base height %d \n' % (baseScore, baseIter)) res = [] with SimpleTimer('number of iter', outFile): for pen in [1, 2, 3, 4, 5]: print 'training for peanalty %f' % pen # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=itr, random_state=42) clf = SVC(kernel=kernel, C=1.0, degree=pen) clf.fit(train_tfidf, dataTrain.target) score = clf.score(hold_tfidf, holdOut.target) res.append((score, pen)) outFile.write('%.3f %.3f \n' % (pen, score)) res = sorted(res, key=lambda x: x[0], reverse=True) print res[:5] bestPen = res[0][1] print('best number of iter is %.3f' % bestPen) bestClf = SVC(kernel=kernel, C=1.0, degree=bestPen) bestClf.fit(train_tfidf, dataTrain.target) train_predict = bestClf.predict(train_tfidf) predicted = bestClf.predict(test_tfidf) print 'testing score' outFile.write('testing score') outputScores(dataTest.target, predicted, outFile) print 'training score' outFile.write('testing score') outputScores(dataTrain.target, train_predict, outFile) results = predicted == dataTest.target res = [] for i in range(len(results)): if not results[i]: res.append(i) print 'classifier got these wrong:' for i in res[:10]: print dataTest.data[i], dataTest.target[i] outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i])) 
plot_learning_curve(bestClf, 'svm with %s kernel & degree %.3f' % (kernel, bestPen), train_tfidf, dataTrain.target, cv=5, n_jobs=4) '''
def runDecisionTreeSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    """Fit a decision tree, pick a depth limit on the hold-out split, and report.

    An unrestricted tree is trained first to obtain a baseline test score and
    the depth it grows to.  Smaller ``max_depth`` values (stepping down from
    that depth by 25 while staying above 40) are then scored on the hold-out
    split; the best one is refit and evaluated on the test and training
    splits.  Progress is printed and appended to ``decisionTreeLog.txt``, up
    to ten misclassified test documents are listed, and a learning curve is
    plotted for the chosen model.

    Args:
        dataTrain: training set exposing ``.target``.
        dataTest: test set exposing ``.target`` and ``.data``.
        dataHold: hold-out set exposing ``.target``.
        train_tfidf, test_tfidf, hold_tfidf: 2-D feature matrices for the
            matching splits (presumably TF-IDF, going by the names).
    """
    print('running decision tree')
    # The context manager guarantees the log is flushed and closed even when
    # training raises; the original never closed the file.
    with open('decisionTreeLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1]))

        # NOTE(review): the source's indentation was lost, so the extent of
        # this timed block (fit only) is inferred -- confirm.
        with SimpleTimer('time to train', outFile):
            clf = DecisionTreeClassifier().fit(train_tfidf, dataTrain.target)
        baseScore = clf.score(test_tfidf, dataTest.target)
        initHeight = clf.tree_.max_depth
        baseline = 'baseline score %.3f base height %d' % (baseScore, initHeight)
        print(baseline)
        outFile.write(baseline + ' \n')

        # A baseline depth of 40 or less yields an empty range, which used to
        # crash on res[0]; fall back to the baseline depth itself (at least 1,
        # the smallest depth limit that makes sense).
        depths = list(range(initHeight, 40, -25)) or [max(initHeight, 1)]
        res = []
        with SimpleTimer('time to prune', outFile):
            for height in depths:
                clf = DecisionTreeClassifier(max_depth=height).fit(train_tfidf, dataTrain.target)
                score = clf.score(hold_tfidf, dataHold.target)
                res.append((score, height))
                outFile.write('%d %.3f \n' % (height, score))

        # Stable sort: among equal scores the deeper (earlier) candidate wins.
        res.sort(key=lambda pair: pair[0], reverse=True)
        print(res[:5])
        bestScore, bestDepth = res[0]
        print('best height is %d' % bestDepth)
        outFile.write('best depth is %d and score is %.3f \n' % (bestDepth, bestScore))

        bestClf = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf.fit(train_tfidf, dataTrain.target)
        predicted = bestClf.predict(test_tfidf)
        train_predict = bestClf.predict(train_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # Was logged as 'testing score', making the two sections of the log
        # indistinguishable.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        results = predicted == dataTest.target
        wrong = [i for i, ok in enumerate(results) if not ok]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(
        bestClf,
        'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth),
        train_tfidf, dataTrain.target, cv=5, n_jobs=4)
def runKNNSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    """Fit a distance-weighted k-NN classifier, tune k on the hold-out split, and report.

    A baseline classifier with the library-default ``n_neighbors`` is trained
    and scored on the test split.  Values of k from 2 up to 799 (capped at
    the number of training rows) are then scored on the hold-out split; the
    best one is refit and evaluated on the test and training splits.
    Progress is printed and appended to ``knnLog.txt``, up to ten
    misclassified test documents are listed, and a learning curve is plotted
    for the chosen model.

    Args:
        dataTrain: training set exposing ``.target``.
        dataTest: test set exposing ``.target`` and ``.data``.
        dataHold: hold-out set exposing ``.target``.
        train_tfidf, test_tfidf, hold_tfidf: 2-D feature matrices for the
            matching splits (presumably TF-IDF, going by the names).
    """
    # The context manager guarantees the log is flushed and closed even when
    # training raises; the original never closed the file.
    with open('knnLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1]))

        # NOTE(review): the source's indentation was lost, so the extent of
        # this timed block (fit only) is inferred -- confirm.
        with SimpleTimer('time to train', outFile):
            clf = KNeighborsClassifier(weights='distance').fit(train_tfidf, dataTrain.target)
        baseScore = clf.score(test_tfidf, dataTest.target)
        baseNeighbors = clf.get_params(True)['n_neighbors']
        # The log line used to say 'base height' (copied from the decision
        # tree run); it now matches what is printed.
        baseline = 'baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors)
        print(baseline)
        outFile.write(baseline + ' \n')

        # Querying more neighbours than there are training rows raises a
        # ValueError in scikit-learn, so cap the sweep at the training size.
        maxNeighbors = min(799, train_tfidf.shape[0])
        res = []
        with SimpleTimer('time to fine tune number of neighbors', outFile):
            for neighbors in range(2, maxNeighbors + 1):
                clf = KNeighborsClassifier(n_neighbors=neighbors, weights='distance')
                clf.fit(train_tfidf, dataTrain.target)
                score = clf.score(hold_tfidf, dataHold.target)
                res.append((score, neighbors))
                outFile.write('%d %.3f \n' % (neighbors, score))

        # Stable sort: among equal scores the smaller (earlier) k wins.
        res.sort(key=lambda pair: pair[0], reverse=True)
        print(res[:5])
        bestScore, bestNeighbors = res[0]
        print('best number of neighbors is %d' % bestNeighbors)
        outFile.write('best number of neighbors is %d and score is %.3f\n' % (bestNeighbors, bestScore))

        bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
        bestClf.fit(train_tfidf, dataTrain.target)
        train_predict = bestClf.predict(train_tfidf)
        predicted = bestClf.predict(test_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # Was logged as 'testing score', making the two sections of the log
        # indistinguishable.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        results = predicted == dataTest.target
        print(numpy.mean(results))
        wrong = [i for i, ok in enumerate(results) if not ok]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    # Disabled exploratory snippet carried over from the original:
    # train_sizes, train_scores, valid_scores = learning_curve(
    #     DecisionTreeClassifier(), train_tfidf, dataTrain.target,
    #     train_sizes=[50, 80, 110], cv=5)
    # print train_sizes
    # print train_scores
    # print valid_scores
    plot_learning_curve(
        bestClf,
        'knn with %d neighbors' % bestNeighbors,
        train_tfidf, dataTrain.target, cv=5, n_jobs=4)