Code Example #1
# Assumes the project-local helpers SimpleTimer, outputScores, and
# plot_learning_curve are defined elsewhere (a sketch of the first two
# follows after this example).
from sklearn.tree import DecisionTreeClassifier

def runDecisionTreeSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    print('running decision tree')
    outFile = open('decisionTreeLog.txt', 'a')

    outFile.write('train==> %d, %d \n'%(train_tfidf.shape[0],train_tfidf.shape[1]))
    outFile.write('test==>  %d, %d \n'%(test_tfidf.shape[0],test_tfidf.shape[1]))
    with SimpleTimer('time to train', outFile):
        clf = DecisionTreeClassifier().fit(train_tfidf, dataTrain.target)
    
    baseScore = clf.score(test_tfidf, dataTest.target)
    initHeight = clf.tree_.max_depth
    print('baseline score %.3f base height %d' % (baseScore, initHeight))
    outFile.write('baseline score %.3f base height %d \n' % (baseScore, initHeight))
    
    
    res = []
    with SimpleTimer('time to prune', outFile):
        # step down from the unpruned depth toward 40 in increments of 25
        for height in range(initHeight, 40, -25):
            # print('training for height %d' % height)
            clf = DecisionTreeClassifier(max_depth=height).fit(train_tfidf, dataTrain.target)
            score = clf.score(hold_tfidf, dataHold.target)
            res.append((score, height))
            outFile.write('%d %.3f \n' % (height, score))
    res = sorted(res, key=lambda x:x[0], reverse=True)
    print(res[:5])
    
    bestDepth = res[0][1]
    print('best height is %d' % bestDepth)
    outFile.write('best depth is %d  and score is %.3f \n' % (bestDepth, res[0][0]))
        
    bestClf = DecisionTreeClassifier(max_depth=bestDepth)
    bestClf.fit(train_tfidf, dataTrain.target)
    
    predicted = bestClf.predict(test_tfidf)
    
    train_predict = bestClf.predict(train_tfidf)
    
    print('testing score')
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print('training score')
    outFile.write('training score \n')
    outputScores(dataTrain.target, train_predict, outFile)
    
    results = predicted == dataTest.target
    # collect indices of the test documents the classifier got wrong
    wrong = []
    for i in range(len(results)):
        if not results[i]:
            wrong.append(i)
    print('classifier got these wrong:')
    for i in wrong[:10]:
        print(dataTest.data[i], dataTest.target[i])
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))
    plot_learning_curve(bestClf, 'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth), train_tfidf, dataTrain.target, cv=5, n_jobs=4)
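These examples lean on a few project-local helpers that the snippets do not define: SimpleTimer, outputScores, and plot_learning_curve (the last is presumably the learning-curve plotting helper from the scikit-learn documentation's model-selection example). A minimal sketch of what the first two might look like, inferred purely from the call sites; the names and signatures match the calls, but the bodies are assumptions:

import time
from sklearn import metrics

class SimpleTimer(object):
    """Hypothetical context manager that logs the wall-clock time of a block."""
    def __init__(self, label, outFile):
        self.label = label
        self.outFile = outFile

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.outFile.write('%s: %.2f s \n' % (self.label, time.time() - self.start))

def outputScores(y_true, y_pred, outFile):
    """Hypothetical scorer: logs accuracy and a per-class report."""
    outFile.write('accuracy %.3f \n' % metrics.accuracy_score(y_true, y_pred))
    outFile.write(metrics.classification_report(y_true, y_pred))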
Code Example #2
# Assumes the SimpleTimer and outputScores helpers sketched above are available.
import numpy
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

def runBoosting(dataTrain, dataTest, holdOut, train_tfidf, test_tfidf,
                hold_tfidf):
    outFile = open('boostingLog.txt', 'a')
    print('running boosting')
    outFile.write('train==> %d, %d \n' %
                  (train_tfidf.shape[0], train_tfidf.shape[1]))
    outFile.write('test==>  %d, %d \n' %
                  (test_tfidf.shape[0], test_tfidf.shape[1]))
    # The full hyper-parameter sweep takes a very long time to run, so the
    # best values it previously found are hard-coded here.
    # score, bestDepth, num = tryVariousHyperParams(dataTrain, dataTest, train_tfidf, test_tfidf)
    bestDepth = 4
    bestNum = 5000
    with SimpleTimer('time to train', outFile):
        estimator = DecisionTreeClassifier(max_depth=bestDepth)
        # note: base_estimator was renamed to estimator in scikit-learn 1.2
        bestClf = AdaBoostClassifier(base_estimator=estimator,
                                     n_estimators=bestNum)
        bestClf.fit(train_tfidf, dataTrain.target)

    bestScore = bestClf.score(test_tfidf, dataTest.target)
    print('the best score %.3f' % bestScore)
    outFile.write('depth %d, num %d score %.3f \n' %
                  (bestDepth, bestNum, bestScore))
    # the classifier is already fitted above, so predict directly
    predicted = bestClf.predict(test_tfidf)
    train_predict = bestClf.predict(train_tfidf)

    print('testing score')
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print('training score')
    outFile.write('training score \n')
    outputScores(dataTrain.target, train_predict, outFile)

    results = predicted == dataTest.target
    print(numpy.mean(results))  # overall test accuracy
    res = []
    for i in range(len(results)):
        if not results[i]:
            res.append(i)
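The commented-out tryVariousHyperParams call suggests bestDepth and bestNum came from a brute-force sweep. On current scikit-learn the same search is more idiomatically written with GridSearchCV; a minimal sketch, where the helper name findBoostingParams and the grid values are illustrative assumptions, not the original sweep:

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

def findBoostingParams(train_tfidf, targets):
    # cross-validated grid search over tree depth and ensemble size;
    # use base_estimator/base_estimator__max_depth on scikit-learn < 1.2
    grid = GridSearchCV(
        AdaBoostClassifier(estimator=DecisionTreeClassifier()),
        param_grid={'estimator__max_depth': [2, 4, 8],
                    'n_estimators': [100, 500, 1000]},
        cv=3, n_jobs=-1)
    grid.fit(train_tfidf, targets)
    return grid.best_score_, grid.best_params_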
Code Example #3
# Assumes the SimpleTimer, outputScores, and plot_learning_curve helpers
# sketched above are available.
from sklearn.svm import SVC

def runSVMSimulation(dataTrain, dataTest, holdOut, train_tfidf, test_tfidf,
                     hold_tfidf):
    kernel = 'poly'
    penalty = 1.0
    degree = 3
    outFile = open('svmLog%s.txt' % kernel, 'a')
    outFile.write('train==> %d, %d \n' %
                  (train_tfidf.shape[0], train_tfidf.shape[1]))
    outFile.write('test==>  %d, %d \n' %
                  (test_tfidf.shape[0], test_tfidf.shape[1]))

    with SimpleTimer('time to train', outFile):
        # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=30, random_state=42)
        clf = SVC(kernel=kernel, C=penalty, degree=degree)
        clf.fit(train_tfidf, dataTrain.target)

    baseScore = clf.score(test_tfidf, dataTest.target)
    print('baseline score %.3f degree %d' % (baseScore, degree))
    outFile.write('baseline score %.3f degree %d \n' %
                  (baseScore, degree))

    res = []
    # despite the original variable names, this sweep tunes the polynomial
    # degree of the kernel, not the penalty C
    with SimpleTimer('time to tune degree', outFile):
        for deg in [1, 2, 3, 4, 5]:
            print('training for degree %d' % deg)
            # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=itr, random_state=42)
            clf = SVC(kernel=kernel, C=penalty, degree=deg)
            clf.fit(train_tfidf, dataTrain.target)
            score = clf.score(hold_tfidf, holdOut.target)
            res.append((score, deg))
            outFile.write('%d %.3f \n' % (deg, score))

    res = sorted(res, key=lambda x: x[0], reverse=True)
    print(res[:5])
    bestDegree = res[0][1]
    print('best degree is %d' % bestDegree)
    bestClf = SVC(kernel=kernel, C=penalty, degree=bestDegree)
    bestClf.fit(train_tfidf, dataTrain.target)

    train_predict = bestClf.predict(train_tfidf)
    predicted = bestClf.predict(test_tfidf)

    print('testing score')
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print('training score')
    outFile.write('training score \n')
    outputScores(dataTrain.target, train_predict, outFile)

    results = predicted == dataTest.target
    res = []
    for i in range(len(results)):
        if not results[i]:
            res.append(i)
    print('classifier got these wrong:')
    for i in res[:10]:
        print(dataTest.data[i], dataTest.target[i])
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf,
                        'svm with %s kernel & degree %d' % (kernel, bestDegree),
                        train_tfidf,
                        dataTrain.target,
                        cv=5,
                        n_jobs=4)
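The hand-rolled degree sweep above scores each candidate on a single hold-out split; scikit-learn's validation_curve runs the same sweep with cross-validation instead. A minimal sketch, where the function name sweepPolyDegree is an assumption:

import numpy
from sklearn.model_selection import validation_curve
from sklearn.svm import SVC

def sweepPolyDegree(train_tfidf, targets, degrees=(1, 2, 3, 4, 5)):
    # mean cross-validated score for each polynomial degree
    train_scores, valid_scores = validation_curve(
        SVC(kernel='poly', C=1.0), train_tfidf, targets,
        param_name='degree', param_range=list(degrees), cv=5, n_jobs=-1)
    mean_valid = valid_scores.mean(axis=1)
    best = int(numpy.argmax(mean_valid))
    return degrees[best], mean_valid[best]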
Code Example #4
# Assumes the SimpleTimer, outputScores, and plot_learning_curve helpers
# sketched above are available.
import numpy
from sklearn.neighbors import KNeighborsClassifier

def runKNNSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    outFile = open('knnLog.txt', 'a')
    
    outFile.write('train==> %d, %d \n'%(train_tfidf.shape[0],train_tfidf.shape[1]))
    outFile.write('test==>  %d, %d \n'%(test_tfidf.shape[0],test_tfidf.shape[1]))
    with SimpleTimer('time to train', outFile):
        clf = KNeighborsClassifier(weights='distance').fit(train_tfidf, dataTrain.target)
    
    baseScore = clf.score(test_tfidf, dataTest.target)
    baseParams = clf.get_params(True)
    baseNeighbors = baseParams['n_neighbors']
    print('baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors))
    outFile.write('baseline score %.3f base n_neighbors %d \n' % (baseScore, baseNeighbors))
    
    res = []
    with SimpleTimer('time to fine tune number of neighbors', outFile):
        # sweeping every neighbor count from 2 to 799 trains ~800 models and is slow
        for neighbors in range(2, 800):
            # print('training for neighbors %d' % neighbors)
            clf = KNeighborsClassifier(n_neighbors=neighbors, weights='distance').fit(train_tfidf, dataTrain.target)
            score = clf.score(hold_tfidf, dataHold.target)
            res.append((score, neighbors))
            outFile.write('%d %.3f \n' % (neighbors, score))
    
    res = sorted(res, key=lambda x:x[0], reverse=True)
    print(res[:5])
    bestNeighbors = res[0][1]
    print('best number of neighbors is %d' % bestNeighbors)
    outFile.write('best number of neighbors is %d  and score is %.3f\n' % (bestNeighbors, res[0][0]))
    bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
    bestClf.fit(train_tfidf, dataTrain.target)
    
    train_predict = bestClf.predict(train_tfidf)
    
    predicted = bestClf.predict(test_tfidf)
    print('testing score')
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print('training score')
    outFile.write('training score \n')
    outputScores(dataTrain.target, train_predict, outFile)
    
    results = predicted == dataTest.target
    print(numpy.mean(results))  # overall test accuracy
    res = []
    for i in range(len(results)):
        if not results[i]:
            res.append(i)
            
    
    print('classifier got these wrong:')
    for i in res[:10]:
        print(dataTest.data[i], dataTest.target[i])
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))
    # earlier learning-curve experiment, kept for reference:
    # train_sizes, train_scores, valid_scores = learning_curve(
    #     DecisionTreeClassifier(), train_tfidf, dataTrain.target,
    #     train_sizes=[50, 80, 110], cv=5)
    # print(train_sizes)
    # print(train_scores)
    # print(valid_scores)
    
    plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors, train_tfidf, dataTrain.target, cv=5, n_jobs=4)
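All four functions expect a train/hold-out/test split that has already been vectorized with tf-idf. A minimal driver, assuming the 20 newsgroups corpus; the dataset choice and the reuse of the test split as the hold-out are illustrative shortcuts, not the original setup:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataTrain = fetch_20newsgroups(subset='train')
dataTest = fetch_20newsgroups(subset='test')

vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(dataTrain.data)
test_tfidf = vectorizer.transform(dataTest.data)

# a real run would carve a separate hold-out split out of the training data
runKNNSimulation(dataTrain, dataTest, dataTest, train_tfidf, test_tfidf, test_tfidf)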