Esempio n. 1
0
def crossValidation():
    """Cross-validate a random forest on the module-level ``dataset``.

    The nine feature columns are rescaled to the range (0, 7) with a
    ``MinMaxScaler`` and a ``RandomForestClassifier`` is evaluated with a
    25-fold ``StratifiedKFold``.  After the folds a learning curve is drawn
    and saved to ``images/RF-LearningCurve.png``.

    Returns
    -------
    tuple
        ``(y_true, y_pred, precision, recall, fscore, accuracy)``: the first
        two are ``pd.Series`` with the concatenated true and predicted labels
        of every test fold, the remaining four are the per-fold means of the
        weighted precision, recall, F-score and of the accuracy.
    """
    feature_columns = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 7))
    # NOTE(review): the scaler is fit on the full data set before splitting,
    # so every test fold influences the scaling of its training folds.
    X = min_max_scaler.fit_transform(np.array(dataset[feature_columns]))
    Y = np.array(dataset["class"])

    nfold = 25
    precision = []
    recall = []
    fscore = []
    # Per-fold accuracies.  The list was never created inside the function,
    # so the call either failed with a NameError or kept growing a list
    # shared between calls.
    a_score = []
    clf = RandomForestClassifier()
    skf = model_selection.StratifiedKFold(n_splits=nfold)
    y_test_total = []
    y_pred_total = []

    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        y_test_total.extend(y_test.tolist())
        model = clf.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_total.extend(y_pred.tolist())
        p, r, f, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        a_score.append(accuracy_score(y_test, y_pred))
        precision.append(p)
        recall.append(r)
        fscore.append(f)

    plot_learning_curve(clf, "Learning Curves", X, Y, ylim=None, cv=skf, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5))
    plt.savefig('images/RF-LearningCurve.png')
    return (pd.Series(y_test_total), pd.Series(y_pred_total),
            np.mean(precision), np.mean(recall), np.mean(fscore), np.mean(a_score))
Esempio n. 2
0
def crossValidate(document_term_matrix,labels,nfold=2):
    """Cross-validate a ``LinearSVC`` on a document-term matrix.

    Parameters
    ----------
    document_term_matrix
        Feature matrix; it is indexed with the index arrays produced by
        ``StratifiedKFold.split``.
    labels
        Label array indexed the same way; its slices must offer ``tolist()``.
    nfold : int
        Number of stratified folds (default 2).

    Returns
    -------
    tuple
        ``(y_true, y_pred, precision, recall, fscore, accuracy)``: two
        ``pd.Series`` with the concatenated true/predicted labels of all test
        folds, followed by the per-fold means of the weighted precision,
        recall, F-score and of the accuracy.

    The accuracy of every fold is printed, and a learning curve is saved to
    ``lc.png``.
    """
    precision = []
    recall = []
    fscore = []
    # Per-fold accuracies; the list was never created inside the function.
    a_score = []
    clf = LinearSVC(loss='squared_hinge', tol=1e-4, C=1.0, max_iter=1000)
    skf = StratifiedKFold(n_splits=nfold)
    y_test_total = []
    y_pred_total = []

    for train_index, test_index in skf.split(document_term_matrix, labels):
        X_train, X_test = document_term_matrix[train_index], document_term_matrix[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        y_test_total.extend(y_test.tolist())
        model = clf.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_total.extend(y_pred.tolist())
        p, r, f, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        # Compute the accuracy once and reuse it for printing and averaging.
        fold_accuracy = accuracy_score(y_test, y_pred)
        print(fold_accuracy)
        a_score.append(fold_accuracy)
        precision.append(p)
        recall.append(r)
        fscore.append(f)

    plot_learning_curve(clf, "Learning Curves", document_term_matrix, labels, ylim=None, cv=skf, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5))

    plt.savefig('lc.png')

    return (pd.Series(y_test_total), pd.Series(y_pred_total),
            np.mean(precision), np.mean(recall), np.mean(fscore), np.mean(a_score))
Esempio n. 3
0
def runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    """Train, tune and evaluate a distance-weighted k-NN classifier.

    A baseline ``KNeighborsClassifier`` is fit on ``train_M`` and scored on
    ``test_M``.  The number of neighbors is then tuned on the hold-out set
    over ``range(2, 10 * baseline_n_neighbors)``, the best setting is refit
    and scored on both the test and the training data, and up to ten
    misclassified test samples are reported.  Progress is printed and
    appended to ``knnLog25.txt``.  Learning curves are drawn for the baseline
    and for the tuned classifier.

    Parameters
    ----------
    dataTrain, dataTest, holdout
        Objects exposing the labels as ``.target``; ``dataTest`` must also
        expose the raw samples as ``.data``.
    train_M, test_M, hold_M
        Feature matrices matching the three data sets above.
    """
    print('running mashable knn simulation')
    # The context manager guarantees the log is flushed and closed even when
    # a training or scoring step raises.
    with open('knnLog25.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
        outFile.write('test==>  %d, %d \n' % (test_M.shape[0], test_M.shape[1]))
        with SimpleTimer('time to train', outFile):
            clf = KNeighborsClassifier(weights='distance', ).fit(train_M, dataTrain.target)

        baseNeighbors = clf.get_params(True)['n_neighbors']
        # The title used to be passed with its '%d' placeholder unformatted.
        plot_learning_curve(clf, 'knn with %d neighbors' % baseNeighbors, train_M, dataTrain.target, cv=5, n_jobs=4)

        baseScore = clf.score(test_M, dataTest.target)
        print('baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors))
        outFile.write('baseline score %.3f base n_neighbors %d \n' % (baseScore, baseNeighbors))

        res = []
        with SimpleTimer('time to fine tune number of neighbors', outFile):
            for neighbors in range(2, baseNeighbors * 10):
                clf = KNeighborsClassifier(n_neighbors=neighbors, weights='distance').fit(train_M, dataTrain.target)
                score = clf.score(hold_M, holdout.target)
                res.append((score, neighbors))
                outFile.write('%d %.3f \n' % (neighbors, score))
        # Stable sort: among equal scores the smaller neighbor count wins.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])
        bestNeighbors = res[0][1]
        print('best number of neighbors is %d' % bestNeighbors)
        outFile.write('best number of neighbors is %d  and score is %.3f\n' % (bestNeighbors, res[0][0]))

        bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
        bestClf.fit(train_M, dataTrain.target)

        predicted = bestClf.predict(test_M)
        trainPredict = bestClf.predict(train_M)
        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # This section used to be labelled 'testing score' in the log.
        outFile.write('training score')
        outputScores(dataTrain.target, trainPredict, outFile)

        results = predicted == dataTest.target
        print(numpy.mean(results))
        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors, train_M, dataTrain.target, cv=5, n_jobs=4)
Esempio n. 4
0
def algomain(df):
    """Standardize question features, plot a LinearSVC learning curve and print its coefficients.

    ``df`` is modified in place: two interaction flags (``WhAndQ1``,
    ``WhAndQ0``) and seven ``*_scaled`` columns are added.  An L1-penalised
    ``LinearSVC`` is then evaluated with a 10-fold learning curve, fit on the
    complete data and its coefficients are printed next to the feature names.

    NOTE(review): ``avgTI_scaled`` and ``totalTI_scaled`` are computed, but
    the training frame uses the raw ``totalTI`` / ``avgTI`` columns -- kept
    as in the original; confirm whether that is intended.
    """
    scaler = preprocessing.StandardScaler()

    # Title starts with Wh/Who AND ends with a question mark ...
    df['WhAndQ1'] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    # ... or has neither of the two properties.
    df['WhAndQ0'] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardization.  Each column is passed as a one-column 2-D frame
    # because scikit-learn scalers reject 1-D input; the second positional
    # argument of fit_transform is ``y`` and is ignored by the scaler, so the
    # fitted scaler that used to be passed there has been dropped.
    scaled_columns = (
        ('popTagsNum', 'popTagsNum_scaled'),
        ('liNum', 'liNum_scaled'),
        ('codeFragNum', 'codeFragNum_scaled'),
        ('avgTI', 'avgTI_scaled'),
        ('totalTI', 'totalTI_scaled'),
        ('titleLength', 'title_scaled'),
        ('bodyLength', 'body_scaled'),
    )
    for source, target in scaled_columns:
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    train_df = df[[
        'class', 'codeFragNum_scaled', 'liNum_scaled', 'totalTI', 'avgTI',
        'popTagsNum_scaled', 'startWithWh', 'endWithQ', 'WhAndQ1', 'WhAndQ0',
        'isweekend', 'cntQ', 'cntA', 'body_scaled', 'title_scaled'
    ]]

    # ``.values`` replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the label, the remaining columns are the features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    estm = LinearSVC(C=0.1, penalty='l1', dual=False)

    plot_learning_curve(estm,
                        'LinearSVC(C=0.1, penalty=l1)',
                        tX,
                        ty,
                        ylim=(0.5, 1.0),
                        cv=10,
                        train_sizes=np.linspace(.1, 1, 10))

    estm.fit(tX, ty)
    print(pd.DataFrame({
        'columns': list(train_df.columns[1:]),
        'coef': list(estm.coef_.T)
    }))
def runDecisionTreeSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    """Train a decision tree, pick a depth limit on the hold-out set and evaluate it.

    An unrestricted ``DecisionTreeClassifier`` is fit first.  Depth limits
    from its depth downwards in steps of 25 (while above 40) are scored on
    the hold-out set, the best one is refit and scored on the test and the
    training data, and up to ten misclassified test samples are reported.
    Progress is printed and appended to ``decisionTreeLog.txt``; a learning
    curve of the final tree is drawn at the end.

    Parameters
    ----------
    dataTrain, dataTest, dataHold
        Objects exposing the labels as ``.target``; ``dataTest`` must also
        expose the raw samples as ``.data``.
    train_tfidf, test_tfidf, hold_tfidf
        Feature matrices matching the three data sets above.
    """
    print('running decision tree')
    # The context manager guarantees the log is closed even on errors.
    with open('decisionTreeLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==>  %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1]))
        with SimpleTimer('time to train', outFile):
            clf = DecisionTreeClassifier().fit(train_tfidf, dataTrain.target)

        baseScore = clf.score(test_tfidf, dataTest.target)
        initHeight = clf.tree_.max_depth
        print('baseline score %.3f base height %d' % (baseScore, initHeight))
        outFile.write('baseline score %.3f base height %d \n' % (baseScore, initHeight))

        # The range is empty when the unrestricted tree is no deeper than 40,
        # which used to end in an IndexError on ``res[0]``; fall back to the
        # unrestricted depth (at least 1, the smallest valid max_depth).
        heights = list(range(initHeight, 40, -25)) or [max(initHeight, 1)]

        res = []
        with SimpleTimer('time to prune', outFile):
            for height in heights:
                clf = DecisionTreeClassifier(max_depth=height).fit(train_tfidf, dataTrain.target)
                score = clf.score(hold_tfidf, dataHold.target)
                res.append((score, height))
                outFile.write('%d %.3f \n' % (height, score))
        # Stable sort: among equal scores the deeper (earlier tried) tree wins.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])

        bestDepth = res[0][1]
        print('best height is %d' % bestDepth)
        outFile.write('best depth is %d  and score is %.3f \n' % (bestDepth, res[0][0]))

        bestClf = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf.fit(train_tfidf, dataTrain.target)

        predicted = bestClf.predict(test_tfidf)
        train_predict = bestClf.predict(train_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # This section used to be labelled 'testing score' in the log.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        results = predicted == dataTest.target
        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth), train_tfidf, dataTrain.target, cv=5, n_jobs=4)
Esempio n. 6
0
def algomain(df):
    """Standardize question features, plot a LinearSVC learning curve and print its coefficients.

    ``df`` is modified in place: two interaction flags (``WhAndQ1``,
    ``WhAndQ0``) and seven ``*_scaled`` columns are added.  An L1-penalised
    ``LinearSVC`` is then evaluated with a 10-fold learning curve, fit on the
    complete data and its coefficients are printed next to the feature names.

    NOTE(review): ``avgTI_scaled`` and ``totalTI_scaled`` are computed, but
    the training frame uses the raw ``totalTI`` / ``avgTI`` columns -- kept
    as in the original; confirm whether that is intended.
    """
    scaler = preprocessing.StandardScaler()

    # Title starts with Wh/Who AND ends with a question mark ...
    df['WhAndQ1'] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    # ... or has neither of the two properties.
    df['WhAndQ0'] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardization.  Each column is passed as a one-column 2-D frame
    # because scikit-learn scalers reject 1-D input; the second positional
    # argument of fit_transform is ``y`` and is ignored by the scaler, so the
    # fitted scaler that used to be passed there has been dropped.
    for source, target in (('popTagsNum', 'popTagsNum_scaled'),
                           ('liNum', 'liNum_scaled'),
                           ('codeFragNum', 'codeFragNum_scaled'),
                           ('avgTI', 'avgTI_scaled'),
                           ('totalTI', 'totalTI_scaled'),
                           ('titleLength', 'title_scaled'),
                           ('bodyLength', 'body_scaled')):
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    train_df = df[['class',
                   'codeFragNum_scaled', 'liNum_scaled',
                   'totalTI', 'avgTI',
                   'popTagsNum_scaled',
                   'startWithWh', 'endWithQ', 'WhAndQ1', 'WhAndQ0',  'isweekend',
                   'cntQ', 'cntA',
                   'body_scaled', 'title_scaled']]

    # ``.values`` replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the label, the remaining columns are the features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    estm = LinearSVC(C=0.1, penalty='l1', dual=False)

    plot_learning_curve(estm, 'LinearSVC(C=0.1, penalty=l1)',
                        tX, ty, ylim=(0.5, 1.0),
                        cv=10, train_sizes=np.linspace(.1, 1, 10))

    estm.fit(tX, ty)
    print(pd.DataFrame({'columns': list(train_df.columns[1:]),
                        'coef': list(estm.coef_.T)}))
    def make_plots(self,roc=True,lrn_crv=True,prec_rec=True,cnf_mtr=True):
        """Draw the diagnostic plots that are switched on.

        Each flag enables one plot.  The plots are produced in a fixed order:
        ROC, precision/recall, confusion matrix and finally the learning
        curve, all based on ``self.data`` and ``self.clf``.
        """
        # (enabled?, drawing callback) pairs, listed in drawing order.
        plot_steps = (
            (roc, lambda: plot_roc(self.data, self.clf)),
            (prec_rec, lambda: plot_precision_recall(self.data, self.clf)),
            (cnf_mtr, lambda: local_plot_confusion_matrix(self.data, self.clf)),
            (lrn_crv, lambda: plot_learning_curve(self.clf, self.title,
                                                  self.data.x_train,
                                                  np.ravel(self.data.y_train))),
        )
        for enabled, draw in plot_steps:
            if enabled:
                draw()
def ADA_Learning_Curves(X, Y, datasource, n_estimators_value):
    """Plot and show the learning curve of an AdaBoost classifier.

    ``datasource`` is appended to the plot title and ``n_estimators_value``
    is the ``n_estimators`` setting of the classifier.  Scores come from a
    10-iteration shuffle split that holds out 20% of the data; the seed 626
    is used for both the splitter and the classifier.
    """
    seed = 626
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
    booster = AdaBoostClassifier(n_estimators=n_estimators_value,
                                 random_state=seed)
    chart = plot_learning_curve(booster,
                                "ADA Learning Curves " + datasource,
                                X,
                                Y,
                                ylim=(0.0, 1.05),
                                cv=splitter)
    chart.show()
def MLP_Learning_Curves(X, Y, datasource, hidden_layer_sizes_value):
    """Plot and show the learning curve of an MLP classifier.

    ``datasource`` is appended to the plot title and
    ``hidden_layer_sizes_value`` is the ``hidden_layer_sizes`` setting of the
    network.  Scores come from a 10-iteration shuffle split that holds out
    20% of the data; the seed 626 is used for both the splitter and the
    classifier.
    """
    seed = 626
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
    network = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes_value,
                            random_state=seed)
    chart = plot_learning_curve(network,
                                "MLP Learning Curves on " + datasource,
                                X,
                                Y,
                                ylim=(0.0, 1.05),
                                cv=splitter)
    chart.show()
Esempio n. 10
0
def algomain(df):
    """Standardize question features, plot an SGD (log-loss, L1) learning curve and print its coefficients.

    ``df`` is modified in place: five ``*_scaled`` columns are added.  An
    ``SGDClassifier`` with logistic loss and L1 penalty is evaluated with a
    10-fold learning curve, fit on the complete data and its coefficients
    are printed next to the feature names.
    """
    scaler = preprocessing.StandardScaler()

    # Standardization.  Each column is passed as a one-column 2-D frame
    # because scikit-learn scalers reject 1-D input; the second positional
    # argument of fit_transform is ``y`` and is ignored by the scaler, so the
    # fitted scaler that used to be passed there has been dropped.
    scaled_columns = (
        ('popTagsNum', 'popTagsNum_scaled'),
        ('liNum', 'liNum_scaled'),
        ('codeFragNum', 'codeFragNum_scaled'),
        ('bodyLength', 'bodyLen_scaled'),
        ('titleLength', 'titleLen_scaled'),
    )
    for source, target in scaled_columns:
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    train_df = df[[
        'class', 'codeFragNum_scaled', 'liNum_scaled', 'popTagsNum_scaled',
        'startWithWh', 'endWithQ', 'bodyLen_scaled', 'titleLen_scaled'
    ]]
    # ``.values`` replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values

    # Column 0 is the label, the remaining columns are the features.
    tX = train_np[:, 1:]
    ty = train_np[:, 0]

    estm = SGDClassifier(loss='log', penalty='l1', alpha=0.015)
    plot_learning_curve(estm,
                        "LogisticRegression(L1), cv=10-fold",
                        tX,
                        ty,
                        ylim=(0.5, 1.0),
                        cv=10,
                        train_sizes=np.linspace(.1, 1, 10))

    estm.fit(tX, ty)
    print(pd.DataFrame({
        'columns': list(train_df.columns[1:]),
        'coef': list(estm.coef_.T)
    }))
Esempio n. 11
0
def experiment(svc, X_train, y_train, X_test, y_test, tag):
    """Run one classification experiment and save its plots.

    The learning curve of ``svc`` on the training data is saved as
    ``<tag>learning_curve.png``.  The estimator is then fit on the training
    data, a confusion-matrix plot for the test data is saved as
    ``<tag>confussion_matrix.png`` and the classification report of the test
    predictions is printed.
    """
    # Learning curve of the estimator on the training data.
    plot_learning_curve(svc, "Learning curve", X_train, y_train)
    plt.savefig(tag + 'learning_curve.png')

    # Fit on the training data, then predict the test set.
    svc.fit(X_train, y_train)
    predictions = svc.predict(X_test)

    # Confusion matrix on the test set.
    plot_confusion_matrix(svc, X_test, y_test)
    plt.savefig(tag + 'confussion_matrix' + '.png')

    # Per-class precision, recall, F1 and support.
    report = metrics.classification_report(y_true=y_test, y_pred=predictions)
    print("report:", report, "\n")
Esempio n. 12
0
def algomain(df):
    """Standardize question features, plot an SGD (log-loss, L1) learning curve and print its coefficients.

    ``df`` is modified in place: five ``*_scaled`` columns are added.  An
    ``SGDClassifier`` with logistic loss and L1 penalty is evaluated with a
    10-fold learning curve, fit on the complete data and its coefficients
    are printed next to the feature names.
    """
    scaler = preprocessing.StandardScaler()

    # Standardization.  Each column is passed as a one-column 2-D frame
    # because scikit-learn scalers reject 1-D input; the second positional
    # argument of fit_transform is ``y`` and is ignored by the scaler, so the
    # fitted scaler that used to be passed there has been dropped.
    scaled_columns = (
        ("popTagsNum", "popTagsNum_scaled"),
        ("liNum", "liNum_scaled"),
        ("codeFragNum", "codeFragNum_scaled"),
        ("bodyLength", "bodyLen_scaled"),
        ("titleLength", "titleLen_scaled"),
    )
    for source, target in scaled_columns:
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    train_df = df[
        [
            "class",
            "codeFragNum_scaled",
            "liNum_scaled",
            "popTagsNum_scaled",
            "startWithWh",
            "endWithQ",
            "bodyLen_scaled",
            "titleLen_scaled",
        ]
    ]
    # ``.values`` replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values

    # Column 0 is the label, the remaining columns are the features.
    tX = train_np[:, 1:]
    ty = train_np[:, 0]

    estm = SGDClassifier(loss="log", penalty="l1", alpha=0.015)
    plot_learning_curve(
        estm, "LogisticRegression(L1), cv=10-fold", tX, ty, ylim=(0.5, 1.0), cv=10, train_sizes=np.linspace(0.1, 1, 10)
    )

    estm.fit(tX, ty)
    print(pd.DataFrame({"columns": list(train_df.columns[1:]), "coef": list(estm.coef_.T)}))
def main(args):
    """Plot learning curves for the five best parameter sets of every training report.

    Reads the report files found under ``args.raw_results_dir``, the Haralick
    parameters from ``args.haralick_txt_params`` and the train/test images
    from ``args.train_data_dir`` / ``args.test_data_dir``.  For every report
    the five best-ranked parameter sets of its ``cv_results_`` are turned
    into estimators whose learning curves are saved as JPEG files in
    ``args.dest_dir``.
    """
    # Collect the training reports and the per-report metadata dictionaries.
    report_details = extract_model_type(gen_file_lst(args.raw_results_dir))

    with open(args.haralick_txt_params, 'r') as params_file:
        haralick_params = json.load(params_file)

    trn_image_dict = read_data(args.train_data_dir)
    tst_image_dict = read_data(args.test_data_dir)

    for data_combos in report_details:
        # The model type of every report is overridden before it is used.
        data_combos['model_type'] = 'svm_sgd'
        model_type = data_combos['model_type']

        # Build the train/test arrays and scale both to [0, 1] with a scaler
        # fitted on the training data only.
        X_train, y_train = create_dataset(trn_image_dict, haralick_params,
                                          args.text_dir, model_type)
        X_test, y_test = create_dataset(tst_image_dict, haralick_params,
                                        args.text_dir, model_type)
        scaling = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
        X_train = scaling.transform(X_train)
        X_test = scaling.transform(X_test)

        # Load the stored cross-validation results of this report.
        stored_results = np.load(data_combos['path'], allow_pickle=True)
        cv_results = pd.DataFrame.from_dict(
            stored_results.item().get('cv_results_'))
        # NOTE(review): the expanded frame returned here is not used.
        cv_results['params'].apply(pd.Series)

        # Keep the parameter sets of the five best-ranked candidates.
        cv_results = cv_results.sort_values('rank_test_score', ascending=True)
        top_params = cv_results['params'][:5].tolist()
        if model_type == 'svm_sgd':
            model_params_reformat = top_params
        else:
            model_params_reformat = reformat_model_params(top_params)

        for vals in model_params_reformat:
            # Title: report metadata (without the path) followed by the
            # key/value pairs of the parameter set.
            title2 = '_'.join('_'.join((key, str(value)))
                              for key, value in vals.items())
            title1 = '_'.join(value for key, value in data_combos.items()
                              if key != 'path')
            title = title1 + '_' + title2

            tmp_estimator = gen_estimator(model_type, vals)
            tmp_fig = plot_learning_curve(tmp_estimator, title, X_train,
                                          y_train, cv=3, n_jobs=-1)
            # Save the figure under its title.
            tmp_fig.savefig(os.path.join(args.dest_dir, title + '.jpeg'))
Esempio n. 14
0
def test_learning_curve():
    """Plot and show the learning curve of the module-level ``estimator``.

    Columns 0-4 of the module-level ``data`` frame are the features and its
    ``'outcome-class'`` column holds the labels.  The curve uses 3-fold
    cross-validation on 20 training sizes between 10% and 99% of the data.
    """
    feature_columns = [0, 1, 2, 3, 4]
    features = data[feature_columns].values
    labels = data['outcome-class'].values
    sizes = np.linspace(.1, 0.99, 20)
    figure = plot_learning_curve(estimator, "50 k-NN learning curve",
                                 features, labels,
                                 cv=3, verbose=2, train_sizes=sizes)
    figure.show()
Esempio n. 15
0
 def plot_learning_curve(self, name, X, y, cv=5):
     """Plot the learning curve of ``self.model``.

     Delegates to the module-level ``plot_learning_curve`` helper with no
     y-axis limits.
     :param name: plot title
     :param X: input features X
     :param y: labels y
     :param cv: cross-validation setting forwarded to the helper
     :return: the object returned by the helper
     """
     return plot_learning_curve(self.model, name, X, y, ylim=None, cv=cv)
Esempio n. 16
0
def main_learning_curve(x, y):
    """Plot and show the learning curves of two random forests (100 and 1000 trees).

    Both forests are handed to ``plot_learning_curve`` together as a tuple.
    Scores come from a 5-iteration shuffle split that holds out 20% of the
    data, evaluated at four logarithmically spaced training sizes from 0.1%
    to 100%.
    """
    # Five random splits, each with 20% of the data held out for validation.
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    forests = tuple(
        RandomForestClassifier(n_estimators=tree_count, max_depth=None)
        for tree_count in (100, 1000)
    )
    plot_learning_curve(forests,
                        "RF Learning Curves",
                        x,
                        y,
                        cv=splitter,
                        train_sizes=np.logspace(-3, 0, 4),
                        log_x=True,
                        n_jobs=-1)

    plt.show()
Esempio n. 17
0
def runBoosting(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    """Train and evaluate an AdaBoost ensemble of depth-limited decision trees.

    An ``AdaBoostClassifier`` with 10000 estimators over depth-7 decision
    trees is fit on ``train_M`` and scored on the test and the training
    data; up to ten misclassified test samples are reported.  Progress is
    printed and appended to ``boostingLog.txt``, and a learning curve of the
    classifier is drawn at the end.

    Parameters
    ----------
    dataTrain, dataTest
        Objects exposing the labels as ``.target``; ``dataTest`` must also
        expose the raw samples as ``.data``.
    holdout, hold_M
        Accepted for signature compatibility; not used by this function.
    train_M, test_M
        Feature matrices matching ``dataTrain`` and ``dataTest``.
    """
    print('running boosting algo')
    # Hyper-parameters are fixed here; searching them takes very long.
    bestDepth = 7
    bestNum = 10000
    # The context manager guarantees the log is closed even on errors.
    with open('boostingLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
        outFile.write('test==>  %d, %d \n' % (test_M.shape[0], test_M.shape[1]))
        with SimpleTimer('time to train', outFile):
            estimator = DecisionTreeClassifier(max_depth=bestDepth)
            bestClf = AdaBoostClassifier(base_estimator=estimator, n_estimators=bestNum)
            bestClf.fit(train_M, dataTrain.target)

        bestScore = bestClf.score(test_M, dataTest.target)
        print('the best score %.3f' % bestScore)
        outFile.write('depth %d, num %d score %.3f \n' % (bestDepth, bestNum, bestScore))

        # The model is not refit here: a second fit on the same data doubled
        # the training cost and, without a fixed seed, could produce
        # predictions that disagree with the score reported above.
        predicted = bestClf.predict(test_M)
        trainPredict = bestClf.predict(train_M)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)

        print('training score')
        outFile.write('training score')
        outputScores(dataTrain.target, trainPredict, outFile)

        results = predicted == dataTest.target
        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'boosting with %d trees' % bestNum, train_M, dataTrain.target, cv=3, n_jobs=4)
Esempio n. 18
0
 def plot_lc(self,
             algoName,
             inputData=XConstant.test_data,
             title='learning curve'):
     """Draw the learning curve of the estimator registered as ``algoName`` and save it as PNG.

     :param algoName: name passed to ``self.findEstimator``; also used as
         prefix of the output file name
     :param inputData: path of a whitespace-separated data file whose first
         column is the target and whose remaining columns are the features
     :param title: plot title
     :return: None; prints a failure message when no estimator is found
     """
     clf = self.findEstimator(algoName)
     if clf is None:
         print('learning curve drawing failed.')
         return

     # load data
     print('start loading data...')
     # Raw string: '\s' is not a valid escape in a normal string literal.
     delims = r'\s+'
     # Plain ``str`` replaces the ``np.str`` alias that newer NumPy removed.
     rawData = pd.read_csv(inputData, dtype=str, sep=delims)
     dim = rawData.shape
     print('size of data: (%d, %d)' % (dim[0], dim[1]))
     # Positional ``.iloc`` replaces the ``.ix`` indexer that newer pandas
     # removed.
     target = rawData.iloc[:, 0].astype('float')
     data = rawData.iloc[:, 1:dim[1]].astype('float')
     data = xman.oneHotEncoder(data)
     print('data loaded.')

     cv = cross_validation.ShuffleSplit(dim[0],
                                        n_iter=10,
                                        test_size=0.2,
                                        random_state=0)
     plt = plot_learning_curve(clf,
                               title,
                               data,
                               target,
                               ylim=(0.0, 1.01),
                               cv=cv,
                               n_jobs=4)
     if not os.path.exists(XConstant.lc_dir):
         os.mkdir(XConstant.lc_dir)
     # The time stamp keeps files from different runs apart.
     plt.savefig(XConstant.lc_dir + algoName + '_' +
                 str(time.time()) + '.png')
     print('learning curve drawing finished.')
Esempio n. 19
0
def run(n_folds=5, use_pickle=True, use_coref=True):
    """Cross-validate a class-weighted SVM over PubMed documents and report per-fold metrics.

    A learning curve over all documents is drawn first.  The PMIDs are then
    split into ``n_folds`` shuffled folds; for every fold an SVM is trained
    on the training PMIDs and evaluated on the test PMIDs, the confusion
    matrices are printed and plotted, and the metrics plus span-level error
    reports are written to ``results_true_*`` / ``results_false_*`` files.
    Mean and variance of the fold metrics are printed at the end.

    Parameters
    ----------
    n_folds : int
        Number of cross-validation folds over the PMIDs.
    use_pickle, use_coref : bool
        Forwarded to ``get_PMIDs_to_X_y``; ``use_coref`` also selects the
        suffix of the result file names.
    """
    # Maps PubMed identifiers to token features and corresponding labels.
    pmids_dict, _x_tokens = get_PMIDs_to_X_y(use_pickle, use_coref)

    # A real list, so that PMIDs can be picked by fold position; a dict key
    # view does not support indexing on Python 3.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    title = "Learning Curves (SVM)"
    # Class weights shared by the learning-curve estimator and every fold
    # model; each estimator receives its own copy.
    class_weights = {1: 4.0462962962963, -1: 0.570496083550914}

    ## Learning Curve
    estimator = svm.SVC(class_weight=dict(class_weights), cache_size=1000)
    train_X, _, train_y = get_features_for_pmids(pmids_dict, all_pmids)
    plc.plot_learning_curve(estimator, title, train_X, train_y, cv=5)
    plt.show()

    coref_tag = '_with_coref_tfidf_' if use_coref else '_no_coref_tfidf_'

    fold_metrics = []
    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        # sanity check: no document may be in both sets
        assert (len(set(train_pmids).intersection(set(test_pmids)))) == 0

        train_X, _, train_y = get_features_for_pmids(pmids_dict, train_pmids)
        test_X, test_index_X, test_y = get_features_for_pmids(
            pmids_dict, test_pmids)

        model = svm.SVC(class_weight=dict(class_weights), cache_size=1000)
        model.fit(train_X, train_y)

        predict_y = list(model.predict(test_X))
        r, p, accuracy, auc, tp_overlapping_tokens, fp_tokens = _evaluate_detection(
            test_y, predict_y, test_index_X)

        # F1 is undefined when both precision and recall are zero.
        f1 = None if p + r == 0 else (2 * p * r) / (p + r)

        tp_spans, tn_spans, fp_spans, fn_spans = _error_report(
            predict_y, test_y, test_index_X)

        cm = confusion_matrix(test_y, predict_y)
        np.set_printoptions(precision=2)
        print('Confusion matrix, without normalization')
        print(cm)
        plt.figure()
        plot_confusion_matrix(cm)
        # Normalize the confusion matrix by row (i.e by the number of samples
        # in each class)
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')
        print(cm_normalized)
        plt.figure()
        plot_confusion_matrix(cm_normalized,
                              title='Normalized confusion matrix')

        print(
            "fold %s. precision: %s; recall: %s; f1: %s, accuracy: %s, auc: %s"
            % (fold_idx, p, r, f1, accuracy, auc))
        # An undefined F1 is stored as NaN so that the metric array stays
        # numeric; with None in it np.mean / np.var below raise a TypeError.
        fold_metrics.append(
            [p, r, np.nan if f1 is None else f1, accuracy, auc])

        file_name_suffix = str(fold_idx) + coref_tag + str(
            time.time()) + '.txt'
        # Text mode: the content written is str, which a binary-mode file
        # rejects on Python 3.
        with open('results_true_' + file_name_suffix, 'w') as results_true:
            results_true.write(str((p, r, f1, accuracy, auc)) + "\n")
            results_true.write(str(tp_spans) + "\n")
            results_true.write(str(tn_spans))
        with open('results_false_' + file_name_suffix, 'w') as results_false:
            results_false.write(str(fp_spans) + "\n")
            results_false.write(str(fn_spans))

    # convert to numpy array
    fold_metrics = np.array(fold_metrics)
    print("mean: %s, variance: %s" %
          (np.mean(fold_metrics, axis=0), np.var(fold_metrics, axis=0)))
Esempio n. 20
0
def KNN_Learning_Curves(X, Y, datasource, n_neighbors_number):
    """Plot and show the learning curve of a k-nearest-neighbors classifier.

    ``datasource`` is appended to the plot title and ``n_neighbors_number``
    is the ``n_neighbors`` setting of the classifier.  Scores come from a
    10-iteration shuffle split (seed 626) that holds out 20% of the data.
    """
    # The separating space after "on" was missing, which glued the data
    # source name to the title.
    title = "KNN Learning Curves on " + datasource
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=626)
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors_number)
    plt = plot_learning_curve(estimator, title, X, Y, ylim=(0.0, 1.05), cv=cv)
    plt.show()
Esempio n. 21
0
def run_boosting(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Classifies the data using sklearn's AdaBoost with a decision-tree base estimator.
    Pruning is not natively supported, so the tree's max_depth is tuned instead.

    Validation curves are saved for every numeric hyper-parameter, the
    hyper-parameters are tuned with a grid search, and the learning curve and
    the per-estimator curve of the best configuration are saved in the
    module-level ``results_location`` directory.  The classification report
    and the confusion matrix of the test predictions are printed.

    Parameters
    ----------
        training_features: feature matrix used to train the classifier
        training_labels: labels belonging to training_features
        test_features: feature matrix used to test the classifier
        test_labels: labels belonging to test_features
        passed_parameters: optional parameter grid for the grid search; a
            default grid is used when None

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: score of the tuned classifier on the test data
    """
    time_1 = time.time()

    #set up underlying decision tree classifier
    base_classifier = tree.DecisionTreeClassifier()

    #set up the boosting method
    estimator = ensemble.AdaBoostClassifier(base_estimator = base_classifier)

    #parameter ranges explored by the validation curves
    parameters = {'base_estimator__max_depth': range(1, 5), 'n_estimators' : range(10, 500, 50), 'learning_rate' : [.25, .5, .75, 1.0] }

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves (the title is the same for every parameter)
    title = 'Validation Curves \n(AdaBoost)'
    for param in parameters:
        if(is_number(parameters[param][0])):
            save_name = "Validation Curves - AdaBoost - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #parameter grid for the grid search
    if(passed_parameters is None):
        parameters = {'base_estimator__max_depth': range(1, 3), 'n_estimators' : range(5, 51, 5), 'learning_rate' : [1.0] }
    else:
        parameters = passed_parameters

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    #get the prediction and accuracy of the test set
    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #rebuild an unfitted estimator with the best hyper-parameters found
    best = classifier.best_estimator_
    best_depth = best.base_estimator_.max_depth
    base_classifier = tree.DecisionTreeClassifier(max_depth = best_depth)
    estimator = ensemble.AdaBoostClassifier(base_estimator = base_classifier, n_estimators = best.n_estimators, learning_rate = best.learning_rate)

    #plot the learning curve (a stray '$' at the end of the title was removed)
    title = 'Learning Curves (AdaBoost - Decision Tree)\n max_depth=%i estimators=%i learning_rate=%f' % (best_depth, best.n_estimators, best.learning_rate)
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - AdaBoost - Decision Tree.png'))

    # NOTE(review): time_3 is taken after the learning-curve plot, so the
    # "Single Run Time" below measures that plot, not a single fit -- confirm.
    time_3 = time.time()

    #fit the best estimator
    estimator.fit(training_features, training_labels)

    #plot the learning curve by number of estimators
    plot_adaclassifier(estimator, best.n_estimators, training_features, test_features, training_labels, test_labels)
    pylab.savefig(os.path.join(results_location, 'Estimator Curves - AdaBoost - Decision Tree.png'))

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("AdaBoost (Decision Tree) Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Esempio n. 22
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import ShuffleSplit
from plot_learning_curve import plot_learning_curve
digits = load_digits()
X, y = digits.data, digits.target  # load the sample data (features, labels)

# Figure 1: Gaussian naive Bayes, scored over 100 random 80/20 splits.
title = r"Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()  # build the model
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=1)

# Figure 2: RBF-kernel SVM, scored over 10 random 80/20 splits.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)  # build the model
# ylim is passed by keyword, exactly like the first call, so this call does not
# depend on which parameter happens to sit in the fifth positional slot.
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=1)

plt.show()
Esempio n. 23
0
def test_KNN(X_whole, y_whole, X, y):
    """Run the KNN experiment: initial learning curves, parameter sweeps, tuning, final test.

    Parameters
    ----------
    X_whole, y_whole:
        Not used by this function; kept so existing callers keep working.
    X, y:
        Features and labels. They are split 80/20 into a training and a test
        portion; the training portion is split 80/20 again into the data used
        for fitting/plotting and a validation portion used by the sweeps.

    Returns
    -------
    None. Learning-curve images are saved below a hard-coded directory, and the
    best parameters and the elapsed wall-clock time are printed.
    """
    figure_dir = '/Users/ajinkya.bagde/Desktop/AS1_Figs/KNN/'

    # Split the initial data
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    start = datetime.now()

    ### NNLearner Implementation ###
    knnlearner = knn.KNNLearner(n_folds=3, verbose=True)

    # Create a validation set - do another train/test split on the training data.
    # This used to re-split (X, y) with the same seed and size, which reproduced
    # the first split exactly: the "validation" rows were the test rows, so the
    # final test below was not evaluated on held-out data.
    xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(
        xtrain, ytrain, test_size=0.2, random_state=42)

    def save_learning_curve(estimator, title, file_name):
        """Plot the learning curve of `estimator` on the fitting data and save it."""
        # NOTE(review): this figure is not passed to plot_learning_curve; it is kept
        # because that helper may draw on the current figure - confirm before removing.
        fig, axes = plt.subplots(3, 1, figsize=(10, 15))
        # Cross validation with 100 iterations to get smoother mean test and train
        # score curves, each time with 20% data randomly selected as a validation set.
        cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        lc = plot_learning_curve(estimator, title, xtrain_val, ytrain_val, cv=cv, n_jobs=-1)
        lc.savefig(figure_dir + file_name)

    ########## Initial Learning Curves for Different Neighbor Sizes ##########
    for n_neighbors in (2, 4, 6, 8, 10):
        # Initial Fit
        initial_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        initial_classifier.fit(xtrain_val, ytrain_val)
        save_learning_curve(
            initial_classifier,
            "Initial Learning Curves (KNN - %d neighbors)" % n_neighbors,
            'knn_learningcurve_initial_%dneigh.png' % n_neighbors)

    # One sweep per flag value. knnlearner.test() takes five candidate lists in the
    # order (neighbor_types, weight_values, algorithm_types, metric_types, p_values);
    # only the slot being swept carries real candidates, the others are "NA".
    metric_slot = 3
    p_value_flag = 4
    for flag in range(5):
        clfs, candidates = knnlearner.train(xtrain_val, ytrain_val, flag)
        sweep_values = ["NA"] * 5
        sweep_values[flag] = candidates
        if flag == p_value_flag:
            # The p-value sweep is run with the metric fixed to minkowski.
            sweep_values[metric_slot] = ['minkowski']
        knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                        *(sweep_values + [flag]))

    # Now that we have the knn, time for tuning hyperparameters
    # Make a new classifier for this
    clf = KNeighborsClassifier()
    clf.fit(xtrain_val, ytrain_val)
    best_params = knnlearner.tune_hyperparameters(clf, xtrain_val, ytrain_val)
    print("Best params are: ", best_params)

    # Now do one more fit based on best params above
    final_classifier = KNeighborsClassifier(
        n_neighbors=best_params['n_neighbors'],
        weights=best_params['weights'],
        algorithm=best_params['algorithm'],
        metric=best_params['metric'],
        p=best_params['p'])
    final_classifier.fit(xtrain_val, ytrain_val)

    save_learning_curve(final_classifier, "Learning Curves (KNN)", 'knn_learningcurve.png')

    # Now time for final accuracy score for test set
    knnlearner.final_test(final_classifier, xtest, ytest)

    print(datetime.now() - start)
def learning_curve():
    """Plot and show the learning curve of a 50-nearest-neighbour classifier.

    Reads the module-level ``X`` and ``y``, scores with a shuffled 10-fold
    splitter and evaluates 25 evenly spaced training-set fractions from 0.1 to 1.0.
    """
    knn_50 = KNeighborsClassifier(50)
    ten_fold = KFold(n=len(X), n_folds=10, shuffle=True)
    fractions = np.linspace(.1, 1.0, 25)
    figure = plot_learning_curve(
        knn_50,
        "50-NN learning curve",
        X,
        y,
        cv=ten_fold,
        verbose=2,
        train_sizes=fractions,
    )
    figure.show()
Esempio n. 25
0

# Candidate grid for the grid search below. Within this fragment it is only
# referenced by the commented-out GridSearchCV line, so it currently has no effect.
param = {'n_estimators': list(np.arange(10, 150, 10)), 'min_samples_split': list(np.arange(1, 10, 2)), 'min_samples_leaf': list(np.arange(1, 10, 2))}
# Random forest with fixed hyper-parameters, fitted directly on the training data.
rfc = RandomForestClassifier(n_estimators = 120, min_samples_split=5, min_samples_leaf=5)
# print "GridSearchCV on RFC..."
# rfc = GridSearchCV(estimator=rfc, cv=cv, param_grid=param)
rfc.fit(X_train, y_train)
# # summarize the results of the grid search
# print(rfc.best_score_)
# print "Best n_estimators found by GridSearch: ", rfc.best_estimator_.n_estimators
# print "Best min_samples_split found by GridSearch: ", rfc.best_estimator_.min_samples_split
# print "Best min_samples_leaf found by GridSearch: ", rfc.best_estimator_.min_samples_leaf

title = "Learning curves (Random Forest Classifier)"

# Learning curve on the training data; `cv` is defined outside this fragment.
plc.plot_learning_curve(rfc, title, X_train, y_train, cv=cv)
plt.show()

# NOTE: the print statements below use Python 2 syntax.
print "Prediction score on test set: ", rfc.score(X_test, y_test)

print "Creating testing errors file..."
y_pred = rfc.predict(X_test)
# NOTE(review): presumably writes the misclassified test rows to a file - confirm in gte.
gte.get_testing_errors(X_test, y_test, y_pred)

print "Creating kaggle submission file..."
# Predict on the `predictors` columns of kaggle_ds and write PassengerId/Survived pairs.
predictions = rfc.predict(kaggle_ds[predictors])
submission = pd.DataFrame({"PassengerId": kaggle_ds["PassengerId"], "Survived": predictions})
submission.to_csv("submission/kaggle.csv", index=False)


Esempio n. 26
0
def test_Boosting(X_whole, y_whole, X, y):
    """Run the AdaBoost experiment: initial learning curves, parameter sweeps, tuning, final test.

    Parameters
    ----------
    X_whole, y_whole:
        Not used by this function; kept so existing callers keep working.
    X, y:
        Features and labels. They are split 80/20 into a training and a test
        portion; the training portion is split 80/20 again into the data used
        for fitting/plotting and a validation portion used by the sweeps.

    Returns
    -------
    None. Learning-curve images are saved below a hard-coded directory, and the
    best parameters and the elapsed wall-clock time are printed.
    """
    figure_dir = '/Users/ajinkya.bagde/Desktop/AS1_Figs/Boosting/'

    # Split the initial data
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    start = datetime.now()

    ### Boosting Implementation ###
    boostlearner = boost.BoostingLearner(n_folds=3, verbose=True)

    # Create a validation set - do another train/test split on the training data.
    # This used to re-split (X, y) with the same seed and size, which reproduced
    # the first split exactly: the "validation" rows were the test rows, so the
    # final test below was not evaluated on held-out data.
    xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(
        xtrain, ytrain, test_size=0.2, random_state=42)

    def save_learning_curve(estimator, title, file_name):
        """Plot the learning curve of `estimator` on the fitting data and save it."""
        # NOTE(review): this figure is not passed to plot_learning_curve; it is kept
        # because that helper may draw on the current figure - confirm before removing.
        fig, axes = plt.subplots(3, 1, figsize=(10, 15))
        # Cross validation with 100 iterations to get smoother mean test and train
        # score curves, each time with 20% data randomly selected as a validation set.
        cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        lc = plot_learning_curve(estimator, title, xtrain_val, ytrain_val, cv=cv, n_jobs=-1)
        lc.savefig(figure_dir + file_name)

    ########## Initial Learning Curves for Different Pruning Types ##########
    # (ccp_alpha value, label used in the plot title, suffix used in the file name)
    initial_curves = (
        (0.0, "0.0", "0"),
        (0.0002, "0.0002", "0002"),
        (0.0004, "0.0004", "0004"),
        (0.0008, "0.0008", "0008"),
        (0.0010, "0.0010", "0010"),
    )
    for ccp_alpha, title_label, file_suffix in initial_curves:
        # Initial Fit
        initial_classifier = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(ccp_alpha=ccp_alpha))
        initial_classifier.fit(xtrain_val, ytrain_val)
        save_learning_curve(
            initial_classifier,
            "Initial Learning Curves (Adaboost - ccp_alpha=%s)" % title_label,
            'boosting_learningcurve_initial_ccpa_%s.png' % file_suffix)

    # One sweep per flag value. boostlearner.test() takes three candidate lists in
    # the order (pruning_types, number_estimators, learning_rates); only the slot
    # being swept carries real candidates, the others are "NA".
    for flag in range(3):
        clfs, candidates = boostlearner.train(xtrain_val, ytrain_val, flag)
        sweep_values = ["NA"] * 3
        sweep_values[flag] = candidates
        boostlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                          *(sweep_values + [flag]))

    # Now that we have the boosting, time for tuning hyperparameters
    # Make a new classifier for this
    clf = AdaBoostClassifier()
    clf.fit(xtrain_val, ytrain_val)
    best_params = boostlearner.tune_hyperparameters(clf, xtrain_val, ytrain_val)
    print("Best params are: ", best_params)

    # Now do one more fit based on best params above
    final_classifier = AdaBoostClassifier(
        base_estimator=best_params['base_estimator'],
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'])
    final_classifier.fit(xtrain_val, ytrain_val)

    save_learning_curve(final_classifier, "Learning Curves (Boosting)", 'boosting_learningcurve.png')

    # Now time for final accuracy score for test set
    boostlearner.final_test(final_classifier, xtest, ytest)

    print(datetime.now() - start)
Esempio n. 27
0
def run_k_nearest_neighbors(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Tunes, evaluates and plots sklearn's k nearest neighbors classifier

    Parameters
    ----------
        training_features: features used for the validation curves, the grid search and the learning curve
        training_labels: labels belonging to training_features
        test_features: features the tuned classifier is asked to predict
        test_labels: labels used to score and report those predictions
        passed_parameters: (optional) parameter grid for the grid search; when None a default
            grid over n_neighbors, weights and p is used

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: score of the tuned classifier on the test data
    """

    started = time.time()

    knn_model = neighbors.KNeighborsClassifier()

    #fall back to the default grid when the caller supplied none
    param_grid = passed_parameters
    if(param_grid is None):
        param_grid = {'n_neighbors': range(1, 11), 'weights': ['uniform', 'distance'], 'p': [1, 2] }

    #cross validation iterator shared by every step below
    splitter = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #one validation curve per parameter whose first candidate is numeric
    for name in param_grid:
        candidates = param_grid[name]
        if(not is_number(candidates[0])):
            continue
        plot_validation_curve(knn_model, training_features, training_labels, 'Validation Curves \n(kNN)', name, candidates, splitter)
        pylab.savefig(os.path.join(results_location, "Validation Curves - kNN - %s.png" % name))

    #tune with a grid search and fit it on the training data
    search = GridSearchCV(estimator=knn_model, cv=splitter, param_grid=param_grid)
    search.fit(training_features, training_labels)

    test_prediction = search.predict(test_features)
    test_accuracy = search.score(test_features, test_labels)

    tuned = time.time()

    #rebuild an unfitted classifier carrying the winning settings
    best = search.best_estimator_
    best_knn = neighbors.KNeighborsClassifier(
        n_neighbors = best.n_neighbors,
        weights = best.weights,
        algorithm = best.algorithm,
        leaf_size = best.leaf_size,
        p = best.p,
        metric = best.metric)

    #learning curve of the winning configuration
    curve_title = 'Learning Curves \n(k-NN, k-neighbors=%i weights=%s algorithm=%s leaf size=%i p=%i )' % (
        best.n_neighbors, best.weights, best.algorithm, best.leaf_size, best.p)
    plot_learning_curve(best_knn, curve_title, training_features, training_labels, cv=splitter)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - kNN.png'))

    plotted = time.time()

    #time stats: started -> tuned is optimization, tuned -> plotted is the single run
    print("kNN Time Stats")
    print("Optimization Time -> %f" % (tuned - started))
    print("Single Run Time -> %f" % (plotted - tuned))

    #classification report followed by the confusion matrix, each under its own banner
    for heading, metric in (('Classification Report', classification_report), ('Confusion Matrix', confusion_matrix)):
        print('\n\n----------------------------')
        print(heading)
        print('----------------------------\n')
        print(metric(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Esempio n. 28
0
def runSVMSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    """Train linear-kernel SVMs, choose the penalty C on the holdout set and report scores.

    Parameters
    ----------
    dataTrain, dataTest, holdout:
        Objects exposing ``.target``; ``dataTest`` must also expose ``.data``.
    train_M, test_M, hold_M:
        Feature matrices matching the three data sets above.

    A baseline SVM (C=0.025) is scored on the test set, then one SVM per
    candidate penalty is scored on the holdout set. The best-scoring penalty is
    used for the final model, whose test/train scores, first misclassified
    test samples and learning curve are reported. Progress is printed and
    appended to ``svmSarinLog<kernel>.txt``.
    """
    kernel = "linear"
    # NOTE(review): the log file is deliberately not closed here because the
    # function continues past the visible code; close it where the function ends.
    outFile = open('svmSarinLog%s.txt' % kernel,'a')
    print('running svm code')

    outFile.write('train==> %d, %d \n'%(train_M.shape[0],train_M.shape[1]))
    outFile.write('test==>  %d, %d \n'%(test_M.shape[0],test_M.shape[1]))

    penalty = 0.025
    with SimpleTimer('time to train', outFile):
        clf = SVC(kernel=kernel, C=penalty, degree=1)
        clf.fit(train_M, dataTrain.target)

    baseScore = clf.score(test_M, dataTest.target)
    baseIter = 5
    print('baseline score %.3f base iter %d' % (baseScore, baseIter))
    outFile.write('baseline score %.3f base iter %d \n' % (baseScore, baseIter))

    # Score every candidate penalty on the holdout set.
    res = []
    with SimpleTimer('number of iter', outFile):
        for pen in [1,5,10,15,20,30]:
            print('training for penalty %.3f' % pen)
            clf = SVC(kernel=kernel, C=pen, degree=1)
            clf.fit(train_M, dataTrain.target)
            score = clf.score(hold_M, holdout.target)
            res.append((score, pen))
            trainPredict = clf.score(train_M, dataTrain.target)
            outFile.write('test %.3f %.3f \n' % (pen, score))
            outFile.write('train %.3f %.3f \n' % (pen, trainPredict))

    # Highest holdout score first; ties keep the order of the candidate list.
    res = sorted(res, key=lambda x:x[0], reverse=True)
    print(res[:5])

    bestPen = res[0][1]
    print('best penalty is %.3f' % bestPen)

    # The selected penalty is the regularisation strength C. It used to be passed
    # as `degree` (which the linear kernel ignores) while C stayed at the baseline
    # value, so the search result never influenced the final model.
    bestClf = SVC(kernel=kernel, C=bestPen, degree=1)
    bestClf.fit(train_M, dataTrain.target)

    predicted = bestClf.predict(test_M)

    trainPredict = bestClf.predict(train_M)

    print('testing score')
    outFile.write('testing score')
    outputScores(dataTest.target, predicted, outFile)

    print('training score')
    outFile.write('training score')
    outputScores(dataTrain.target, trainPredict, outFile)

    # Indices of the test samples the final model got wrong.
    results = predicted == dataTest.target
    wrong = [i for i, correct in enumerate(results) if not correct]
    print('classifier got these wrong:')
    for i in wrong[:10]:
        print('%s %s' % (dataTest.data[i], dataTest.target[i]))
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'svm with %s kernel & penalty %.3f' % (kernel, bestPen), train_M, dataTrain.target, cv=5, n_jobs=4)
    '''
Esempio n. 29
0
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Load the data set and attach two extra columns read from text files.
# NOTE(review): assumes each text file holds one value per dataset row - confirm.
dataset = load_data.load_sgp_data("trigger_sgp_hy.nc")
cape = np.loadtxt("../../data/sgp/sgp_undilute_cape.txt")
lcl = np.loadtxt("../../data/sgp/sgp_undilute_lcl.txt")
dataset['cape'] = cape
dataset['lcl'] = lcl

# Features are the columns at positions 0..85; the target is the column at position 86.
trig_x = dataset.iloc[:, 0:86]
trig_y = dataset.iloc[:, 86]

# Five random 80/20 splits with a fixed seed.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
xgb = XGBClassifier(n_estimators=600,
                    silent=True,
                    nthread=8,
                    max_depth=7,
                    scale_pos_weight=3.5)

title = "Learning Curves (XGBoost)"
# `plot_learning_curve` is used as a module here; the function of the same name lives inside it.
plot_learning_curve.plot_learning_curve(xgb,
                                        title,
                                        trig_x,
                                        trig_y,
                                        ylim=(0.7, 1.01),
                                        cv=cv,
                                        n_jobs=8)
plt.show()
Esempio n. 30
0
# Dump the per-candidate scores of the fitted search object `clf` (created outside this fragment).
grid_scores = DataFrame(clf.grid_scores_)
grid_scores.to_csv("grid_scores_nusvc.csv")

print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

# Make a model with the best parameters
estimator = NuSVC(kernel='rbf',
                  gamma=clf.best_estimator_.gamma,
                  nu=clf.best_estimator_.nu)
# C=clf.best_estimator_.C)

# Plot the learning curve to find a good split
title = 'NuSVC'
plot_learning_curve(estimator, title, X_train, y_train, cv=cv, n_jobs=4)
# NOTE(review): `p` is defined outside this fragment; presumably a pylab/pyplot alias - confirm.
p.savefig("supervised_learning_nusvc.pdf")

# Find a good number of test samples before moving on
# raw_input("Continue??")

# With a good number of test samples found, fit on the training split and
# predict the whole set (X_all) with the model
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_all)
DataFrame(y_pred).to_csv("supervised_prediction_labels_nusvc.csv")
print(classification_report(y_all, y_pred))
# NOTE: the print statement and raw_input below are Python 2 only.
print "Best params are:" + str(clf.best_params_)
# Hold here
raw_input("Continue??")

# Now take the model found, and find the outliers
def runDecisionTreeSimulation(dataTrain, dataTest, dataHold, train_tfidf,
                              test_tfidf, hold_tfidf):
    """Fit a decision tree, choose its depth on the holdout set and report scores.

    Parameters
    ----------
    dataTrain, dataTest, dataHold:
        Objects exposing ``.target``; ``dataTest`` must also expose ``.data``.
    train_tfidf, test_tfidf, hold_tfidf:
        Feature matrices matching the three data sets above.

    An unpruned tree gives the baseline test score and the starting depth.
    Candidate depths run from that depth downwards in steps of 25 while they
    stay above 40; each is scored on the holdout set and the best one is used
    for the final tree. Test/train scores, the first misclassified test
    samples and a learning curve are reported. Progress is printed and
    appended to ``decisionTreeLog.txt``.
    """
    print('running decision tree')
    # `with` guarantees the log is flushed and closed even if a step below raises.
    with open('decisionTreeLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' %
                      (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==>  %d, %d \n' %
                      (test_tfidf.shape[0], test_tfidf.shape[1]))
        with SimpleTimer('time to train', outFile):
            baseClf = DecisionTreeClassifier().fit(train_tfidf,
                                                   dataTrain.target)

        baseScore = baseClf.score(test_tfidf, dataTest.target)
        initHeight = baseClf.tree_.max_depth
        print('baseline score %.3f base height %d' % (baseScore, initHeight))
        outFile.write('baseline score %.3f base height %d \n' %
                      (baseScore, initHeight))

        res = []
        with SimpleTimer('time to prune', outFile):
            for height in range(initHeight, 40, -25):
                clf = DecisionTreeClassifier(max_depth=height).fit(
                    train_tfidf, dataTrain.target)
                score = clf.score(hold_tfidf, dataHold.target)
                res.append((score, height))
                outFile.write('%d %.3f \n' % (height, score))
        if not res:
            # The candidate range is empty when the unpruned tree is no deeper
            # than 40. Fall back to the unpruned depth instead of failing on
            # res[0] below.
            score = baseClf.score(hold_tfidf, dataHold.target)
            res.append((score, initHeight))
            outFile.write('%d %.3f \n' % (initHeight, score))
        # Highest holdout score first; ties keep the deeper tree.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])

        bestDepth = res[0][1]
        print('best height is %d' % bestDepth)
        outFile.write('best depth is %d  and score is %.3f \n' %
                      (bestDepth, res[0][0]))

        bestClf = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf.fit(train_tfidf, dataTrain.target)

        predicted = bestClf.predict(test_tfidf)

        train_predict = bestClf.predict(train_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # This header used to repeat 'testing score' in the log.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        # Indices of the test samples the final tree got wrong.
        results = predicted == dataTest.target
        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))
    plot_learning_curve(bestClf,
                        'decision tree after pruning from %d to %d depth' %
                        (initHeight, bestDepth),
                        train_tfidf,
                        dataTrain.target,
                        cv=5,
                        n_jobs=4)
# Best hyper-parameters of the decision-tree search `dt_clf` (fitted outside this fragment).
print(dt_clf.best_params_)


# In[7]:

# Refit the selected tree on the training split and score it on the test split.
# The score is not stored or printed here; it is only displayed when this runs
# as a notebook cell.
dt_optimized.fit(X_train, y_train)
dt_optimized.score(X_test, y_test)


# In[8]:

from sklearn.model_selection import learning_curve
from plot_learning_curve import plot_learning_curve
import matplotlib.pyplot as plt

# Learning curve of the selected tree with 10-fold cross-validation.
plot_learning_curve(dt_optimized, title='Decision Tree learning curve', X=X_train, y=y_train, cv=10)
plt.show()


# ## knn

# In[22]:

from sklearn.neighbors import KNeighborsClassifier

# Grid over the weighting scheme and the neighbour count, searched with 10-fold
# cross-validation. The n_neighbors=1 passed to the constructor is only the
# starting value; the grid supplies the values actually evaluated.
tuned_parameters = [{'weights': ['uniform', 'distance'], 'n_neighbors': [1, 2, 5, 10, 25]}]
knn_clf = GridSearchCV(KNeighborsClassifier(n_neighbors=1), tuned_parameters, cv=10)
knn_clf.fit(X_train, y_train)
knn_optimized = knn_clf.best_estimator_
print(knn_clf.best_params_)
Esempio n. 33
0
# NOTE: the print statements in this fragment use Python 2 syntax.
# Log loss of the fitted model `logReg` on the test and training splits.
print "Log loss regression (test/train) : {:.5f}/{:.5f}".format( \
        log_loss(y_test, logReg.predict_proba(X_test)), \
        log_loss(y_train, logReg.predict_proba(X_train)))

# Two constant-prediction baselines on the test split: probability 0.5, and
# the fraction sum(y)/len(y) computed over the full label vector `y`.
print "Log loss p(click) = 0.5 : {:.5f}".format( \
    log_loss(y_test, 0.5*np.ones(len(y_test))))
print "Log loss p(click) = {:.5f} : {:.5f}".format(1.0*y.sum()/len(y),
    log_loss(y_test, 1.0*y.sum()/len(y)*np.ones(len(y_test))))

# scorer for log loss (probability-based; greater_is_better=False because lower loss is better)
logl_sc = make_scorer(log_loss,needs_proba=True,greater_is_better=False)

# cross validation splitter (5x 70-30)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

#print "Cross-val score = {:.5f}".format(\
#    cross_val_score(logReg, X_train, y_train, scoring=logl_sc, cv=cv).mean())

# plot learning curve
lc = plot_learning_curve(logReg, "LogReg", X_train, y_train, 
                         score=logl_sc, cv=cv, n_jobs=4)


# add CTR benchmark to learning curve plot
addBenchToPlot(lc)

# plot regularization validation curve over 5 log-spaced values from 1e-2 to 1
plot_validation_curve(logReg, X_train, y_train, title="Regularization",
                      ylim=None, cv=cv, score=logl_sc, n_jobs=4,
                      param_range = np.logspace(-2,0,5))
Esempio n. 34
0
# Report the scores gathered before this point: each entry is (name, score).
print('Training Test')
for entry in results:
    print("name: {}; score: {}".format(entry[0], entry[1]))
print('')

# Model validation and evaluation
results = []
for name, model in models:
    # 10-fold cross-validation splitter: the data is cut into 10 parts
    # (9 for training, 1 for testing)
    kfold = KFold(n_splits=10)
    # cross-validated scores of this model
    cv_result = cross_val_score(model, X, Y, cv=kfold)
    results.append((name, cv_result))
    cv_ShuffleSplit = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    # draw the learning curve
    plt_learn = plot_learning_curve(
        model,
        "Learn Curve for KNN Diabetes",
        X,
        Y,
        ylim=(0., 1.2),
        cv=cv_ShuffleSplit,
    )

# Report the mean cross-validation score of every model.
print('Cross Validation')
for entry in results:
    print("name: {}; cross val score: {}".format(entry[0], entry[1].mean()))
print('')

# Afterwards the model can predict unseen data in the following way
#print("predict",models[0][1].predict(X),models[0][1].predict(X).shape)

#挑出兩個最佳特徵
from sklearn.feature_selection import SelectKBest
Esempio n. 35
0
def run_support_vector_machines(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Tunes and evaluates sklearn's support vector machine classifier.

    A grid search over the parameter grid selects the best SVC. A validation
    curve is saved for every parameter whose first candidate is numeric, and a
    learning curve is saved for the winning configuration. The classification
    report and confusion matrix for the test data are printed.

    Parameters
    ----------
        training_features: feature matrix used to tune and fit the classifier
        training_labels: labels matching the rows of training_features
        test_features: feature matrix used for the final evaluation
        test_labels: labels matching the rows of test_features
        passed_parameters: (optional) grid for GridSearchCV, mapping a parameter
            name to a list of candidate values. Defaults to a grid over C.

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: value of the tuned classifier's score() on the test data
    """

    time_1 = time.time()

    estimator = svm.SVC()

    #set up parameters that will be used by all kernels
    if passed_parameters is None:
        parameters = {'C': [1e0, 5e0, 1e1, 5e1]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves (only for parameters with numeric candidates)
    for param in parameters:
        if is_number(parameters[param][0]):
            title = 'Validation Curves'
            save_name = "Validation Curves - SVC - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #rebuild an unfitted estimator carrying the winning hyper-parameters
    best = classifier.best_estimator_
    estimator = svm.SVC(kernel=best.kernel, C=best.C, gamma=best.gamma, degree=best.degree)

    #plot the learning curve
    #%s is used throughout: gamma may be a string setting rather than a float,
    #and C may be fractional, which %i would silently truncate
    title = 'Learning Curves (SVM, kernel=%s degree=%s gamma=%s C=%s )' % (best.kernel, best.degree, best.gamma, best.C)
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    save_file_name = 'Learning Curves - SVM.png'
    pylab.savefig(os.path.join(results_location, save_file_name))
    #plt.show()

    time_3 = time.time()

    if best.kernel == 'linear':
        #coefficients live on the fitted best estimator; classifier.estimator
        #is the unfitted template handed to the grid search
        coefficients = best.coef_
        print('\n\n-----------------------')
        print(' Coefficients')
        print(coefficients)

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("SVM Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Esempio n. 36
0
import xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from plot_learning_curve import plot_learning_curve
import matplotlib.pyplot as plt

X, y = load_data()

# Learning curves for LogisticRegression at three regularisation strengths
lr_a = LogisticRegression()  # C=1.0
lr_b = LogisticRegression(C=0.1)
lr_c = LogisticRegression(C=0.03)

plt.figure()
for model, label in ((lr_a, 'C=1.0'), (lr_b, 'C=0.1'), (lr_c, 'C=0.03')):
    plot_learning_curve(model, X, y, label)
plt.legend(loc=(0, 1.00), ncol=2, fontsize=11)
plt.savefig('LogisticRegression_Tuning' + '.png', format='png')

# Learning curves for all the tuned classifiers, drawn on a second figure
xgb = xgboost.XGBClassifier(objective="multi:softprob", nthread=-1)
gbrt = GradientBoostingClassifier(random_state=0)
forest = RandomForestClassifier(n_jobs=-1, random_state=0)

plt.figure()
for model, label in ((xgb, 'xgb'), (gbrt, 'gbrt'), (forest, 'forest'), (lr_c, 'LR')):
    plot_learning_curve(model, X, y, label)
Esempio n. 37
0
# Persist the raw grid-search scores for later inspection.
grid_scores = DataFrame(clf.grid_scores_)
grid_scores.to_csv("grid_scores_nusvc.csv")

print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

# Make a model with the best parameters
estimator = NuSVC(kernel='rbf', gamma=clf.best_estimator_.gamma,
                  nu=clf.best_estimator_.nu)

# Plot the learning curve to find a good split
title = 'NuSVC'
plot_learning_curve(estimator, title, X_train, y_train, cv=cv, n_jobs=4)
p.savefig("supervised_learning_nusvc.pdf")

# With a good number of test samples found, predict the whole set to the model
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_all)
DataFrame(y_pred).to_csv("supervised_prediction_labels_nusvc.csv")
print(classification_report(y_all, y_pred))
# print() with a single argument behaves the same on Python 2 and Python 3.
print("Best params are:" + str(clf.best_params_))

# Hold here until the user presses Enter. raw_input only exists on Python 2;
# fall back to input on Python 3, where it has the same read-a-line behaviour.
try:
    _wait_for_user = raw_input
except NameError:
    _wait_for_user = input
_wait_for_user("Continue??")

Esempio n. 38
0
def runSVMSimulation(dataTrain, dataTest, holdOut, train_tfidf, test_tfidf,
                     hold_tfidf):
    """Fit polynomial-kernel SVCs, pick the best degree and report the scores.

    A baseline SVC (degree 3) is scored on the test set. Degrees 1 to 5 are
    then each fitted on the training set and scored on the hold-out set. The
    best-scoring degree is refitted, its test and training scores are passed
    to outputScores, up to ten misclassified test samples are listed, and a
    learning curve is plotted. Progress is appended to 'svmLogpoly.txt'.

    Parameters:
        dataTrain, dataTest, holdOut: data sets exposing a ``target``
            attribute (``dataTest`` must also expose ``data``).
        train_tfidf, test_tfidf, hold_tfidf: feature matrices matching the
            three data sets above.
    """
    kernel = 'poly'
    penalty = 1.0
    # NOTE(review): the log file is not closed in the visible part of this
    # function - confirm that the remainder closes it.
    outFile = open('svmLog%s.txt' % kernel, 'a')
    degree = 3
    outFile.write('train==> %d, %d \n' %
                  (train_tfidf.shape[0], train_tfidf.shape[1]))
    outFile.write('test==>  %d, %d \n' %
                  (test_tfidf.shape[0], test_tfidf.shape[1]))

    with SimpleTimer('time to train', outFile):
        clf = SVC(kernel=kernel, C=penalty, degree=degree)
        clf.fit(train_tfidf, dataTrain.target)

    baseScore = clf.score(test_tfidf, dataTest.target)
    baseIter = 5
    print('baseline score %.3f penalty %d' % (baseScore, baseIter))
    outFile.write('baseline score %.3f base height %d \n' %
                  (baseScore, baseIter))

    # Sweep the polynomial degree (C stays fixed) and score on the hold-out set.
    res = []
    with SimpleTimer('number of iter', outFile):
        for deg in [1, 2, 3, 4, 5]:
            print('training for degree %d' % deg)
            clf = SVC(kernel=kernel, C=1.0, degree=deg)
            clf.fit(train_tfidf, dataTrain.target)
            score = clf.score(hold_tfidf, holdOut.target)
            res.append((score, deg))
            outFile.write('%.3f %.3f \n' % (deg, score))

    # Highest hold-out score first.
    res = sorted(res, key=lambda x: x[0], reverse=True)
    print(res[:5])
    bestPen = res[0][1]
    print('best degree is %d' % bestPen)
    bestClf = SVC(kernel=kernel, C=1.0, degree=bestPen)
    bestClf.fit(train_tfidf, dataTrain.target)

    train_predict = bestClf.predict(train_tfidf)
    predicted = bestClf.predict(test_tfidf)

    print('testing score')
    outFile.write('testing score')
    outputScores(dataTest.target, predicted, outFile)
    print('training score')
    # This section reports the training predictions, so label it accordingly.
    outFile.write('training score')
    outputScores(dataTrain.target, train_predict, outFile)

    # Indices of the test samples the classifier got wrong.
    results = predicted == dataTest.target
    res = [i for i, correct in enumerate(results) if not correct]
    print('classifier got these wrong:')
    for i in res[:10]:
        # Formatted explicitly so the output is identical on Python 2 and 3.
        print('%s %s' % (dataTest.data[i], dataTest.target[i]))
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf,
                        'svm with %s kernel & degree %.3f' % (kernel, bestPen),
                        train_tfidf,
                        dataTrain.target,
                        cv=5,
                        n_jobs=4)
    '''
Esempio n. 39
0
def run_decision_tree(training_features, training_labels, test_features, test_labels, passed_parameters = None, headings = None):
    """
    Tunes and evaluates sklearn's decision tree classifier.

    A grid search over the parameter grid selects the best tree. A validation
    curve is saved for every parameter whose first candidate is numeric, a
    learning curve is saved for the winning configuration, and the top levels
    of the fitted tree are exported to a PDF. The classification report and
    confusion matrix for the test data are printed.

    Parameters
    ----------
        training_features: feature matrix used to tune and fit the classifier
        training_labels: labels matching the rows of training_features
        test_features: feature matrix used for the final evaluation
        test_labels: labels matching the rows of test_features
        passed_parameters: (optional) grid for GridSearchCV, mapping a parameter
            name to a list of candidate values. Defaults to an unlimited max_depth.
        headings: (optional) feature names used when exporting the tree

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: value of the tuned classifier's score() on the test data
    """

    time_1 = time.time()

    estimator = tree.DecisionTreeClassifier()

    #set up parameters for the classifier
    if passed_parameters is None:
        #every grid entry must be a list of candidates, even a single one
        parameters = {'max_depth': [None]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves
    for param in parameters:
        first_candidate = parameters[param][0]
        #a None candidate (e.g. unlimited depth) has no place on a numeric axis
        if first_candidate is not None and is_number(first_candidate):
            title = 'Validation Curves \n(Decision Tree)'
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #rebuild and fit a tree carrying the winning hyper-parameters
    best = classifier.best_estimator_
    estimator = tree.DecisionTreeClassifier(max_depth = best.max_depth, criterion = best.criterion)
    estimator.fit(training_features, training_labels)

    #plot the learning curve (%s because the best max_depth may be None)
    title = 'Learning Curves \n(Decision Tree, max depth=%s)' % best.max_depth
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - Decision Tree.png'))
    #plt.show()

    #save the visualization of the decision tree only use the top 5 levels for now
    tree_data = StringIO()
    tree.export_graphviz(estimator, out_file=tree_data, max_depth=5, feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue())
    #newer pydot releases return a list of graphs instead of a single graph
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf"))

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Esempio n. 40
0
# Load the row indices of the training split and the raw appetency labels.
train_indices = np.load(clean_data_dir + data_size + "train_indices.npy")
label_frame = pd.read_csv(labels_dir + data_size + 'appetency.labels', header=None)

# Keep only the training rows and recode the -1 labels as 0.
train_labels = squeeze(label_frame.values)[train_indices]
train_labels[train_labels == -1] = 0

#n_true = sum(train_labels == 1)
#train_labels = np.concatenate((train_labels[train_labels == 1], train_labels[train_labels == 0][:n_true]), axis=1)
#train_data = np.concatenate((train_data[train_labels == 1, :],  train_data[train_labels == 0, :][:n_true, :]), axis=0)

# Lazily-built classifiers keyed by algorithm name; only the selected one is
# constructed. An unrecognised name leaves clf as None.
_classifier_factories = {
    "SVM": lambda: svm.SVC(probability=True, kernel='linear', class_weight='auto'),
    "SGD": lambda: SGDClassifier(loss='log'),
    "GBM": lambda: ensemble.GradientBoostingClassifier(max_features=max_features,
                                                       subsample=subsample,
                                                       learning_rate=learning_rate),
}
_build_classifier = _classifier_factories.get(alg)
clf = _build_classifier() if _build_classifier is not None else None

log("cross_val_score")
_title_values = (alg, n_factors, subsample, learning_rate, max_features)
plot_title = data_size + (
    "%s n_factors: %i, subsample: %0.2f, learning_rate: %0.4f, max_features: %0.2f"
    % _title_values)

scores = plot_learning_curve(
    clf, plot_title, train_data, train_labels,
    cv=3, verbose=4, scoring='roc_auc')

plt.show()

log("done")
Esempio n. 41
0
                       axis=1)
    encode1 = LabelEncoder()
    y = encode1.fit_transform(data['Loan_Status'])

    trainx, testx, trainy, testy = train_test_split(
        X, y, random_state=42,
        test_size=0.2)  #splitting my data into trainand cross validation set
    '''model selection'''
    t1 = time()
    #model1=RandomForestClassifier(n_estimators=100,min_samples_split=20,max_depth=10,min_samples_leaf=10)
    #model1=SVC(C=1,gamma=0.05)
    #model1=GradientBoostingClassifier(n_estimators=100,learning_rate=0.01)

    model1.fit(trainx, trainy)
    print(time() - t1)
    print(model1.score(testx, testy))

    fig = plot_learning_curve(model1, 'gbm', trainx, trainy.astype(int))
    fig.show()

    predictions = model1.predict(test1)
    predictions = predictions.astype(str)
    predictions[(predictions == '1')] = 'Y'
    predictions[predictions == '0'] = 'N'
    sub = pd.DataFrame({
        'Loan_ID': test['Loan_ID'],
        'Loan_Status': predictions
    })

    sub.to_csv('av.csv', index=False)
Esempio n. 42
0
def algomain(df):
    """Add engineered features to ``df`` and plot an AdaBoost learning curve.

    ``df`` is modified in place: indicator columns (``liG2``, ``WhAndQ1``,
    ``WhAndQ0``) and standardised ``*_scaled`` columns are added to it. The
    ``class`` column is used as the target and the selected feature columns
    are passed to ``plot_learning_curve`` with an AdaBoost ensemble of
    depth-2 decision trees.

    Parameters:
        df: DataFrame holding the raw columns read below (``liNum``,
            ``startWithWh``, ``endWithQ``, ``class``, ...).
    """
    scaler = preprocessing.StandardScaler()

    # Flag rows whose liNum is strictly greater than 2.
    # NOTE(review): the original comment said ">= 2" while the code tests "> 2".
    df['liG2'] = (df.liNum > 2).astype(int)

    # Both flags set (startWithWh and endWithQ) / both flags cleared.
    df['WhAndQ1'] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    df['WhAndQ0'] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardise each raw column into its scaled counterpart.
    scaled_columns = [
        ('popTagsNum', 'popTagsNum_scaled'),
        ('liNum', 'liNum_scaled'),
        ('codeFragNum', 'codeFragNum_scaled'),
        ('avgTI', 'avgTI_scaled'),
        ('totalTI', 'totalTI_scaled'),
        ('titleLength', 'title_scaled'),
        ('bodyLength', 'body_scaled'),
        ('aNum', 'a_scaled'),
        ('strongNum', 'strong_scaled'),
        ('thxNum', 'thx_scaled'),
        ('hourHot', 'hourHot_scaled'),
    ]
    for source, target in scaled_columns:
        # The scaler is given a 2-D (n_samples, 1) array, because a bare 1-D
        # column is rejected by current scikit-learn; the result is flattened
        # back into a single column.
        df[target] = scaler.fit_transform(df[[source]].values).ravel()

    train_df = df[['class',
                   'codeFragNum_scaled', 'liNum_scaled',
                   'totalTI', 'avgTI',
                   'popTagsNum_scaled',
                   'startWithWh', 'endWithQ',
                   'WhAndQ1', 'WhAndQ0',  'isweekend',
                   'cntQ', 'cntA',
                   'body_scaled', 'title_scaled',
                   'a_scaled', 'strong_scaled', 'thx_scaled', 'hourHot_scaled']]

    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    train_np = train_df.values
    # First column is the target, the remaining columns are the features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    n_estimators = 800
    learning_rate = 0.8
    dt = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1)

    ada_real = AdaBoostClassifier(
                    base_estimator=dt,
                    learning_rate=learning_rate,
                    n_estimators=n_estimators,
                    algorithm="SAMME.R")

    plot_learning_curve(ada_real, 'AdaBoostWithDT',
                        tX, ty, ylim=(0.5, 1.0),
                        cv=10, train_sizes=np.linspace(.1, 1, 10))
Esempio n. 43
0
def algomain(df):
    """Add engineered features to ``df``, plot an SVC learning curve and print coefficients.

    ``df`` is modified in place: indicator columns (``liG2``, ``WhAndQ1``,
    ``WhAndQ0``) and standardised ``*_scaled`` columns are added to it. The
    ``class`` column is used as the target. A linear-kernel SVC is passed to
    ``plot_learning_curve``, then fitted on all rows, and its coefficients are
    printed next to the feature names.

    Parameters:
        df: DataFrame holding the raw columns read below (``liNum``,
            ``startWithWh``, ``endWithQ``, ``class``, ...).
    """
    scaler = preprocessing.StandardScaler()

    # Flag rows whose liNum is strictly greater than 2.
    # NOTE(review): the original comment said ">= 2" while the code tests "> 2".
    df["liG2"] = (df.liNum > 2).astype(int)

    # Both flags set (startWithWh and endWithQ) / both flags cleared.
    df["WhAndQ1"] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    df["WhAndQ0"] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardise each raw column into its scaled counterpart.
    scaled_columns = [
        ("popTagsNum", "popTagsNum_scaled"),
        ("codeFragNum", "codeFragNum_scaled"),
        ("avgTI", "avgTI_scaled"),
        ("totalTI", "totalTI_scaled"),
        ("titleLength", "title_scaled"),
        ("bodyLength", "body_scaled"),
        ("aNum", "a_scaled"),
        ("strongNum", "strong_scaled"),
        ("thxNum", "thx_scaled"),
        ("dayHot", "dayHot_scaled"),
    ]
    for source, target in scaled_columns:
        # The scaler is given a 2-D (n_samples, 1) array, because a bare 1-D
        # column is rejected by current scikit-learn; the result is flattened
        # back into a single column.
        df[target] = scaler.fit_transform(df[[source]].values).ravel()

    train_df = df[
        [
            "class",
            "codeFragNum_scaled",
            "liNum",
            "totalTI",
            "avgTI",
            "popTagsNum_scaled",
            "startWithWh",
            "endWithQ",
            "WhAndQ1",
            "WhAndQ0",
            "isweekend",
            "cntQ",
            "cntA",
            "body_scaled",
            "title_scaled",
            "a_scaled",
            "strong_scaled",
            "thx_scaled",
            "dayHot_scaled",
        ]
    ]

    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    train_np = train_df.values
    # First column is the target, the remaining columns are the features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    #    estm = LinearSVC(C=0.3, penalty='l1', dual=False)
    estm = SVC(C=0.1, kernel="linear")

    plot_learning_curve(estm, "LinearSVC", tX, ty, ylim=(0.5, 1.0), train_sizes=np.linspace(0.1, 1, 10))

    estm.fit(tX, ty)
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(pd.DataFrame({"columns": list(train_df.columns[1:]), "coef": list(estm.coef_.T)}))
Esempio n. 44
0
# Rows from index m onwards are held out from training.
X_test, y_test = X[m:, :], y[m:].ravel()

# Split the held-out rows in two: the first half becomes the test set and the
# second half the validation set.
m_val = int(X_test.shape[0] * 0.5)
X_test, X_val = X_test[:m_val, :], X_test[m_val:, :]
y_test, y_val = y_test[:m_val], y_test[m_val:]

# Initialise the MLP. (20) in the original is just the integer 20, not a tuple.
nn_clf = MLPClassifier(
    hidden_layer_sizes=20,
    alpha=0.3,
    activation='logistic',
    solver='lbfgs',
)

# Plot the learning curve for the model over the full data set.
plot_learning_curve(nn_clf, "NN Learning Curve", X, y)
plt.show()

# Train the model.
nn_clf.fit(X_train, y_train)

# Accuracy on the validation set.
acc = nn_clf.score(X_val, y_val)
print("Classifier accuracy on validation = " + str(acc * 100) + "%")

# Accuracy on the test set.
acc = nn_clf.score(X_test, y_test)
print("Classifier accuracy on test = " + str(acc * 100) + "%")

# Save the fitted model.
joblib.dump(nn_clf, "nn_clf.joblib")
Esempio n. 45
0
def SVM_Learning_Curves(X, Y, datasource, gamma_value):
    """Plot and show the learning curve of an SVC with the given gamma.

    The curve is computed with ten shuffled 80/20 splits (fixed seed) and
    titled with the name of the data source.
    """
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=626)
    classifier = SVC(gamma=gamma_value, random_state=626)
    curve_plot = plot_learning_curve(
        classifier,
        "SVM Learning Curves on " + datasource,
        X,
        Y,
        ylim=(0.0, 1.05),
        cv=splitter,
    )
    curve_plot.show()
    # NOTE(review): this looks like the body of a per-episode loop whose header
    # (and the initialisation of done, score, steps and alpha) is not visible
    # here - confirm against the original script.
    observation = env.reset()
    while not done:
        steps += 1
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        # Store the transition, then let the agent update itself.
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
        score += reward
        alpha.append(agent.alpha)

    score_history.append(score)
    alpha_history.append(np.mean(alpha))
    # Average over (at most) the last 100 recorded scores.
    avg_score = np.mean(score_history[-100:])
    if avg_score > best_score:
        # New best running average: remember it and checkpoint the models.
        best_score = avg_score
        agent.save_models()
    # '%.1f' (one decimal) for both values; the original '%1.f' printed the
    # average with no decimals at all.
    print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
          'steps ', steps, 'alpha ', np.mean(alpha))

# Plot both recorded histories; each entry is (series, file stem, plot options).
for series, file_stem, options in (
    (score_history, figure_file_return,
     dict(color='lightgreen', avg_color='green', Ylabel='Return')),
    (alpha_history, figure_file_alpha,
     dict(color='blue', Ylabel='Temperature alpha')),
):
    plot_learning_curve(series, file_stem + '.png', **options)

# Keep the raw return history next to its figure.
np.save(figure_file_return, score_history)
Esempio n. 47
0
def test_learning_curve():
    """Plot and show the learning curve of ``estimator`` on columns 0-4 of ``data``.

    The target is the 'outcome-class' column; 3-fold cross-validation is run
    at 20 training sizes between 10% and 99% of the data.
    """
    feature_columns = list(range(5))
    features = data[feature_columns].values
    targets = data['outcome-class'].values
    sizes = np.linspace(.1, 0.99, 20)
    fig = plot_learning_curve(
        estimator,
        "50 k-NN learning curve",
        features,
        targets,
        cv=3,
        verbose=2,
        train_sizes=sizes,
    )
    fig.show()