def test_scoring():
    X, y = iris_data()
    clf1 = LogisticRegression(random_state=1)
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.25, random_state=123)

    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.97
    assert round(score2, 2) == 0.95

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X,
                              y=y,
                              scoring='accuracy',
                              random_seed=1)

    assert round(t, 3) == -1.539, t
    assert round(p, 3) == 0.184, p

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X,
                              y=y,
                              scoring='f1_macro',
                              random_seed=1)

    assert round(t, 3) == -1.510, t
    assert round(p, 3) == 0.191, p
def test_classifier_defaults():
    X, y = iris_data()
    clf1 = LogisticRegression(random_state=1,
                              multi_class='ovr',
                              solver='liblinear')
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.25, random_state=123)

    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.97
    assert round(score2, 2) == 0.95

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X,
                              y=y,
                              random_seed=1)

    assert round(t, 3) == -1.539, t
    assert round(p, 3) == 0.184, p

    # change max_depth of decision tree classifier
    clf2 = DecisionTreeClassifier(max_depth=1, random_state=1)
    score3 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score3, 2) == 0.63

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X,
                              y=y,
                              random_seed=1)

    assert round(t, 3) == 5.386, t
    assert round(p, 3) == 0.003, p
def test_scoring():
    X, y = iris_data()
    clf1 = LogisticRegression(random_state=1,
                              solver='liblinear',
                              multi_class='ovr')
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.25, random_state=123)

    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.97
    assert round(score2, 2) == 0.95

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X,
                              y=y,
                              scoring='accuracy',
                              random_seed=1)

    assert round(t, 3) == -1.539, t
    assert round(p, 3) == 0.184, p

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X,
                              y=y,
                              scoring='f1_macro',
                              random_seed=1)

    if Version(sklearn_version) < Version('0.20'):
        assert round(t, 3) == -1.510, t
        assert round(p, 3) == 0.191, p
    else:
        assert round(t, 3) == -1.506, t
        assert round(p, 3) == 0.192, p
def hypothesis_testing_between_two_models(estimator1, estimator2, X, y,
                                          scoring='explained_variance',
                                          random_seed=1):
    # Run the 5x2cv paired t-test between the two estimators passed in
    # (the original body referenced undefined model1/model2 instead of
    # the function's own parameters).
    t, p = paired_ttest_5x2cv(estimator1=estimator1,
                              estimator2=estimator2,
                              X=X,
                              y=y,
                              scoring=scoring,
                              random_seed=random_seed)
    return t, p
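The wrapper above is defined but never called here; a minimal usage sketch, assuming a feature matrix X and target y are already in scope and using two illustrative regressors (Lasso and Ridge are stand-ins, not part of the original code):

from sklearn.linear_model import Lasso, Ridge

# Hypothetical call to the wrapper defined above; the estimators and the
# 0.05 threshold are illustrative assumptions.
t, p = hypothesis_testing_between_two_models(Lasso(random_state=1),
                                             Ridge(random_state=1),
                                             X, y,
                                             scoring='explained_variance',
                                             random_seed=1)
if p <= 0.05:
    print('Significant difference between the two models (reject H0)')
else:
    print('No significant difference between the two models (fail to reject H0)')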
def test_regressor():
    X, y = boston_housing_data()
    reg1 = Lasso(random_state=1)
    reg2 = Ridge(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.25, random_state=123)

    score1 = reg1.fit(X_train, y_train).score(X_test, y_test)
    score2 = reg2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.66, score1
    assert round(score2, 2) == 0.68, score2

    t, p = paired_ttest_5x2cv(estimator1=reg1,
                              estimator2=reg2,
                              X=X,
                              y=y,
                              random_seed=1)

    assert round(t, 3) == -0.599, t
    assert round(p, 3) == 0.575, p
# Naive Bayes
model_nb = BernoulliNB()
model_nb.fit(X_train_smote, y_train_smote)
result_two = model_nb.score(X_test, y_test)
print("Naive Bayes Accuracy = %0.2f%%" % (result_two*100))

# Decision Tree
model_tree = DecisionTreeClassifier(min_samples_split=5)
model_tree.fit(X_train_smote, y_train_smote)
result_three = model_tree.score(X_test, y_test)
print("Decision Tree Accuracy = %0.2f%%" % (result_three*100))

# Gradient Boosting
model_gb = GradientBoostingClassifier()
model_gb.fit(X_train_smote, y_train_smote)
result_four = model_gb.score(X_test, y_test)
print("Gradient Boosting Accuracy = %0.2f%%" % (result_four*100))

# To determine the final model for this problem, I will use a 5x2 Cross Validation paired t-test.
# This test is used to determine whether there is a statistical difference between two classifiers.
# Citation: http://rasbt.github.io/mlxtend/user_guide/evaluate/paired_ttest_5x2cv/
print('\nResults of 5x2 Cross Validation Paired T-Test: ')
t, p = paired_ttest_5x2cv(estimator1=model_rf, estimator2=model_nb,
                          X=X_train_smote, y=y_train_smote, random_seed=1)
print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

# Compare the p-value against the significance level (alpha), not against the
# t statistic, to decide whether to reject the null hypothesis.
alpha = 0.05
if p > alpha:
    print('5x2 CV: The null hypothesis is not rejected. There is no statistical difference between classifiers.')
else:
    print('5x2 CV: The null hypothesis is rejected. There is a statistical difference between classifiers.')
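The same call pattern extends to the other classifier pairs fit above; a minimal sketch, assuming model_nb, model_tree, and the SMOTE-resampled training data from the block above are still in scope:

# Sketch: repeat the 5x2cv paired t-test for Naive Bayes vs. Decision Tree,
# reusing the same data and random seed as above.
t_nb_tree, p_nb_tree = paired_ttest_5x2cv(estimator1=model_nb,
                                          estimator2=model_tree,
                                          X=X_train_smote,
                                          y=y_train_smote,
                                          random_seed=1)
print('Naive Bayes vs Decision Tree: t = %.3f, p = %.3f' % (t_nb_tree, p_nb_tree))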
def compareModels_ttest(self, models=[], X=None, y=None, a=0.05):
    comparison_rows = []
    headers = ['model pairs', 'model1', 'mean1', 'std1',
               'model2', 'mean2', 'std2', 't-stat', 'p-val', 'sig/notsig']
    print(*headers)

    models = list(self.models_dict.keys())
    assert len(models) >= 2, \
        'there must be at least 2 models to run ttest statistical comparison'

    # Score the first model and treat it as the best classifier so far
    betterClassifier = models[0]
    betterCV = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,
                                       random_state=self.random_state)
    betterScore = cross_val_score(self.models_dict[betterClassifier].getBuiltModel(),
                                  X, y, scoring='accuracy', cv=betterCV)
    betterMeanScore = np.mean(betterScore)
    betterStdDev = np.std(betterScore)

    for model in models[1:]:
        row = [f'{betterClassifier} v {model}']
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,
                                     random_state=self.random_state)
        score = cross_val_score(self.models_dict[model].getBuiltModel(),
                                X, y, scoring='accuracy', cv=cv)
        meanScore = np.mean(score)
        stdDev = np.std(score)

        # Run the 5x2cv paired t-test on the current pair *before* the best
        # classifier is (possibly) replaced, so the test never compares a
        # model against itself.
        t, p = paired_ttest_5x2cv(
            estimator1=self.models_dict[betterClassifier].getBuiltModel(),
            estimator2=self.models_dict[model].getBuiltModel(),
            X=X, y=y, scoring='accuracy')

        if meanScore > betterMeanScore:
            # the current model is the better classifier
            row.extend([model, f'{meanScore:.5f}*', f'{stdDev:.5f}'])
            row.extend([betterClassifier, f'{betterMeanScore:.5f}',
                        f'{betterStdDev:.5f}'])
            betterClassifier = model
            betterMeanScore = meanScore
            betterStdDev = stdDev
        else:
            row.extend([betterClassifier, f'{betterMeanScore:.5f}*',
                        f'{betterStdDev:.5f}'])
            row.extend([model, f'{meanScore:.5f}', f'{stdDev:.5f}'])

        row.extend([f'{t:.3f}', f'{p:.3f}'])
        row.append('sig' if p <= a else 'notsig')
        comparison_rows.append(row)
        print(*row)

    print(tabulate(comparison_rows, headers=headers))
def compareModels_2x5cv(self, models=[], X=None, y=None, a=0.05):
    comparison_rows = []
    headers = ['models', 'model1', 'mean1', 'std1',
               'model2', 'mean2', 'std2', 'sig/notsig']
    comparisons_ran = dict()

    for model1 in self.models_dict.keys():
        if model1 not in comparisons_ran:
            comparisons_ran[model1] = []
        for model2 in self.models_dict.keys():
            if model2 not in comparisons_ran:
                comparisons_ran[model2] = []
            # skip self-comparisons and pairs that were already compared
            if model1 != model2 and (model1 not in comparisons_ran[model2]) \
                    and (model2 not in comparisons_ran[model1]):
                row = ['{} & {}'.format(model1, model2)]

                cv1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,
                                              random_state=self.random_state)
                scores1 = cross_val_score(self.models_dict[model1].getBuiltModel(),
                                          X, y, scoring='accuracy', cv=cv1)
                cv2 = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,
                                              random_state=self.random_state)
                # use cv2 here (the original passed cv1 for both models)
                scores2 = cross_val_score(self.models_dict[model2].getBuiltModel(),
                                          X, y, scoring='accuracy', cv=cv2)

                meanScore1 = np.mean(scores1)
                meanScore2 = np.mean(scores2)

                # mark the better of the two mean scores with an asterisk and
                # keep the column order aligned with the headers
                mean1_str = f'{meanScore1:.5f}*' if meanScore1 > meanScore2 \
                    else f'{meanScore1:.5f}'
                mean2_str = f'{meanScore2:.5f}*' if meanScore2 >= meanScore1 \
                    else f'{meanScore2:.5f}'
                row.extend([model1, mean1_str, f'{np.std(scores1):.5f}'])
                row.extend([model2, mean2_str, f'{np.std(scores2):.5f}'])

                t, p = paired_ttest_5x2cv(
                    estimator1=self.models_dict[model1].getBuiltModel(),
                    estimator2=self.models_dict[model2].getBuiltModel(),
                    X=X, y=y, scoring='accuracy')
                row.append('sig' if p <= a else 'notsig')

                comparisons_ran[model1].append(model2)
                comparisons_ran[model2].append(model1)
                comparison_rows.append(row)

    print(tabulate(comparison_rows, headers=headers))
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from mlxtend.evaluate import paired_ttest_5x2cv

iris = datasets.load_iris()

# Logistic Regression
regressionModel = LogisticRegression()

# KNN
knnModel = KNeighborsClassifier(n_neighbors=3)

# Calculate 5x2 paired t test
t, p = paired_ttest_5x2cv(estimator1=regressionModel,
                          estimator2=knnModel,
                          X=iris.data,
                          y=iris.target,
                          random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)
print('statistic=%.3f, p-value=%.3f' % (t, p))

alpha = 0.05
if p > alpha:
    print('Same proportions of errors (fail to reject H0)')
else:
    print('Different proportions of errors (reject H0)')
# NO DOCUMENTATION ON CV SPLITS, SO IT PROBABLY SHUFFLES TIME SERIES DATA
# AS IF IT WERE NON-TIME-SERIES DATA
from warnings import catch_warnings, filterwarnings

from sklearn.pipeline import Pipeline
from mlxtend.evaluate import paired_ttest_5x2cv

debug = False
for i in range(len(models)):
    for j in range(len(models)):
        if j <= i:
            continue
        # wrap models i, j in pipelines
        pipeline_i = Pipeline(steps=[('m', models[i])])
        pipeline_j = Pipeline(steps=[('m', models[j])])
        # show all warnings and fail on exception if debugging
        if debug:
            # Check hypothesis between two models
            t, p = paired_ttest_5x2cv(estimator1=pipeline_i,
                                      estimator2=pipeline_j,
                                      X=X, y=y,
                                      scoring='explained_variance',
                                      random_seed=1)
        else:
            try:
                with catch_warnings():
                    filterwarnings("ignore")
                    # Check hypothesis between two models
                    t, p = paired_ttest_5x2cv(estimator1=pipeline_i,
                                              estimator2=pipeline_j,
                                              X=X, y=y,
                                              scoring='explained_variance',
                                              random_seed=1)
            except Exception:
                error = None
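Given the concern in the comment above, a minimal time-series-aware sketch as an alternative: score two of the pipelines on the same chronologically ordered TimeSeriesSplit folds and run a paired t-test on the per-fold scores with scipy.stats.ttest_rel. The choice of the first two models and the number of splits are assumptions, not part of the original code.

# Sketch (assumption): compare two models on ordered folds instead of the
# shuffled 5x2cv splits, using a paired t-test on fold-wise scores.
from scipy.stats import ttest_rel
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

tscv = TimeSeriesSplit(n_splits=5)
pipeline_a = Pipeline(steps=[('m', models[0])])
pipeline_b = Pipeline(steps=[('m', models[1])])

# Per-fold explained-variance scores on identical, chronologically ordered folds
scores_a = cross_val_score(pipeline_a, X, y, cv=tscv, scoring='explained_variance')
scores_b = cross_val_score(pipeline_b, X, y, cv=tscv, scoring='explained_variance')

# Paired t-test on the fold-wise scores
t_stat, p_val = ttest_rel(scores_a, scores_b)
print('t statistic: %.3f, p value: %.3f' % (t_stat, p_val))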
print('F statistic: %.3f' % f)
print('p value: %.3f' % p)
# Output:
# F statistic: 10.407
# p value: 0.009

# 5X2CV PAIRED T TEST (7, 8)
from mlxtend.evaluate import paired_ttest_5x2cv
from sklearn.metrics import make_scorer, matthews_corrcoef
import time

start = time.time()
t, p = paired_ttest_5x2cv(estimator1=classifier_lgbm_7,
                          estimator2=classifier_lgbm_8,
                          X=X, y=Y,
                          scoring=make_scorer(matthews_corrcoef),
                          random_seed=42)
end = time.time()
print("Execution time: {:.2f} min".format((end - start) / 60))
# Output:
# Execution time: 30.59 min

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)
# Output:
# t statistic: 0.341
# p value: 0.747

# 5X2CV PAIRED T TEST (8, 28)
def compareModels_tTest(self, X, y, exp_type, a=0.05):
    # Initialize list objects and set up the header row
    rows = []
    headers = ['model pairs', 'model1', 'mean1', 'std1',
               'model2', 'mean2', 'std2', 'sig/notsig']
    model_ids = list(self.models.keys())

    # Initialize bestModel and respective values to the first one we have
    bestModel = self.models[model_ids[0]]
    bestModel.trainCV(X, y, exp_type=exp_type, nfolds=10, nrepeats=2,
                      metrics='accuracy')
    bestScores = bestModel.getMetrics()
    bestAvg = bestScores['Accuracy']['avg']
    bestStd = bestScores['Accuracy']['std']

    for model_id in model_ids[1:]:
        # Set up the current model
        model = self.models[model_id]

        # Set the row's first element
        row = ['{} vs {}'.format(bestModel.getName(), model.getName())]

        # Train and get the results for the current model
        model.trainCV(X, y, exp_type=exp_type, nfolds=10, nrepeats=2,
                      metrics='accuracy')
        modelScores = model.getMetrics()
        modelAvg = modelScores['Accuracy']['avg']
        modelStd = modelScores['Accuracy']['std']

        # Compare the scores with the best one so far
        swap = False
        if modelAvg > bestAvg:
            # This is the best classifier so far
            # Add the previous best model's information to the row
            row.append(bestModel.getName())
            row.append('{:.4f}'.format(bestAvg))
            row.append('{:.4f}'.format(bestStd))
            # Add the current model information to the row
            row.append(model.getName() + '*')
            row.append('{:.4f}'.format(modelAvg))
            row.append('{:.4f}'.format(modelStd))
            # Set the best model as the current one
            swap = True
        else:
            # This model performed worse than the best we've seen so far
            # Add the previous best model's information to the row
            row.append(bestModel.getName() + '*')
            row.append('{:.4f}'.format(bestAvg))
            row.append('{:.4f}'.format(bestStd))
            # Add the current model information to the row
            row.append(model.getName())
            row.append('{:.4f}'.format(modelAvg))
            row.append('{:.4f}'.format(modelStd))

        # Determine whether the difference in performance is significant
        t, p = paired_ttest_5x2cv(estimator1=model.getBuiltModel(),
                                  estimator2=bestModel.getBuiltModel(),
                                  X=X, y=y, scoring='accuracy',
                                  random_seed=0)

        # Add the t, p values to the row
        #row.append('{:.3f}'.format(t))
        #row.append('{:.3f}'.format(p))

        # Add the significance determination to the row
        if p <= a:
            row.append('sig')
        else:
            row.append('notsig')

        # Add the completed row to the list of rows
        rows.append(row)

        if swap:
            bestModel = model
            bestAvg = modelAvg
            bestStd = modelStd

    # Print the table
    print(tabulate(rows, headers=headers))
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt

disp = plot_precision_recall_curve(text_clf, X_test, y_test)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# %% space for statistical testing

# 5 x 2 cross validation
from mlxtend.evaluate import paired_ttest_5x2cv

t, p = paired_ttest_5x2cv(estimator1=text_clf,
                          estimator2=voting_clf,
                          X=X, y=y,
                          random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

# bootstrap
# configure bootstrap
n_iterations = 500
n_size = int(len(review_random_set) * 0.50)

from sklearn.utils import resample

# run bootstrap
stats = list()
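The bootstrap loop itself is cut off above; a minimal sketch of one way it could be completed, assuming the already-fitted text_clf is scored on resamples of X_test/y_test (those names and the 95% interval are assumptions, not present in the original snippet):

import numpy as np

# Sketch (assumption): resample the held-out data with replacement and score
# the fitted classifier on each resample to build a bootstrap distribution.
for i in range(n_iterations):
    X_bs, y_bs = resample(X_test, y_test, n_samples=n_size, random_state=i)
    stats.append(text_clf.score(X_bs, y_bs))

# 95% confidence interval from the bootstrap score distribution
lower, upper = np.percentile(stats, [2.5, 97.5])
print('95%% CI for accuracy: [%.3f, %.3f]' % (lower, upper))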