def trials(x, Y):
    """Run ten decision-tree trials and print the metrics of each one.

    Every trial splits ``x``/``Y`` 70/30 into train and test parts (seed 42),
    keeps 90% of the training part (unseeded split), fits a
    ``DecisionTreeClassifier`` on it and evaluates on the test part.

    Parameters
    ----------
    x : feature data, passed to ``train_test_split``.
    Y : labels, passed to ``train_test_split``; precision, recall and F1 are
        computed with ``average='binary'``.

    Returns
    -------
    None. Three arrays are printed, one row per trial: accuracy (10x1),
    ``[expected loss, bias, variance]`` (10x3) and
    ``[precision, recall, f1]`` (10x3).
    """
    n_trials = 10
    accc = np.zeros((n_trials, 1))
    ebv = np.zeros((n_trials, 3))
    prf = np.zeros((n_trials, 3))

    for it in range(n_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            x, Y, train_size=0.7, random_state=42)
        # The 10% held out here is not used any further; only the reduced
        # training part is kept.
        X_train, _, y_train, _ = train_test_split(
            X_train, y_train, train_size=0.9)

        treee = DecisionTreeClassifier(
            max_depth=4,
            min_samples_split=500,
            min_samples_leaf=450,
            max_features=None,
            random_state=None,
            max_leaf_nodes=10,
        )
        treee.fit(X_train, y_train)
        y_pred = treee.predict(X_test)

        # Trial ``it`` is stored in row ``it``.  Indexing with ``it - 1`` sent
        # trial 0 to the last row (index -1) and shifted all other trials.
        accc[it] = accuracy(y_test, np.transpose(y_pred))

        mse, bias, var = bias_variance_decomp(
            treee, X_train, y_train, X_test, y_test,
            loss='0-1_loss', random_seed=123)
        ebv[it, 0:3] = mse, bias, var

        p = precision_score(y_test, y_pred, average='binary')
        r = recall_score(y_test, y_pred, average='binary')
        f = f1_score(y_test, y_pred, average='binary')
        prf[it, 0:3] = p, r, f

    print(accc)
    print()
    print(ebv)
    print()
    print(prf)
def test_mse_tree():
    """Check the MSE decomposition of a regression tree on the Boston data.

    The data is split 70/30 (seed 123, shuffled); the decomposition is run
    with ``loss='mse'`` and ``random_seed=123`` and each component, rounded
    to three decimals, is compared with a stored reference value.
    """
    features, targets = boston_housing_data()
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, targets, test_size=0.3, random_state=123, shuffle=True)

    regressor = DecisionTreeRegressor(random_state=123)
    loss, bias, variance = bias_variance_decomp(
        regressor, X_tr, y_tr, X_te, y_te, loss='mse', random_seed=123)

    # Same order as the original checks: expected loss, bias, variance.
    references = (31.536, 14.096, 17.440)
    for observed, reference in zip((loss, bias, variance), references):
        assert round(observed, 3) == reference
def bv_decomp(all_estimators, X_train, y_train, X_test, y_test):
    """Print the 0-1-loss bias-variance decomposition of each estimator.

    Parameters
    ----------
    all_estimators : mapping of display name -> estimator; iterated with
        ``.items()``.
    X_train, y_train, X_test, y_test : data forwarded unchanged to
        ``bias_variance_decomp``.

    Returns
    -------
    None. A banner is printed first, then for every estimator its average
    expected loss, bias and variance (rounded to three decimals), followed
    by a blank separator.
    """
    print("\n\n<<<<DECOMPOSING THEBIAS AND VARIANCE>>>>\n\n")

    report_lines = (
        'Average expected loss for {} is {}',
        'Average bias for {} is {}',
        'Average variance for {} is {}',
    )
    for label, estimator in all_estimators.items():
        components = bias_variance_decomp(
            estimator, X_train, y_train, X_test, y_test,
            loss='0-1_loss', random_seed=123)
        avg_expected_loss, avg_bias, avg_var = components
        for line, value in zip(report_lines,
                               (avg_expected_loss, avg_bias, avg_var)):
            print(line.format(label, round(value, 3)))
        print('\n')
def bias_variance_decomp(self, X_train, X_test, Y_train, Y_test):
    """Print the 0-1-loss bias-variance decomposition of a random forest.

    The features are standardised with a ``StandardScaler`` fitted on
    ``X_train`` only; a default ``RandomForestClassifier`` is then decomposed.

    Parameters
    ----------
    X_train, X_test : feature sets (note the order: both X before both Y).
    Y_train, Y_test : matching targets.

    Returns
    -------
    None. The three averages are printed with three decimals.
    """
    standardiser = StandardScaler()
    train_features = standardiser.fit_transform(X=X_train)
    test_features = standardiser.transform(X_test)

    forest = RandomForestClassifier()
    # NOTE(review): this call relies on the bare name resolving to a
    # module-level ``bias_variance_decomp`` and not to this method -- confirm
    # that this def lives inside a class.
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        forest, train_features, Y_train, test_features, Y_test,
        loss='0-1_loss')

    rule = '-------------------------------------------'
    print('Decomposing Bias and Variance of RandomForest')
    print(rule)
    for template, value in (
            ('Average expected loss: %.3f', avg_expected_loss),
            ('Average bias: %.3f', avg_bias),
            ('Average variance: %.3f', avg_var)):
        print(template % value)
    print(rule)
def test_mse_tree():
    """Verify reference values of the MSE decomposition for a regression tree.

    Uses the Boston housing data with a shuffled 70/30 split (seed 123) and
    ``bias_variance_decomp(..., loss='mse', random_seed=123)``; all three
    results are compared after rounding to three decimals.
    """
    split_options = dict(test_size=0.3, random_state=123, shuffle=True)
    X, y = boston_housing_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    model = DecisionTreeRegressor(random_state=123)
    result = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test,
        loss='mse', random_seed=123)
    expected_loss, bias_term, variance_term = result

    assert round(expected_loss, 3) == 31.917
    assert round(bias_term, 3) == 13.814
    assert round(variance_term, 3) == 18.102
def test_01_loss_tree():
    """Check the 0-1-loss decomposition of a classification tree on iris.

    The split is 70/30, shuffled, stratified on the labels (seed 123); each
    component is rounded to three decimals and compared with its reference.
    """
    features, labels = iris_data()
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels,
        test_size=0.3, random_state=123, shuffle=True, stratify=labels)

    classifier = DecisionTreeClassifier(random_state=123)
    loss, bias, variance = bias_variance_decomp(
        classifier, X_tr, y_tr, X_te, y_te,
        loss='0-1_loss', random_seed=123)

    # Same order as the original checks: expected loss, bias, variance.
    references = (0.062, 0.022, 0.040)
    for observed, reference in zip((loss, bias, variance), references):
        assert round(observed, 3) == reference
def test_01_loss_tree():
    """Verify reference values of the 0-1-loss decomposition on iris.

    A ``DecisionTreeClassifier`` (seed 123) is decomposed on a stratified,
    shuffled 70/30 split (seed 123) with ``random_seed=123``.
    """
    split_options = dict(test_size=0.3, random_state=123, shuffle=True)
    X, y = iris_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, **split_options)

    model = DecisionTreeClassifier(random_state=123)
    result = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test,
        loss='0-1_loss', random_seed=123)
    expected_loss, bias_term, variance_term = result

    assert round(expected_loss, 3) == 0.062
    assert round(bias_term, 3) == 0.022
    assert round(variance_term, 3) == 0.040
def test_mse_bagging():
    """Check the MSE decomposition of a 10-tree bagging regressor.

    The Boston housing data is split 70/30 (seed 123, shuffled) and a
    ``BaggingRegressor`` of decision trees is decomposed with ``loss='mse'``.
    Results are compared after rounding to two decimals; on failure the
    unrounded value is shown as the assertion message.
    """
    X, y = boston_housing_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123, shuffle=True)

    tree = DecisionTreeRegressor(random_state=123)
    # The wrapped estimator is passed positionally: the keyword was renamed
    # from ``base_estimator`` to ``estimator`` in newer scikit-learn, while
    # the first positional slot works with either name.
    bag = BaggingRegressor(tree, n_estimators=10, random_state=123)

    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        bag, X_train, y_train, X_test, y_test,
        loss='mse', random_seed=123)

    assert round(avg_expected_loss, 2) == 20.24, avg_expected_loss
    assert round(avg_bias, 2) == 15.63, avg_bias
    assert round(avg_var, 2) == 4.61, avg_var
def test_mse_bagging():
    """Check the MSE decomposition of a 100-tree bagging regressor.

    The Boston housing data is split 70/30 (seed 123, shuffled) and a
    ``BaggingRegressor`` of decision trees is decomposed with ``loss='mse'``.
    Each component is compared after rounding to three decimals.
    """
    X, y = boston_housing_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123, shuffle=True)

    tree = DecisionTreeRegressor(random_state=123)
    # The wrapped estimator is passed positionally: the keyword was renamed
    # from ``base_estimator`` to ``estimator`` in newer scikit-learn, while
    # the first positional slot works with either name.
    bag = BaggingRegressor(tree, n_estimators=100, random_state=123)

    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        bag, X_train, y_train, X_test, y_test,
        loss='mse', random_seed=123)

    assert round(avg_expected_loss, 3) == 18.593
    assert round(avg_bias, 3) == 15.354
    assert round(avg_var, 3) == 3.239
def get_accuracy_with_confusion_matrix(train, y, test, testY, model,
                                       classicmodel, nump=False,
                                       senti="false"):
    """Vectorise the data, fit a classifier and print its evaluation.

    Parameters
    ----------
    train, test : raw training / test inputs handed to ``model``.
    y, testY : training / test labels.
    model : transformer providing ``fit_transform`` and ``transform``.
    classicmodel : estimator providing ``fit`` and ``predict``.
    nump : when true, the transformed data is converted with ``.toarray()``.
    senti : the string ``"add"`` appends the columns returned by
        ``append_senti_to_vect(train, test)`` to the transformed features;
        any other value leaves the features as they are.

    Returns
    -------
    The object returned by ``classicmodel.fit``.

    Side effects: prints the accuracy, the classification report and the
    MSE / bias / variance from ``bias_variance_decomp`` (200 rounds, seed 1),
    then calls ``confusionMatrix(testY, yhat)``.
    """
    add_senti = senti == "add"
    if add_senti:
        train_senti, test_senti = append_senti_to_vect(train, test)

    train_model = model.fit_transform(train)
    # The test data must be mapped with what was learned from the training
    # data.  Re-fitting on the test set (``fit_transform(test)``) builds a
    # separate feature space, so the columns would no longer line up with
    # the ones the classifier is trained on.
    test_model = model.transform(test)

    if nump:
        train_model = train_model.toarray()
        test_model = test_model.toarray()
    if add_senti:
        train_model = np.c_[train_model, train_senti]
        test_model = np.c_[test_model, test_senti]

    final_model = classicmodel.fit(train_model, y)
    yhat = final_model.predict(test_model)
    print("Accuracy :", np.mean(yhat == testY))
    print(classification_report(testY, yhat))

    mse, bias, var = bias_variance_decomp(
        final_model, train_model, y, test_model, testY,
        loss='mse', num_rounds=200, random_seed=1)
    print("MSE : " + str(mse))
    print("Bias : " + str(bias))
    print("Variance : " + str(var))

    confusionMatrix(testY, yhat)
    return final_model
def calculate_mse_bias_variance(X, y, test_size):
    """Decompose the MSE of the global ``pipeline`` for one test-set size.

    Parameters
    ----------
    X, y : full data set, split with ``train_test_split`` (seed 1).
    test_size : forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    None. The results are appended to the module-level lists ``errors``,
    ``biases`` and ``variances`` and printed together with
    ``estimator_names[i]`` (``i`` is read from the enclosing scope).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=1)
    mse, bias, var = bias_variance_decomp(
        pipeline, X_tr, y_tr, X_te, y_te,
        loss='mse', num_rounds=200, random_seed=1)

    for history, value in ((errors, mse), (biases, bias), (variances, var)):
        history.append(value)

    print('Estimator :', estimator_names[i])
    print('Test Size :', test_size)
    for template, value in (('MSE: %.3f', mse),
                            ('Bias: %.3f', bias),
                            ('Variance: %.3f', var)):
        print(template % value)
    print('--------------------------------')
def perform_bias_variance_decomposition(estimator, x_train, y_train, x_test,
                                        y_test, model_uid, n_boostraps=20):
    """
    Decomposes the average 0-1 loss of a model into bias and variance and
    writes the result to
    ``modeling/<model_uid>/diagnostics/evaluation_files/bias_variance_decomposition.csv``.

    :param estimator: estimator object
    :param x_train: training features (pandas object)
    :param y_train: training targets (pandas object)
    :param x_test: test features (pandas object)
    :param y_test: test targets (pandas object)
    :param model_uid: model uid, used as a directory name in the output path
    :param n_boostraps: number of bootstrap rounds passed as ``num_rounds``
    :return: None
    """
    # bias_variance_decomp indexes its inputs positionally and rejects pandas
    # DataFrames, so plain NumPy arrays are handed over.  The conversion also
    # makes the former reset_index(drop=True) calls unnecessary.
    x_train, y_train, x_test, y_test = (
        data.to_numpy() for data in (x_train, y_train, x_test, y_test)
    )

    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, x_train, y_train, x_test, y_test,
        loss='0-1_loss', random_seed=1234, num_rounds=n_boostraps)

    output_path = os.path.join('modeling', model_uid, 'diagnostics',
                               'evaluation_files',
                               'bias_variance_decomposition.csv')
    pd.DataFrame({
        'avg_expected_loss': [avg_expected_loss],
        'avg_bias': [avg_bias],
        'avg_var': [avg_var]
    }).to_csv(output_path, index=False)
def biasVarianceTradeOff(self, lossFunction="mse", numRounds=200, display=True):
    """
    Compute, store and optionally print the bias-variance trade-off of
    ``self.model``.

    [bias_variance_decomp from mlxtend is used](http://rasbt.github.io/mlxtend/user_guide/evaluate/bias_variance_decomp/)

    Parameters
    ----------
    lossFunction: string<"mse", "0-1_loss">
        Loss function forwarded to bias_variance_decomp.

    numRounds: int range(1, inf)
        DEFAULT=200
        Number of bootstrap rounds forwarded to bias_variance_decomp.

    display: Boolean
        DEFAULT=True
        When True the stored values and ``self.accuracy`` are printed; when
        False the results are only stored on the instance.

    Returns
    -------
    (void)
        The results are kept in ``self.mse``, ``self.bias`` and ``self.var``.
    """
    decomposition = bias_variance_decomp(
        self.model,
        self.X_train, self.y_train,
        self.X_test, self.y_test,
        loss=lossFunction,
        num_rounds=numRounds,
        random_seed=123,
    )
    self.mse, self.bias, self.var = decomposition

    if not display:
        return

    # Summarise the stored results.
    for template, value in (('mse Loss: %.3f', self.mse),
                            ('Bias: %.3f', self.bias),
                            ('Variance: %.3f', self.var)):
        print(template % value)
    print("Accuracy: %.3f" % self.accuracy)
def calculate_bias_variance(self):
    """
    Calculate the MSE bias-variance decomposition of ``self.model``.

    The train/test data from ``self.data`` is converted to NumPy arrays and
    passed to ``evaluate.bias_variance_decomp`` (200 rounds, seeded with
    ``RANDOM_SEED``).  The result is appended to ``self.output`` as a
    ``"bias_variance"`` entry and printed together with ``self.name``.

    Returns None.
    """
    mse, bias, var = evaluate.bias_variance_decomp(
        self.model,
        np.array(self.data.x_train),
        np.array(self.data.y_train),
        np.array(self.data.x_test),
        np.array(self.data.y_test),
        loss="mse",
        num_rounds=200,
        random_seed=RANDOM_SEED,
    )
    self.output.append({
        "type": "bias_variance",
        "data": {
            "MSE": mse,
            "BIAS": bias,
            "VARIANCE": var,
        },
    })
    print(f"{self.name}: Total Error (Means, Bias, Variance) = "
          f"({mse}, {bias}, {var})")
    # The leftover ``breakpoint()`` that used to follow was removed: it
    # dropped every call into the debugger and blocked unattended runs.
def calculate_mse_bias_variance(X, y, test_size):
    """Decompose the MSE of a KNN regressor and record the results.

    Parameters
    ----------
    X, y : full data set; always split 80/20 (``test_size=0.2``, seed 1).
    test_size : despite its name, used as ``n_neighbors`` of the
        ``KNeighborsRegressor`` and printed under the label ``Degree``.

    Returns
    -------
    None. The results are appended to the module-level lists ``errors``,
    ``biases`` and ``variances`` and printed.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=1)

    knn = KNeighborsRegressor(n_neighbors=test_size)
    mse, bias, var = bias_variance_decomp(
        knn, X_tr, y_tr, X_te, y_te,
        loss='mse', num_rounds=200, random_seed=1)

    for history, value in ((errors, mse), (biases, bias), (variances, var)):
        history.append(value)

    print('Estimator : KNN Regressor')
    print('Degree :', test_size)
    for template, value in (('MSE: %.3f', mse),
                            ('Bias: %.3f', bias),
                            ('Variance: %.3f', var)):
        print(template % value)
    print('--------------------------------')
def pandas_input_fail():
    """Expect ``bias_variance_decomp`` to raise ``ValueError`` for a DataFrame.

    Only ``X_train`` is wrapped in a ``pd.DataFrame``; the other inputs stay
    as returned by ``train_test_split`` (stratified 70/30 split, seed 123).

    NOTE(review): the name has no ``test_`` prefix, so default pytest
    collection presumably skips this function -- confirm that is intended.
    """
    features, labels = iris_data()
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels,
        test_size=0.3, random_state=123, shuffle=True, stratify=labels)

    frame_train = pd.DataFrame(X_train)
    classifier = DecisionTreeClassifier(random_state=123)
    positional = (classifier, frame_train, y_train, X_test, y_test)

    with pytest.raises(ValueError):
        _loss, _bias, _var = bias_variance_decomp(
            *positional, loss='0-1_loss', random_seed=123)
def calculate_mse_bias_variance(X, y, test_size):
    """Decompose the MSE of an MLP regressor and record the results.

    Parameters
    ----------
    X, y : full data set; always split 80/20 (``test_size=0.2``, seed 1).
    test_size : despite its name, used as ``hidden_layer_sizes`` of the
        ``MLPRegressor`` (``max_iter=1000``) and printed as ``Test Size``.

    Returns
    -------
    None. The results are appended to the module-level lists ``errors``,
    ``biases`` and ``variances`` and printed.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=1)

    network = MLPRegressor(hidden_layer_sizes=test_size, max_iter=1000)
    mse, bias, var = bias_variance_decomp(
        network, X_tr, y_tr, X_te, y_te,
        loss='mse', num_rounds=200, random_seed=1)

    for history, value in ((errors, mse), (biases, bias), (variances, var)):
        history.append(value)

    print('Estimator : Neural Networks')
    print('Test Size :', test_size)
    for template, value in (('MSE: %.3f', mse),
                            ('Bias: %.3f', bias),
                            ('Variance: %.3f', var)):
        print(template % value)
    print('--------------------------------')
def bv_decomp_wrapper(model, xtrain, ytrain, xtest, ytest):
    """Run the 0-1-loss decomposition for one ``(path, estimator)`` pair.

    Parameters
    ----------
    model : indexable pair; ``model[0]`` is a path whose stem becomes the
        reported name, ``model[1]`` is the estimator to decompose.
    xtrain, ytrain, xtest, ytest : data forwarded to
        ``bias_variance_decomp`` (100 rounds, seed 821996).

    Returns
    -------
    dict with the keys ``name``, ``avg_bias``, ``avg_expected_loss`` and
    ``avg_var``.
    """
    label, estimator = Path(model[0]).stem, model[1]
    print(("working on model {}".format(label)))

    decomposition = bias_variance_decomp(
        estimator, xtrain, ytrain, xtest, ytest,
        loss="0-1_loss", random_seed=821996, num_rounds=100)
    expected_loss, bias, variance = decomposition

    return {
        "name": label,
        "avg_bias": bias,
        "avg_expected_loss": expected_loss,
        "avg_var": variance,
    }
import time

from mlxtend.evaluate import bias_variance_decomp

# Time the 0-1-loss decomposition of the already defined LightGBM classifier.
start = time.time()
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    classifier_lgbm,
    X_train, Y_train,
    X_test, Y_test,
    loss='0-1_loss',
    random_seed=42,
)
end = time.time()

# Elapsed wall-clock time, reported in minutes.
print("Tempo de Execução: {:.2f} min".format((end - start) / 60))

# One line per component, three decimals each.
print(
    'Average expected loss: %.3f' % avg_expected_loss,
    'Average bias: %.3f' % avg_bias,
    'Average variance: %.3f' % avg_var,
    sep='\n',
)
def SimpleDTandOptimizedDTVarianceDecomp(self):
    """
    Compare the bias and variance of a simple Decision Tree with those of a
    grid-searched ("optimised") Decision Tree.

    [Inspired from the doc/tutorials available in scikitlearn](https://scikit-learn.org/stable/auto_examples/index.html)

    The process is as follow:
        - Create the estimator (entropy tree of depth 2, then a GridSearchCV
          over depth 3..9 and both criteria with 10-fold CV).
        - Evaluate its bias variance decomposition using mlxtend
          ('mse' loss, seed 123).
        - Print the relative change (in percent) of the variance and of the
          bias from the first classifier to the second, then the raw values.
        - Show two bar charts: bias and variance of both classifiers.

    Parameters
    ----------
    (void)

    Returns
    -------
    (void)
    """
    shallow_tree = DecisionTreeClassifier(criterion="entropy", max_depth=2)
    _, bias_dt, var_dt = bias_variance_decomp(
        shallow_tree, self.X_train, self.y_train, self.X_test, self.y_test,
        'mse', random_seed=123)

    search_space = {
        "max_depth": range(3, 10),
        "criterion": ["entropy", "gini"],
    }
    tuned_tree = GridSearchCV(DecisionTreeClassifier(), search_space,
                              cv=10, n_jobs=-1, return_train_score=True)
    _, bias_dt_pruned, var_dt_pruned = bias_variance_decomp(
        tuned_tree, self.X_train, self.y_train, self.X_test, self.y_test,
        'mse', random_seed=123)

    # Relative change of the second classifier with respect to the first.
    variance_change = np.round((var_dt_pruned / var_dt - 1) * 100, 2)
    print("Variance Impact from the first to the second classifier:",
          str(variance_change) + '%')
    bias_change = np.round((bias_dt_pruned / bias_dt - 1) * 100, 2)
    print("Bias Impact from the first to the second classifier:",
          str(bias_change) + '%')

    for raw_value in (var_dt_pruned, var_dt, bias_dt_pruned, bias_dt):
        print(raw_value)

    _, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 8))
    algorithms = ['Simple DT', 'Optimised DT']
    # (axis, bar heights, colour, quantity label, title) for each chart.
    panels = (
        (axes[0], [bias_dt, bias_dt_pruned], 'lightblue', 'Bias',
         'Bias impact through a simple to an optimised DT'),
        (axes[1], [var_dt, var_dt_pruned], '#69b3a2', 'Variance',
         'Variance impact through a simple DT to an optimised DT'),
    )
    for axis, heights, colour, quantity, title in panels:
        axis.bar(algorithms, heights, color=colour)
        axis.set_ylabel(quantity)
        axis.set_title(title)
        axis.set_xticks(algorithms)
        axis.set_xticklabels(algorithms)
        axis.legend([quantity])

    plt.show()
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
# train_test_split is called below but was not imported in this script.
from sklearn.model_selection import train_test_split
from mlxtend.evaluate import bias_variance_decomp

# Prepare the dataset as inputs (feature matrix) and outputs (target vector).
data = fetch_california_housing()  # fetch the data
X = data.data  # feature matrix
y = data.target  # target vector

# Split the data into training and test samples (70/30, unseeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Define the model.
model = LinearRegression()

# Estimate the bias and variance over 50 bootstrap rounds.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    model, X_train, y_train, X_test, y_test,
    loss='mse', num_rounds=50, random_seed=20)

# Summary of the results.
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print("Accuracy: " + str((TP + TN) / (TP + TN + FP + FN))) print("Classification Error: " + str((FP + FN) / (TP + TN + FP + FN))) print("Positive Predictive Value: " + str(TP / (TP + FP))) print("Demographic Parity: " + str((TP + FP) / (TP + TN + FP + FN))) print("False Positive Rate: " + str(FP / (TN + FP))) evaluate(covered_points, "******Covered******") evaluate(uncovered_points, "******Uncovered******") covered_x_test = np.array(covered_x_test) covered_y_test = np.array(covered_y_test) uncovered_x_test = np.array(uncovered_x_test) uncovered_y_test = np.array(uncovered_y_test) mse, bias, var = bias_variance_decomp(clf, x_train_scaled, y_train, covered_x_test, covered_y_test, loss='mse', num_rounds=200, random_seed=1) print("******Covered******") print('MSE: %.3f' % mse) print('Bias: %.3f' % bias) print('Variance: %.3f' % var) mse, bias, var = bias_variance_decomp(clf, x_train_scaled, y_train, uncovered_x_test, uncovered_y_test, loss='mse', num_rounds=200, random_seed=1) print("******Uncovered******") print('MSE: %.3f' % mse) print('Bias: %.3f' % bias) print('Variance: %.3f' % var)
# y_pred is the output of the classifier for X_test.
# accuracy_score takes (y_true, y_pred); the score itself is the same in
# either order.
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

# A single import serves both F1 computations below.
from sklearn.metrics import f1_score

f1_metric = f1_score(y_test, y_pred, average='macro')
print("f1 score macro:", f1_metric)

f1_metric_micro = f1_score(y_test, y_pred, average='micro')
print("f1 score micro:", f1_metric_micro)

from mlxtend.evaluate import bias_variance_decomp

# No ``loss`` is passed, so the library default applies; the first value is
# the average expected loss under that default.  The name ``mse`` is kept
# because later code may refer to it.
mse, bias, var = bias_variance_decomp(classifier, X_train, y_train,
                                      X_test, y_test,
                                      num_rounds=200, random_seed=1)
# Summarize results.
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

from sklearn.model_selection import cross_val_score

# Fit on the full training data as an explicit step (this used to be hidden
# inside the cross_val_score call).
# NOTE(review): bias_variance_decomp presumably refits ``classifier`` on
# bootstrap samples, in which case this fit restores it -- confirm.
classifier.fit(X_train, y_train)
scores = cross_val_score(classifier, X_features_input, y_label_output, cv=5)
print('Cross Validation')
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Read the sepal/petal measurements from the user.
input_sepal_length = float(input("Enter sepal length: "))
input_sepal_width = float(input("Enter sepal width:"))
input_petal_length = float(input("Enter petal Length: "))
input_petal_width = float(input("Enter petal width: "))