import operator

import dill
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data


def randomForestBagging(fileNames):
    """Average the predictions of several saved random forests and report metrics."""
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    testY = testY.to_numpy().astype(int)

    # Load each pickled forest and collect its predictions on the test set.
    predictions = []
    for file in fileNames:
        with open(file, 'rb') as f:
            rf = dill.load(f)
        predictions.append(rf.predict(testX))

    # Bag the models: average the per-sample predictions and round to 0/1,
    # which amounts to a majority vote across the ensemble.
    pred = np.zeros(len(predictions[0]))
    for i in range(len(predictions[0])):
        for model_idx in range(len(fileNames)):
            pred[i] += predictions[model_idx][i]
        pred[i] = (pred[i] / len(fileNames)).round()
    pred = pred.astype(int)

    fpr, tpr, thresholds = metrics.roc_curve(testY, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred)
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(best_accuracy))
    print("Per-Class Precision: {}".format(precision_array))
    print("Per-Class Recall: {}".format(recall_array))
    print("Area under the ROC Curve: {}".format(auc))
def buildModels():
    """Sweep n_estimators for a random forest and save the best models via dill."""
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    trainY = trainY.to_numpy().astype(int)
    testY = testY.to_numpy().astype(int)

    accuracy = []
    pred = []
    highestTrueNeg = 98  # set to previous highest
    highestAcc = .707    # set to previous highest

    for estimators in range(20, 1000, 10):
        rf = RandomForestRegressor(n_estimators=estimators)
        rf.fit(trainX, trainY)
        # Round the regressor's continuous output to get 0/1 class labels.
        predictions = rf.predict(testX).round().astype(int)
        acc_score = metrics.accuracy_score(testY, predictions)
        accuracy.append(acc_score)
        pred.append(predictions)

        # Manually consider models with better than 69% accuracy.
        if acc_score > .69:
            conf_matrix, class_acc, recall_array, precision_array = func_confusion_matrix(testY, predictions)
            if conf_matrix[0, 0] > highestTrueNeg:
                # New best true-negative count: save this forest.
                with open("randomForestTrueNeg.obj", "wb") as f:
                    dill.dump(rf, f)
                highestTrueNeg = conf_matrix[0, 0]
            elif acc_score > highestAcc:
                # New best overall accuracy: save this forest.
                with open("randomForestAccuracy.obj", "wb") as f:
                    dill.dump(rf, f)
                highestAcc = acc_score

    index, value = max(enumerate(accuracy), key=operator.itemgetter(1))
    print("Best Number of Estimators: {}".format(20 + 10 * index))

    # Evaluate the most accurate forest's predictions on the test data.
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred[index])
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(best_accuracy))
    print("Per-Class Precision: {}".format(precision_array))
    print("Per-Class Recall: {}".format(recall_array))
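# A minimal usage sketch for the two helpers above (assumes buildModels() has
# run long enough to save both .obj files; otherwise opening them will raise
# FileNotFoundError):
if __name__ == "__main__":
    buildModels()
    randomForestBagging(["randomForestTrueNeg.obj", "randomForestAccuracy.obj"])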
## step 4 Select the best model and apply it over the testing subset
#############placeholder 4: testing #######################
best_kernel = error_min_for_kernal
best_c = error_min_for_c  # poly had many that were the "best"
model = svm.SVC(kernel=best_kernel, C=best_c)
model.fit(X=x_train, y=y_train)
#############placeholder end #######################

## step 5 evaluate your results in terms of accuracy, recall, or precision.
#############placeholder 5: metrics #######################
# func_confusion_matrix is not included here.
# You might re-use this function for Part I.
y_pred = model.predict(X_test)
conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(
    Y_test, y_pred)

print("Confusion Matrix: ")
print(conf_matrix)
print("Average Accuracy: {}".format(accuracy))
print("Per-Class Precision: {}".format(precision_array))
print("Per-Class Recall: {}".format(recall_array))
#############placeholder end #######################

#############placeholder 6: success and failure examples #######################
# Success samples: samples for which your model correctly predicts their labels
# Failure samples: samples for which your model cannot correctly predict their labels
correct_preds = []
wrong_preds = []
for i in range(len(Y_test)):
    # Record the index of each correctly and incorrectly classified sample.
    if y_pred[i] == Y_test[i]:
        correct_preds.append(i)
    else:
        wrong_preds.append(i)
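# The scripts here import func_confusion_matrix from conf_matrix, which is not
# included in this excerpt. Below is a minimal sketch consistent with how its
# return values are unpacked throughout (confusion matrix, overall accuracy,
# per-class recall, per-class precision), assuming binary 0/1 labels. It is an
# illustration only, not the repo's actual implementation:
import numpy as np

def func_confusion_matrix_sketch(y_true, y_pred):
    cm = np.zeros((2, 2), dtype=int)
    for t, p in zip(y_true, y_pred):
        cm[int(t), int(p)] += 1                 # rows = ground truth, cols = prediction
    accuracy = np.trace(cm) / cm.sum()          # fraction classified correctly
    recall = cm.diagonal() / cm.sum(axis=1)     # per class: TP / (TP + FN)
    precision = cm.diagonal() / cm.sum(axis=0)  # per class: TP / (TP + FP)
    return cm, accuracy, recall, precision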
def evaluate_confusion(model, X, Y):
    """Return the accuracy and confusion matrix for `model` evaluated on (X, Y)."""
    predicted_Y = model.predict(X)
    conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(
        Y, predicted_Y)
    return accuracy, conf_matrix
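# Usage sketch for evaluate_confusion on synthetic data (make_classification
# and LogisticRegression are stand-ins for whatever fitted model the caller has):
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    X_demo, Y_demo = make_classification(n_samples=200, random_state=0)
    demo_model = LogisticRegression(max_iter=1000).fit(X_demo, Y_demo)
    demo_acc, demo_cm = evaluate_confusion(demo_model, X_demo, Y_demo)
    print("Accuracy: {}".format(demo_acc))
    print(demo_cm)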
from sklearn.linear_model import LogisticRegression
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data
from sklearn import metrics

# Split the data into training and test sets.
X_Train, X_Test, Y_Train, Y_Test = prepare_data(test_size=0.35, seed=0)

# L1-regularized logistic regression; the liblinear solver supports the l1 penalty.
model = LogisticRegression(penalty='l1', C=8, fit_intercept=True,
                           solver='liblinear', max_iter=100)
model.fit(X_Train, Y_Train)

predictions = model.predict(X_Test)
conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(
    Y_Test, predictions)
fpr, tpr, thresholds = metrics.roc_curve(Y_Test, predictions, pos_label=1)
auc = metrics.auc(fpr, tpr)

print()
print("########### MODEL PERFORMANCE ###########")
print("Confusion Matrix: ")
print(conf_matrix)
print("Average Accuracy: {}".format(accuracy))
print("Per-Class Precision: {}".format(precision_array))
print("Per-Class Recall: {}".format(recall_array))
print("#########################################")
print("Area under the ROC Curve: {}".format(auc))
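# Note: roc_curve on hard 0/1 predictions yields only a single operating point.
# A sketch of the fuller curve using predicted probabilities instead
# (predict_proba's second column is the score for class 1):
probs = model.predict_proba(X_Test)[:, 1]
fpr_p, tpr_p, thresholds_p = metrics.roc_curve(Y_Test, probs, pos_label=1)
print("AUC from probabilities: {}".format(metrics.auc(fpr_p, tpr_p)))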
accuracy = logisticRegr.score(x_test, y_test)

### data for countplot ###
Y = kickstartLabels.astype(float)
Y = Y.reshape(-1)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
X = min_max_scaler.fit_transform(kickstartData)

# count of failed vs. successful projects
sns.countplot(x=Y, palette='hls')
plt.xlabel("Failed 0.0 : Successful 1.0")
plt.savefig("Failed_Successful_Plot")  # save before show(), which clears the figure
plt.show()

### get confusion matrix, accuracy, precision rate, and recall rate ###
CM, acc, arrR, arrP = func_confusion_matrix(y_test, predictions)

### display confusion matrix ###
df_cm = pd.DataFrame(CM, index=["Failed", "Success"],
                     columns=["Failed", "Success"])
ax = sns.heatmap(df_cm, annot=True)
ax.set(xlabel="Prediction", ylabel="Ground Truth", title="Confusion Matrix")
plt.savefig('Confusion_Matrix')
plt.show()

### print out accuracy, precision rate, and recall rate ###
print("Accuracy: ", acc)
print("Per-class precision rate: ", arrP)
print("Per-class recall rate: ", arrR)
# Randomly split the data: 50 samples for training, the next 50 for testing.
# Columns 1-5 are features; column 6 is the label.
split = np.random.permutation(n)
x_train = data[split[:50], 1:6]
y_train = data[split[:50], 6].ravel()
x_test = data[split[50:100], 1:6]
y_test = data[split[50:100], 6].ravel()

svm_model1 = svm.SVC(kernel=best_kernel, C=best_c)
svm_model1.fit(X=x_train, y=y_train)

## Evaluate the results with the confusion matrix, accuracy, recall per class, and precision per class
y_pred = svm_model1.predict(x_test)
conf_mtx, acc, recall_arr, pr_arr = func_confusion_matrix(y_test, y_pred)
print("Confusion Matrix with optimal parameters:")
print(conf_mtx)
print("Accuracy:\n{}".format(acc))
print("Per-Class Precision:\n{}".format(pr_arr))
print("Per-Class Recall:\n{}".format(recall_arr))
print()

####################################################
#### Implement a Bagging classifier (MODEL 2)   ####
####################################################
SVM = svm.LinearSVC(random_state=42)
svm_model2 = BaggingClassifier(base_estimator=SVM, n_estimators=45)
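# A sketch of how MODEL 2 might be fit and evaluated, mirroring the MODEL 1
# evaluation above (any bagging hyperparameters beyond n_estimators are not
# shown in this excerpt):
svm_model2.fit(x_train, y_train)
y_pred2 = svm_model2.predict(x_test)
conf_mtx2, acc2, recall_arr2, pr_arr2 = func_confusion_matrix(y_test, y_pred2)
print("Bagging classifier confusion matrix:")
print(conf_mtx2)
print("Bagging classifier accuracy:\n{}".format(acc2))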
        # Combine the two models: trust the random forest unless the neural net
        # also predicts class 0 and the forest's score falls below the threshold.
        nnpredict = 1 if predictions[1][i, 1] > predictions[1][i, 0] else 0
        if nnpredict == 0 and predictions[0][i] < percent * .01:
            pred[i] = 0
        else:
            pred[i] = np.round(predictions[0][i])
    pred = pred.astype(int)
    predic.append(pred)

# find the best thresholded prediction by AUC
best_auc = 0
maxidx = -1
for idx, pred in enumerate(predic):
    fpr, tpr, thresholds = metrics.roc_curve(testY, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    if auc > best_auc:
        best_auc = auc
        maxidx = idx

# evaluate the best prediction
bestPred = predic[maxidx]
conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(
    testY, bestPred)
fpr, tpr, thresholds = metrics.roc_curve(testY, bestPred, pos_label=1)
auc = metrics.auc(fpr, tpr)
print("Confusion Matrix: ")
print(str(conf_matrix))
print("Average Accuracy: {}".format(best_accuracy))
print("Per-Class Precision: {}".format(precision_array))
print("Per-Class Recall: {}".format(recall_array))
print("Area under the ROC Curve: {}".format(auc))