Example no. 1
import dill
import numpy as np
from sklearn import metrics
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data


def randomForestBagging(fileNames):
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    testY = testY.to_numpy().astype(int)
    predictions = []
    for file in fileNames:
        with open(file, 'rb') as f:
            rf = dill.load(f)
        predictions.append(rf.predict(testX))
    pred = np.zeros(len(predictions[0]))
    for i in range(len(predictions[0])):
        # average the vote of each saved model for sample i
        for model_idx in range(len(fileNames)):
            pred[i] += predictions[model_idx][i]
        pred[i] = (pred[i] / len(fileNames)).round()
    pred = pred.astype(int)
    fpr, tpr, thresholds = metrics.roc_curve(testY, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred)
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(str(best_accuracy)))
    print("Per-Class Precision: {}".format(str(precision_array)))
    print("Per-Class Recall: {}".format(str(recall_array)))
    print("Area under the ROC Curve: {}".format(auc))
Example no. 2
import operator

import dill
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data


def buildModels():
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    trainY = trainY.to_numpy().astype(int)
    testY = testY.to_numpy().astype(int)

    accuracy = []
    pred = []
    highestTrueNeg = 98  # set to previous highest
    highestAcc = .707  # set to previous highest
    for estimators in range(20, 1000, 10):
        rf = RandomForestRegressor(n_estimators=estimators)
        rf.fit(trainX, trainY)
        predictions = rf.predict(testX).round().astype(int)
        score = metrics.accuracy_score(testY, predictions)
        accuracy.append(score)
        pred.append(predictions)
        if score > .69:  # manually consider models with better than 69% accuracy
            conf_matrix, class_acc, recall_array, precision_array = func_confusion_matrix(testY, predictions)
            if conf_matrix[0, 0] > highestTrueNeg:
                with open("randomForestTrueNeg.obj", "wb") as tn:
                    dill.dump(rf, tn)
                highestTrueNeg = conf_matrix[0, 0]
            elif score > highestAcc:
                with open("randomForestAccuracy.obj", "wb") as acc:
                    dill.dump(rf, acc)
                highestAcc = class_acc

    index, value = max(enumerate(accuracy), key=operator.itemgetter(1))

    print("Best Number of Estimators: {}".format(20 + 10*(index)))
    # Use the forest's predict method on the test data
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred[index])
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(str(best_accuracy)))
    print("Per-Class Precision: {}".format(str(precision_array)))
    print("Per-Class Recall: {}".format(str(recall_array)))
Example no. 3
## step 4 Select the best model and apply it over the testing subset
#############placeholder 4:testing  #######################

best_kernel = error_min_for_kernal
best_c = error_min_for_c  # poly had many that were the "best"
model = svm.SVC(kernel=best_kernel, C=best_c)
model.fit(X=x_train, y=y_train)
#############placeholder end #######################

## step 5 evaluate your results in terms of accuracy, recall, or precision.

#############placeholder 5: metrics #######################
# func_confusion_matrix is not included
# You might re-use this function for Part I.
y_pred = model.predict(x_test)
conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(
    y_test, y_pred)
# print(y_test)
# print(y_pred)
print("Confusion Matrix: ")
print(conf_matrix)
print("Average Accuracy: {}".format(accuracy))
print("Per-Class Precision: {}".format(precision_array))
print("Per-Class Recall: {}".format(recall_array))
#############placeholder end #######################

#############placeholder 6: success and failure examples #######################
# Success samples: samples for which your model can correctly predict their labels
# Failure samples: samples for which your model can not correctly predict their labels
correct_preds = []
wrong_preds = []
for i in range(0, len(y_test)):
    # record which test samples were classified correctly and which were not
    if y_pred[i] == y_test[i]:
        correct_preds.append(i)
    else:
        wrong_preds.append(i)

def evaluate_confusion(model, X, Y):
    predicted_Y = model.predict(X)
    conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(
        Y, predicted_Y)
    return accuracy, conf_matrix
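A hypothetical call to evaluate_confusion, reusing the fitted SVC and the test split from the placeholders above (the names model, x_test, y_test are assumptions, not part of the original fragment):

# Sketch only: evaluate the fitted SVC with the helper defined above.
accuracy, conf_matrix = evaluate_confusion(model, x_test, y_test)
print("Accuracy: {}".format(accuracy))
print(conf_matrix)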
from sklearn.linear_model import LogisticRegression
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data
from sklearn import metrics

# Prepare training and test data by splitting the dataset
X_Train, X_Test, Y_Train, Y_Test = prepare_data(test_size=0.35, seed=0)

model = LogisticRegression(penalty='l1',
                           C=8,
                           fit_intercept=True,
                           solver='liblinear',
                           max_iter=100,
                           l1_ratio=None)
model.fit(X_Train, Y_Train)
predictions = model.predict(X_Test)
conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(
    Y_Test, predictions)
fpr, tpr, thresholds = metrics.roc_curve(Y_Test, predictions, pos_label=1)
auc = metrics.auc(fpr, tpr)
print()
print("########### MODEL PERFORMANCE ###########")
print("Confusion Matrix: ")
print(conf_matrix)
print("Average Accuracy: {}".format(accuracy))
print("Per-Class Precision: {}".format(precision_array))
print("Per-Class Recall: {}".format(recall_array))
print("#########################################")
print("Area under the ROC Curve: {}".format(auc))
accuracy = logisticRegr.score(x_test, y_test)

### data for countplot ###
Y = kickstartLabels.astype(float)
Y = Y.reshape(49999,)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
X = min_max_scaler.fit_transform(kickstartData)
 
# number of fails and success
sns.countplot(x=Y, palette='hls')
plt.xlabel("Failed 0.0 : Successful 1.0")
plt.savefig("Failed_Successful_Plot")
plt.show()
 
### get confusion matrix, accuracy, precision rate, and recall rate
CM, acc, arrR, arrP = func_confusion_matrix(y_test, predictions)

### display confusion matrix ###
df_cm = pd.DataFrame(
    CM,
    index=["Failed", "Success"],
    columns=["Failed", "Success"]
)
ax = sns.heatmap(df_cm, annot=True)
ax.set(xlabel="Prediction",ylabel="Ground Truth",title="Confusion Matrix")
plt.savefig('Confusion_Matrix')
plt.show()

### print out accuracy, precision rate, and recall rate ###
print("Accuracy: ", acc)
print("Per-class precision rate: ",arrP)
Example no. 7
split = np.random.permutation(n)

x_train = data[split[:50], 1:6]
y_train = data[split[:50], 6].ravel()

x_test = data[split[50:100], 1:6]
y_test = data[split[50:100], 6].ravel()

svm_model1 = svm.SVC(kernel=best_kernel, C=best_c)
svm_model1.fit(X=x_train, y=y_train)

## Evaluate the results with the confusion matrix, accuracy, recall per class, and precision per class
y_pred = svm_model1.predict(x_test)

conf_mtx, acc, recall_arr, pr_arr = func_confusion_matrix(y_test, y_pred)

print("Confusion Matrix with optimal parameters:")
print(conf_mtx)
print("Accuracy:\n{}".format(acc))
print("Per-Class Precision\n{}".format(pr_arr))
print("Per-Class Recall:\n{}".format(recall_arr))
print()

####################################################
#### Implement a Bagging classifier (MODEL 2) ####
####################################################

SVM = svm.LinearSVC(random_state=42)
svm_model2 = BaggingClassifier(base_estimator=SVM,
                               n_estimators=45)
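A minimal sketch of fitting and scoring svm_model2, assuming x_train, y_train, x_test, y_test, and func_confusion_matrix from the snippet above are in scope:

# Sketch only: train the bagged LinearSVC and report its confusion-matrix metrics.
svm_model2.fit(x_train, y_train)
y_pred2 = svm_model2.predict(x_test)
conf_mtx2, acc2, recall_arr2, pr_arr2 = func_confusion_matrix(y_test, y_pred2)
print("Bagging classifier accuracy: {}".format(acc2))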
        nnpredict = 1 if predictions[1][i, 1] > predictions[1][i, 0] else 0
        if (nnpredict == 0 and predictions[0][i] < percent * .01):
            pred[i] = 0
        else:
            pred[i] = np.round(predictions[0][i])
    pred = pred.astype(int)
    predic.append(pred)
# find best thresholded prediction by auc
best_auc = 0
best_idx = -1
for idx, pred in enumerate(predic):
    fpr, tpr, thresholds = metrics.roc_curve(testY, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    if auc > best_auc:
        best_auc = auc
        best_idx = idx
# evaluate best prediction
bestPred = predic[best_idx]
conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(
    testY, bestPred)
fpr, tpr, thresholds = metrics.roc_curve(testY, bestPred, pos_label=1)
auc = metrics.auc(fpr, tpr)
print("Confusion Matrix: ")
print(str(conf_matrix))
print("Average Accuracy: {}".format(str(best_accuracy)))
print("Per-Class Precision: {}".format(str(precision_array)))
print("Per-Class Recall: {}".format(str(recall_array)))
print("Area under the ROC Curve: {}".format(auc))