def perform_cross_validation_split(data, num_splits, filename):
    """Run k-fold cross validation for five classifiers and persist mean metrics.

    Parameters
    ----------
    data : pandas.DataFrame with a 'defect_status' label column.
        NOTE(review): the label column is deleted in place, so the caller's
        DataFrame is mutated — confirm callers do not reuse it.
    num_splits : number of cross-validation folds.
    filename : dataset file name of the form '...IST_<name>.csv'; <name> is
        used to label the saved results. Raises IndexError if the pattern is
        missing (same as the original behaviour).

    Relies on module-level state: `classifiers`, `KFold`, `calculate_metrics`,
    `calculate_mean_metrics`, `save_results`, and the per-classifier
    `*_results_*` accumulator lists that `calculate_metrics` fills.
    """
    y = data['defect_status']
    del data["defect_status"]  # removing defect answer for training
    X = data.to_numpy()  # remove defects

    # metric tag (matches calculate_metrics) -> classifier entry point
    models = [
        ("cart", classifiers.cart),                 # Classification and Regression Tree
        ("knn", classifiers.knn),                   # K-Nearest Neighbor
        ("lr", classifiers.logistic_regression),    # Logistic Regression
        ("nb", classifiers.naive_bayes),            # Naive Bayes
        ("rf", classifiers.random_forest),          # Random Forest
    ]

    # performing cross-validation: every model is scored on every fold
    kf = KFold(n_splits=num_splits)
    for train, test in kf.split(X):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        actual = y_test.tolist()
        for tag, model in models:
            prediction = model(X_train, y_train, X_test)
            calculate_metrics(prediction, actual, tag)

    print("\nCalculate mean metrics...")
    # '...IST_<name>.csv' -> '<name>' (used as the results label)
    filename = filename.split("IST_")[1].split(".csv")[0]

    # display label -> the four accumulator lists filled by calculate_metrics
    result_lists = {
        "CART": (precision_results_cart, recall_results_cart,
                 f_score_results_cart, auc_results_cart),
        "KNN": (precision_results_knn, recall_results_knn,
                f_score_results_knn, auc_results_knn),
        "LR": (precision_results_lr, recall_results_lr,
               f_score_results_lr, auc_results_lr),
        "NB": (precision_results_nb, recall_results_nb,
               f_score_results_nb, auc_results_nb),
        "RF": (precision_results_rf, recall_results_rf,
               f_score_results_rf, auc_results_rf),
    }
    for label, lists in result_lists.items():
        precision, recall, f_score, auc = calculate_mean_metrics(*lists)
        save_results(precision, recall, f_score, auc, filename, label)
def cross_val(data, labels, k, smote, classifier):
    """Perform k-fold cross validation using the specified classifier.

    Parameters
    ----------
    data, labels : indexable samples and their ground-truth labels.
    k : number of folds.
    smote : when True, SMOTE-oversample the training split of each fold
        (the test split is never resampled).
    classifier : one of 'linear', 'logistic', 'decision tree',
        'neuralnetwork', 'naive bayes', 'randomforest', 'knn'.

    Returns
    -------
    [test_labels, predictions] accumulated over all k folds.
    (The original returned inside the fold loop, so only the first fold was
    ever evaluated, and an unknown classifier name printed a message and then
    crashed with NameError on the undefined `predicted` — both fixed. The
    unused tp/fp/fn/tn counters were removed.)

    Raises
    ------
    ValueError : if `classifier` is not a recognised name.
    """
    # name -> classifier entry point; all share (train_set, test_set, train_label)
    dispatch = {
        'linear': classifiers.lin_reg,
        'logistic': classifiers.log_reg,
        'decision tree': classifiers.decision_tree,
        'neuralnetwork': classifiers.neuralnetwork,
        'naive bayes': classifiers.naive_bayes,
        'randomforest': classifiers.randomforest,
        'knn': classifiers.knn,
    }
    if classifier not in dispatch:
        # fail fast instead of print-then-NameError
        raise ValueError('Wrong name supplied: %s' % classifier)
    model = dispatch[classifier]

    all_labels, all_predictions = [], []
    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(data):
        # make train and test sets/labels for this fold
        train_set = [data[i] for i in train_index]
        train_label = [labels[i] for i in train_index]
        test_set = [data[i] for i in test_index]
        test_label = [labels[i] for i in test_index]
        # Apply SMOTEing when smote parameter is True (training split only)
        if smote:
            train_set, train_label = SMOTE.SMOTEd(train_set, train_label)
        all_labels.extend(test_label)
        all_predictions.extend(model(train_set, test_set, train_label))
    return [all_labels, all_predictions]
# Shuffle the prepared training rows, then split into label / feature arrays.
# NOTE(review): assumes each row of train_set is [label, feature_vector] — confirm upstream.
random.shuffle(train_set)
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

# Build a TF-IDF-weighted average GloVe feature for every test document.
test_set = []
for key, value in test.items():  # presumably key = class label, value = {file_id: tokens}; verify against the loader
    for file_id, words in value.items():
        # Raw bag-of-words counts over the vocabulary; out-of-vocabulary
        # tokens are folded into the '</unk>' slot.
        bow = np.zeros(len(vocab))
        for word in words:
            if word in vocab:
                bow[vocab[word]] += 1
            else:
                bow[vocab['</unk>']] += 1
        tf = bow/len(words)   # term frequency (counts normalised by doc length)
        tfidf = tf*idf_test   # element-wise TF * IDF; idf_test is built elsewhere
        # Average the 300-d GloVe vectors weighted by each term's tf-idf score.
        avg_glove = np.zeros(300)
        for i in range(len(vocab)):
            avg_glove += tfidf[i]*glove[i]
        test_set.append([key, avg_glove])

# Shuffle test rows and split into label / feature arrays, mirroring the train split.
print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

# Score every classifier on the same train/test split.
print("Calling Classifiers\n")
print("Accuracy for Naive Bayes is : ", naive_bayes(X_train, Y_train, X_test, Y_test))
print("Accuracy for Logistic Regression is : ", logistic_regression(X_train, Y_train, X_test, Y_test))
print("Accuracy for SVM is : ", svm(X_train, Y_train, X_test, Y_test))
print("Accuracy for FF Neural Net is : ", fnn(X_train, Y_train, X_test, Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
# Assemble the training matrices: shuffle the rows, then pull labels
# (column 0) and feature vectors (column 1) into NumPy arrays.
print('shuffling train')
random.shuffle(train_set)
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

# Build a length-normalised bag-of-words vector for each test document;
# tokens missing from the vocabulary are counted under the '</unk>' slot.
test_set = []
vocab_size = len(vocab)
for label, docs in test.items():
    for doc_id, tokens in docs.items():
        counts = np.zeros(vocab_size)
        for token in tokens:
            slot = vocab[token] if token in vocab else vocab['</unk>']
            counts[slot] += 1
        test_set.append([label, counts / len(tokens)])

# Same shuffle-and-split treatment for the test rows.
print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

# Evaluate each classifier on the identical split and report accuracy.
print("Calling Classifiers\n")
print("Accuracy for Naive Bayes is : ", naive_bayes(X_train, Y_train, X_test, Y_test))
print("Accuracy for Logistic Regression is : ", logistic_regression(X_train, Y_train, X_test, Y_test))
print("Accuracy for SVM is : ", svm(X_train, Y_train, X_test, Y_test))
print("Accuracy for FF Neural Net is : ", fnn(X_train, Y_train, X_test, Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
# Fit Random Forest and report mean +/- spread for accuracy/precision/recall.
# NOTE(review): each metric appears to be a (mean, deviation) pair in [0, 1],
# scaled to percent here — confirm against clf's return contract.
print('Random Forest:')
lstm_acc = None  # (removed) -- no placeholder needed; see LSTM section below
rf_acc, rf_prec, rf_rec = clf.random_forest(analysis, 100, 10)  # presumably (n_estimators, cv_folds); verify in clf
print(
    'Accuracy: {}% +/-{}\nPrecsion: {}% +/- {}\nRecall: {}% +/- {}\n'.  # (sic: "Precsion" typo is in the emitted string; left untouched)
    format(np.round(rf_acc[0] * 100, 2), np.round(100 * rf_acc[1], 2),
           np.round(100 * rf_prec[0], 2), np.round(100 * rf_prec[1], 2),
           np.round(100 * rf_rec[0], 2), np.round(100 * rf_rec[1], 2)))
result = ['random_forest', i]  # i: current iteration index, defined elsewhere in the script
result.extend(rf_acc + rf_prec + rf_rec)  # sequence concatenation of the metric pairs
results.append(result)
print('{}\n'.format('-' * 80))  # visual separator between model reports
# Fit Naive-Bayes
print('Naive-Bayes:')
nb_acc, nb_prec, nb_rec = clf.naive_bayes(analysis, 10)  # presumably 10 = cv folds; verify in clf
print(
    'Accuracy: {}% +/-{}\nPrecsion: {}% +/- {}\nRecall: {}% +/- {}\n'.
    format(np.round(nb_acc[0] * 100, 2), np.round(100 * nb_acc[1], 2),
           np.round(100 * nb_prec[0], 2), np.round(100 * nb_prec[1], 2),
           np.round(100 * nb_rec[0], 2), np.round(100 * nb_rec[1], 2)))
result = ['naive_bayes', i]
result.extend(nb_acc + nb_prec + nb_rec)
results.append(result)
print('{}\n'.format('-' * 80))
# Fit LSTM
print('LSTM:')
lstm_clf = clf.LSTM_model(analysis, 734, 200)  # NOTE(review): magic numbers 734/200 — meaning not visible here; confirm in LSTM_model
lstm_acc, lstm_prec, lstm_rec = lstm_clf.fit()