예제 #1
0
def perform_cross_validation_split(data, num_splits, filename):
    """Run k-fold cross-validation over five classifiers and save mean metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table containing a 'defect_status' label column.
    num_splits : int
        Number of folds for KFold.
    filename : str
        Dataset file name of the form '...IST_<name>.csv'; <name> is used
        when saving results.
    """
    y = data['defect_status']
    # drop() instead of `del data[...]` so the caller's DataFrame is not
    # mutated as a side effect of calling this function.
    X = data.drop(columns=['defect_status']).to_numpy()

    # performing cross-validation
    kf = KFold(n_splits=num_splits)

    for train, test in kf.split(X):
        X_train, X_test = X[train], X[test]
        # .iloc: KFold yields positional indices, which only coincide with
        # label-based Series indexing when the index is a default RangeIndex.
        y_train, y_test = y.iloc[train], y.iloc[test]
        actual = y_test.tolist()

        # Run each classifier on this fold and accumulate its metrics.
        fold_classifiers = (
            ("cart", classifiers.cart),                # Classification and Regression Tree
            ("knn", classifiers.knn),                  # K-Nearest Neighbor
            ("lr", classifiers.logistic_regression),   # Logistic Regression
            ("nb", classifiers.naive_bayes),           # Naive Bayes
            ("rf", classifiers.random_forest),         # Random Forest
        )
        for short_name, clf in fold_classifiers:
            prediction = clf(X_train, y_train, X_test)
            calculate_metrics(prediction, actual, short_name)

    print("\nCalculate mean metrics...")
    # Extract the dataset name between 'IST_' and '.csv'.
    filename = filename.split("IST_")[1].split(".csv")[0]

    # (label, per-fold metric accumulators) per classifier; the accumulator
    # lists are module-level globals filled by calculate_metrics.
    metric_groups = (
        ("CART", precision_results_cart, recall_results_cart,
         f_score_results_cart, auc_results_cart),
        ("KNN", precision_results_knn, recall_results_knn,
         f_score_results_knn, auc_results_knn),
        ("LR", precision_results_lr, recall_results_lr,
         f_score_results_lr, auc_results_lr),
        ("NB", precision_results_nb, recall_results_nb,
         f_score_results_nb, auc_results_nb),
        ("RF", precision_results_rf, recall_results_rf,
         f_score_results_rf, auc_results_rf),
    )
    for label, precisions, recalls, f_scores, aucs in metric_groups:
        precision, recall, f_score, auc = calculate_mean_metrics(
            precisions, recalls, f_scores, aucs)
        save_results(precision, recall, f_score, auc, filename, label)
예제 #2
0
def cross_val(data, labels, k, smote, classifier):
    """Perform k-fold cross validation with the named classifier.

    NOTE(review): despite the original docstring mentioning true/false
    positive/negative counts, the function returns only the labels and
    predictions of the LAST fold; earlier folds are discarded. That
    behavior is preserved here.

    Parameters:
        data: indexable collection of feature rows.
        labels: collection of labels aligned with `data`.
        k: number of folds for KFold.
        smote: when True, SMOTE the training split before fitting.
        classifier: one of 'linear', 'logistic', 'decision tree',
            'neuralnetwork', 'naive bayes', 'randomforest', 'knn'.

    Returns:
        [test_label, predicted] for the final fold.

    Raises:
        ValueError: if `classifier` is not a recognized name.
    """
    # Dispatch table replaces the long if/elif chain.
    classifier_funcs = {
        'linear': classifiers.lin_reg,
        'logistic': classifiers.log_reg,
        'decision tree': classifiers.decision_tree,
        'neuralnetwork': classifiers.neuralnetwork,
        'naive bayes': classifiers.naive_bayes,
        'randomforest': classifiers.randomforest,
        'knn': classifiers.knn,
    }
    # Fail fast: the original only printed a warning and later crashed
    # with NameError on the unbound `predicted`. An explicit ValueError
    # is clearer and fires before any folds are computed.
    if classifier not in classifier_funcs:
        raise ValueError('Wrong name supplied: %s' % classifier)
    classify = classifier_funcs[classifier]

    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(data):
        # Build the train/test splits for this fold.
        train_set = [data[i] for i in train_index]
        train_label = [labels[i] for i in train_index]
        test_set = [data[i] for i in test_index]
        test_label = [labels[i] for i in test_index]

        # Apply SMOTEing when smote parameter is True
        if smote:
            train_set, train_label = SMOTE.SMOTEd(train_set, train_label)

        predicted = classify(train_set, test_set, train_label)

    return [test_label, predicted]
예제 #3
0
random.shuffle(train_set)
# Each train_set row is [label, feature_vector]; split into parallel arrays.
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

test_set = []
# Hoisted loop invariant: index of the unknown-word bucket.
unk_index = vocab['</unk>']
for key, value in test.items():
    for file_id, words in value.items():
        # Bag-of-words counts over the training vocabulary; out-of-vocab
        # words are counted under the '</unk>' bucket.
        bow = np.zeros(len(vocab))
        for word in words:
            bow[vocab.get(word, unk_index)] += 1
        # Term frequency; max(..., 1) avoids ZeroDivisionError on an empty
        # document (bow is all zeros then, so tf is correctly all zeros).
        tf = bow / max(len(words), 1)
        tfidf = tf * idf_test
        # TF-IDF-weighted average of the 300-d word vectors.
        # NOTE(review): assumes glove[i] is the 300-d vector for vocab
        # index i — confirm against how `glove` is built.
        avg_glove = np.zeros(300)
        for i in range(len(vocab)):
            avg_glove += tfidf[i] * glove[i]
        test_set.append([key, avg_glove])

print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

print("Calling Classifiers\n")
print("Accuracy for Naive Bayes is : ", naive_bayes(X_train, Y_train, X_test, Y_test))
print("Accuracy for Logistic Regression is : ", logistic_regression(X_train, Y_train, X_test, Y_test))
print("Accuracy for SVM is : ", svm(X_train, Y_train, X_test, Y_test))
print("Accuracy for FF Neural Net is : ", fnn(X_train, Y_train, X_test, Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
예제 #4
0
print('shuffling train')
random.shuffle(train_set)
# Rows are [label, feature_vector]; peel them apart into parallel arrays.
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

test_set = []
for key, value in test.items():
    for file_id, words in value.items():
        # Length-normalized bag-of-words over the vocabulary; words not in
        # the vocabulary are counted under the '</unk>' entry.
        bow = np.zeros(len(vocab))
        for word in words:
            bow[vocab[word] if word in vocab else vocab['</unk>']] += 1
        test_set.append([key, bow / len(words)])

print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

print("Calling Classifiers\n")
# Run each classifier and report its accuracy.
for title, model in (("Naive Bayes", naive_bayes),
                     ("Logistic Regression", logistic_regression),
                     ("SVM", svm),
                     ("FF Neural Net", fnn)):
    print("Accuracy for %s is : " % title,
          model(X_train, Y_train, X_test, Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
예제 #5
0
        # Fit Random Forest; each metric comes back as a two-element
        # (mean, spread) sequence, printed as percentages.
        # NOTE(review): the meaning of the 100 / 10 arguments (trees?
        # folds?) is not visible here — confirm against clf.random_forest.
        print('Random Forest:')
        rf_acc, rf_prec, rf_rec = clf.random_forest(analysis, 100, 10)
        # NOTE(review): 'Precsion' is a typo in the output string; left
        # unchanged here since it is runtime output.
        print(
            'Accuracy: {}% +/-{}\nPrecsion: {}% +/- {}\nRecall: {}% +/- {}\n'.
            format(np.round(rf_acc[0] * 100, 2), np.round(100 * rf_acc[1], 2),
                   np.round(100 * rf_prec[0], 2),
                   np.round(100 * rf_prec[1], 2), np.round(100 * rf_rec[0], 2),
                   np.round(100 * rf_rec[1], 2)))
        # Flat result row: model name, run index `i` (from the enclosing
        # loop), then the six metric values ('+' concatenates the pairs).
        result = ['random_forest', i]
        result.extend(rf_acc + rf_prec + rf_rec)
        results.append(result)
        print('{}\n'.format('-' * 80))

        # Fit Naive-Bayes
        print('Naive-Bayes:')
        nb_acc, nb_prec, nb_rec = clf.naive_bayes(analysis, 10)
        print(
            'Accuracy: {}% +/-{}\nPrecsion: {}% +/- {}\nRecall: {}% +/- {}\n'.
            format(np.round(nb_acc[0] * 100, 2), np.round(100 * nb_acc[1], 2),
                   np.round(100 * nb_prec[0], 2),
                   np.round(100 * nb_prec[1], 2), np.round(100 * nb_rec[0], 2),
                   np.round(100 * nb_rec[1], 2)))
        # Same flat-row bookkeeping as for the random forest above.
        result = ['naive_bayes', i]
        result.extend(nb_acc + nb_prec + nb_rec)
        results.append(result)
        print('{}\n'.format('-' * 80))

        # Fit LSTM
        print('LSTM:')
        # NOTE(review): 734 / 200 look like model hyperparameters — confirm
        # their meaning against clf.LSTM_model.
        lstm_clf = clf.LSTM_model(analysis, 734, 200)
        lstm_acc, lstm_prec, lstm_rec = lstm_clf.fit()