Example 1
def cross_validate_rf_model(X_data, y_data):
    """Cross-validate the project's RandomForest over a small grid of 'k' values."""
    k_parameters = [10, 50, 100]
    params = {"k": k_parameters}
    model = md.RandomForest()
    print("Cross validation Random Forest Model")
    model.cross_validate_model(params, X_data, y_data, 5)  # 5-fold cross-validation
    print()
    return model
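A minimal usage sketch for the snippet above, assuming `md` is the project's `models` module (imported above the excerpt) and substituting random placeholder data for the real feature matrix and labels:

import numpy as np

# Placeholder data; replace with the real feature matrix and label vector.
X_data = np.random.rand(200, 8)
y_data = np.random.randint(0, 2, size=200)

rf_model = cross_validate_rf_model(X_data, y_data)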
Example 2
import config
import models
import numpy as np


def huber_approx_obj(preds, dtrain):
    '''
    Pseudo-Huber objective for XGBoost: a smooth approximation of the
    mean absolute error, returning the gradient and hessian.
    '''
    d = preds - dtrain  # with xgb.train(), dtrain is a DMatrix: use dtrain.get_label() here
    h = 1  # h is the delta parameter of the pseudo-Huber loss
    scale = 1 + (d / h)**2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt          # first derivative
    hess = 1 / scale / scale_sqrt  # second derivative
    return grad, hess
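
# Hedged usage sketch (illustration only, not part of the original file): when
# huber_approx_obj is passed to xgb.train via its `obj` argument, xgboost calls it
# with the raw predictions and the training DMatrix, so the labels must be pulled
# out with dtrain.get_label(), as the comment above notes.
def _train_with_huber_objective(X, y, num_boost_round=50):
    import xgboost as xgb

    dtrain = xgb.DMatrix(X, label=y)

    def obj(preds, dmatrix):
        # Adapt huber_approx_obj to the (preds, DMatrix) signature expected by xgb.train.
        return huber_approx_obj(preds, dmatrix.get_label())

    return xgb.train({"max_depth": 3, "eta": 0.1}, dtrain,
                     num_boost_round=num_boost_round, obj=obj)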


# Registry of model instances keyed by a short name.
# Note: this assignment rebinds `models`, shadowing the imported module from here on.
models = {
    "dt": models.DecisionTree(),
    "rf": models.RandomForest(),
    "lr": models.LR(),
    "xgb": models.XGBoost(),
    "svm": models.SVM(),
    "lgb": models.LGB(),
    # "mlp": models.MLP(),
    "lstm": models.LSTM()
}

# To get the final accuracy, average the per-fold results; report the mean absolute
# error as a percentage, since overall performance is what is being asked for.
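Following the comment above, a hedged sketch of how per-fold results might be aggregated into the final numbers; fold_accuracies and fold_absolute_errors are hypothetical lists of per-fold scores produced by whatever cross-validation loop the project uses:

import numpy as np

# Hypothetical per-fold scores collected during cross-validation.
fold_accuracies = [0.91, 0.89, 0.93, 0.90, 0.92]
fold_absolute_errors = [0.12, 0.15, 0.10, 0.13, 0.11]

final_accuracy = np.mean(fold_accuracies)
mae_percentage = np.mean(fold_absolute_errors) * 100  # reported as a percentage

print("Final accuracy: {:.3f}".format(final_accuracy))
print("Mean absolute error: {:.1f}%".format(mae_percentage))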
Example 3
def main():
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
    usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    # Logistic Regression
    logistic_regression.fit(training_data_features,
                            training_data_labels,
                            learning_rate=0.01,
                            epochs=500)
    accuracy_mnist, confusion_mnist = logistic_regression.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = logistic_regression.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = logistic_regression.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Logistic Regression', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    accuracy_mnist, confusion_mnist = neural_network.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = neural_network.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = neural_network.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Neural Network', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = svm.predict(mnist_test_data_features,
                                                  mnist_test_data_labels)
    accuracy_usps, confusion_usps = svm.predict(usps_test_data_features,
                                                usps_test_data_labels)
    accuracy_combined, confusion_combined = svm.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('SVM', accuracy_mnist, accuracy_usps, accuracy_combined,
                   confusion_mnist, confusion_usps, confusion_combined)

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = random_forest.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = random_forest.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = random_forest.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Random Forest', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)
Example 4
    #loaded_model = load("model_SVC.joblib")
    #SVM.test_svm_classifier(loaded_model, val_data, val_labels)

    loaded_model = load_model("models/best_model_DNN_Adam.h5")
    NN.test_neural_network(loaded_model, val_data, val_labels)


if __name__ == "__main__":
    total_features = 545333  # total number of unique features
    testing_set_size = 1500  # size of the randomly drawn test set
    malware_ratio = 0.3  # fraction of malware samples in the test set

    print("Creating data-labels...")
    onehot.create_list_of_apps()  # function from set_one_encoding.py

    # initialize sklearn models
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    val_runs = 8

    #evaluate_models(val_runs)
    evaluate_on_test_set()
Example 5
feature_cols = x_train.columns
dot_data = StringIO()

export_graphviz(tree,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True,
                feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('../data/tree_entropy_{}.png'.format(datetime.date.today()))
Image(graph.create_png())

# ---------------------------------------------------------- RandomForest
r_forest = models.RandomForest(x_train, y_train.isFraud).fit()

y_train['r_forest'] = r_forest.predict(x_train)
y_test['r_forest'] = r_forest.predict(x_test)
y_validation['r_forest'] = r_forest.predict(x_validation)

cm_r_forest_train = helpers.confusion_matrix(y_train, ['Fraude', 'r_forest'])
cm_r_forest_test = helpers.confusion_matrix(y_test, ['Fraude', 'r_forest'])
cm_r_forest_val = helpers.confusion_matrix(y_validation,
                                           ['Fraude', 'r_forest'])

# ---------------------------------------------------------- Neural Net
nn_model = Sequential()

nn_model.add(Dense(15, input_dim=29, activation='relu'))
nn_model.add(Dense(15, activation='relu'))
Example 6
def predict(in_fname,
            lin_n_cv_iters,
            n_cv_iters,
            regularizations,
            n_labs,
            age_index,
            gender_index,
            out_fname,
            nn_out_fname=None,
            verbose=False,
            emb_fnames=None):

    if verbose:
        print "loading data"

    X_train, Y_train, X_validation, Y_validation, X_test, Y_test = features.get_data(
        in_fname)

    emb_data_list = [None]
    emb_fname_list = ['']
    if emb_fnames is not None:
        for emb_fname in emb_fnames:
            emb_data_list.append(emb.get_emb_data(emb_fname))
            emb_fname_list.append(emb_fname)

    if verbose:
        print "training, validating and testing models"

    results = []

    for e, emb_data in enumerate(emb_data_list):
        if verbose:
            print(str(e))

        if verbose:
            print "-->L2"

        model = models.L2(X_train, Y_train, X_validation, Y_validation, X_test,
                          Y_test, n_labs, emb_data)
        if lin_n_cv_iters == -1:
            params = [[False, True], regularizations]
        else:
            params = [['sample', False, True],
                      ['uniform', regularizations[0], regularizations[-1]]]

        model.crossvalidate(params=params,
                            param_names=['fit_intercept', 'C'],
                            n_cv_iters=lin_n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if verbose:
            print "-->L1"

        model = models.L1(X_train, Y_train, X_validation, Y_validation, X_test,
                          Y_test, n_labs, age_index, gender_index, emb_data)
        if lin_n_cv_iters == -1:
            params = [[False, True], regularizations]
        else:
            params = [['sample', False, True],
                      ['uniform', regularizations[0], regularizations[-1]]]
        model.crossvalidate(params=params,
                            param_names=['fit_intercept', 'C'],
                            n_cv_iters=lin_n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if verbose:
            print "-->RandomForest"

        model = models.RandomForest(X_train, Y_train, X_validation,
                                    Y_validation, X_test, Y_test, emb_data)
        if n_cv_iters == -1:
            params = [[1, 10, 20], [1, 3, 10],
                      ['sqrt_n_features', 'n_features'],
                      [1, 3, 10], [1, 3, 10], [True, False],
                      ['gini', 'entropy']]
        else:
            params = [['randint', 1, 20], ['randint', 1, 10],
                      ['sample', 'sqrt_n_features', 'n_features'],
                      ['randint', 1, 10], ['randint', 1, 10],
                      ['sample', True, False], ['sample', 'gini', 'entropy']]
        param_names = [
            'n_estimators', 'max_depth', 'max_features', 'min_samples_split',
            'min_samples_leaf', 'bootstrap', 'criterion'
        ]
        model.crossvalidate(params=params,
                            param_names=param_names,
                            n_cv_iters=n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if emb_data is not None:
            if verbose:
                print "-->Only embeddings"

            model = models.L(emb_data[0], Y_train, emb_data[1], Y_validation,
                             emb_data[2], Y_test, None)
            if lin_n_cv_iters == -1:
                params = [['l1', 'l2'], [False, True], regularizations]
            else:
                params = [['sample', 'l1', 'l2'], ['sample', False, True],
                          ['uniform', regularizations[0], regularizations[-1]]]

            model.crossvalidate(params=params,
                                param_names=['penalty', 'fit_intercept', 'C'],
                                n_cv_iters=lin_n_cv_iters)
            model.test()
            s = model.summarize()
            s['emb_fname'] = emb_fname_list[e]
            results.append(s)

    with open(out_fname, 'w') as fout:
        fout.write(yaml.dump(results))

    if nn_out_fname is not None:
        best_model = nn.evaluate(nn_out_fname,
                                 n_cv_iters,
                                 20,
                                 X_train,
                                 Y_train,
                                 X_validation,
                                 Y_validation,
                                 X_test,
                                 Y_test,
                                 45,
                                 models=['cnn2'],
                                 random_seed=345,
                                 verbose=verbose)
Example 7
import feature_extraction
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt
import torch
import models
from models import train_test_split
from sklearn.metrics import precision_score

data, labels, name_list = feature_extraction.raw_data(two_cat=True)

tree_counts = [5, 10, 25, 50, 100, 1000]

plt.figure()

for t in tree_counts:
    rf = models.RandomForest(n_estimators=t)
    rf.train(data, labels, name_list)
    sc = rf.test(rf.X_valid, rf.y_valid)

    # construct ROC curve for random forest (an alternative using sklearn's roc_curve follows this snippet)
    rf_voting = rf.clf.predict_proba(data)

    thresholds = np.linspace(0, 1, t + 1)
    tpr = np.zeros((thresholds.size, ))
    fpr = np.zeros((thresholds.size, ))

    for th in range(len(thresholds)):
        thresh = thresholds[th]
        pos = (rf_voting[:, 1] > thresh).astype(int)
        tpr[th] = np.sum(((pos == 1) & (labels == 1))) / np.sum(labels)
        fpr[th] = np.sum(((pos == 1) * (labels == 0))).astype(float) / np.sum(
            labels == 0)
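The manual threshold loop above can also be replaced by scikit-learn's roc_curve; a hedged sketch reusing the `labels` array, the `rf_voting` probabilities, and the tree count `t` from the snippet above:

from sklearn.metrics import roc_curve

# ROC points computed directly from the positive-class scores.
fpr_sk, tpr_sk, _ = roc_curve(labels, rf_voting[:, 1])
plt.plot(fpr_sk, tpr_sk, label='RF, {} trees'.format(t))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()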
Example 8
    for train_index, test_index in group_kfold.split(Xdata, Ydata, groups):
        model.train(Xdata[train_index], Ydata[train_index])
        Ypred = model.test(Xdata[test_index])
        confusion = sklearn.metrics.confusion_matrix(Ydata[test_index], Ypred,
                labels=features.labels)
        if sum_confusion is None:
            sum_confusion = np.zeros(confusion.shape)
        sum_confusion += confusion
    return sum_confusion / k

def select_best_model(Xdata, Ydata, models):
    avg_accuracies = [(i, k_fold_cross_validate(Xdata, Ydata, 4, model)) for
            i, model in enumerate(models)]
    print(avg_accuracies)
    return max(avg_accuracies, key=operator.itemgetter(1))


allfeatures = features.compute_or_read_features()
Xdata, Ydata = to_numpy_arrays(allfeatures)

models = [models.RandomForest(200, 'gini'), models.LogisticRegression(),
        models.SVMNonLinear('rbf'), models.SVMNonLinear('sigmoid'),
        models.NeuralNet(), models.KNN()]
#best = select_best_model(Xdata, Ydata, models)
#print(best)

for model in models:
    cm = k_fold_confusion_matrix(Xdata, Ydata, 4, model)
    save_confusion_matrix(cm, model._name)
    print(f"Confusion matrix for {model._name} saved")
Example 9
    'n_estimators': [7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    'max_depth': [2, 3, 4, 5, 6, None],
    'random_state': [42]
}

models = {
    'MLP': {
        'build_fn': m.build_MLP((24, )),
        'params': param_grid_MLP
    },
    'Decision_tree': {
        'build_fn': m.DecisionTreeModel(train=False),
        'params': param_grid_Dt
    },
    'Random_forest': {
        'build_fn': m.RandomForest(train=False),
        'params': param_grid_random_forest
    },
    'svm': {
        'build_fn': m.SVM(train=False),
        'params': param_grid_svm
    }
}
# To find the best parameters for a given model, a parameter grid must be provided
# (a sketch of such a search pipeline follows this snippet).
if finetune:
    print("Finetuning ...")

    model = models[model_name]['build_fn']
    param_grid = models[model_name]['params']
    gs, fitted_model, pred = search_pipeline(X_train,
                                             X_test,
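`search_pipeline` is not defined in the snippet above; a hedged sketch of a grid-search pipeline of that shape, built on scikit-learn's GridSearchCV. The signature and return values mirror the truncated call, but the body is an assumption and presumes the wrapped models expose the scikit-learn estimator API:

from sklearn.model_selection import GridSearchCV

def search_pipeline(X_train, X_test, y_train, model, param_grid, cv=5, scoring='accuracy'):
    """Hypothetical helper: exhaustively search param_grid and return the fitted best model."""
    gs = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    gs.fit(X_train, y_train)
    fitted_model = gs.best_estimator_
    pred = fitted_model.predict(X_test)
    return gs, fitted_model, pred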
Example 10
def main():
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)
    discriminant_analysis = DiscriminantAnalysis()
    vaecnn = deep_learning_models.VAEConvolutionNeuralNet(
        input_data.read_data_sets("data", one_hot=True), (28, 28), (28, 28))

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
    usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    # Discriminant Analysis
    IMAGE_SIZE = int(training_data_features.shape[-1]**0.5)
    discriminant_analysis.fit(
        training_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        training_data_labels)
    accuracy_mnist, confusion_mnist = discriminant_analysis.predict(
        'MNIST dataset',
        mnist_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        mnist_test_data_labels)
    accuracy_usps, confusion_usps = discriminant_analysis.predict(
        'USPS dataset',
        usps_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        usps_test_data_labels)
    accuracy_combined, confusion_combined = discriminant_analysis.predict(
        'Combined dataset',
        combined_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        combined_test_data_labels)
    print_and_plot('Bayesian Discriminant Analysis', accuracy_mnist,
                   accuracy_usps, accuracy_combined, confusion_mnist,
                   confusion_usps, confusion_combined)

    # Logistic Regression
    logistic_regression.fit(training_data_features,
                            training_data_labels,
                            learning_rate=0.01,
                            epochs=500)
    accuracy_mnist, confusion_mnist = logistic_regression.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = logistic_regression.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = logistic_regression.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Logistic Regression', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    accuracy_mnist, confusion_mnist = neural_network.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = neural_network.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = neural_network.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Neural Network', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = svm.predict(mnist_test_data_features,
                                                  mnist_test_data_labels)
    accuracy_usps, confusion_usps = svm.predict(usps_test_data_features,
                                                usps_test_data_labels)
    accuracy_combined, confusion_combined = svm.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('SVM', accuracy_mnist, accuracy_usps, accuracy_combined,
                   confusion_mnist, confusion_usps, confusion_combined)

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = random_forest.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = random_forest.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = random_forest.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Random Forest', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Restricted Boltzmann Machine
    num_hidden_nodes_list = [20, 100, 500]
    for num_hidden_nodes in num_hidden_nodes_list:
        rbm = deep_learning_models.RBM(images=input_data.read_data_sets(
            "data", one_hot=True),
                                       n_components=num_hidden_nodes,
                                       learning_rate=0.02,
                                       batch_size=100,
                                       n_iter=1000,
                                       random_state=0)
        rbm.fit()
        rbm.gibbs_sampling(1000)
        rbm.generate_images(num_hidden_nodes)

    # Variational Auto Encoders
    code_unit_list = [2, 8, 16]
    for code_unit in code_unit_list:
        vae = deep_learning_models.VAE(
            input_data.read_data_sets("data", one_hot=True), code_unit)
        vae.generate_images(epochs=20)

    # Variational Auto Encoders with Convolutional Neural Networks
    vaecnn.encode()
    vaecnn.decode()
    vaecnn.compile_()
    vaecnn.train(epochs=10, batch_size=100)
Example 11
def train_external_detector():

    train_data, train_labels, test_data, test_labels = create_sets()

    trained_model = tf.keras.models.load_model('best_model_Adam.h5')
    predict_original = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels,
                                 np.argmax(predict_original, axis=1))
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR_original = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print(confusion)
    print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:",
          FNR_original)
    average_changes = 0
    amount_malwares = 0
    averageChanges = 0

    # the numpy array will be filled dynamically
    adversarial_data = np.zeros((0, 3880), dtype=float)

    for i in range(len(train_data)):
        if train_labels[i] == 1:

            x = train_data[i:i + 1]
            # print("x: ", x)
            # print(x.shape)
            try:
                adv_x, changes = craft_adversarial_samples(
                    x, 0, trained_model, 1)
                # print(adv_x)
                # append the adversarial data to the numpy array
                adversarial_data = np.concatenate((adversarial_data, adv_x))
                if changes >= 0:
                    average_changes += changes
                    amount_malwares += 1
            except NameError:
                pass
            except ValueError:
                pass

    if amount_malwares > 0:
        averageChanges += (average_changes / float(amount_malwares))

    train_data, train_labels, test_data, test_labels = create_sets()
    predictions = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels, np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial  FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)
    predictions = trained_model.predict(adversarial_data)
    adversarial_labels = np.ones((len(adversarial_data), ), dtype=int)
    confusion = confusion_matrix(adversarial_labels,
                                 np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial  FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)
    print(changes_dict)
    del predict_original, FNR_original, predictions, confusion, TP, TN, FP, FN, FNR, FPR, accuracy

    # concatenate legit with produced adversarial input
    final_train_data = np.concatenate((train_data, adversarial_data))
    print("final train data shape:", final_train_data.shape)

    train_labels = np.zeros((len(train_labels), ),
                            dtype=int)  # fill with 0 (the original class)
    print("train labels shape:", train_labels.shape)

    adversarial_labels = np.ones(
        (len(adversarial_data), ),
        dtype=int)  # fill with 1 (the adversarial class)
    print("adversarial labels:", adversarial_labels.shape)

    final_train_labels = np.concatenate((train_labels, adversarial_labels))
    print("final labels shape:", final_train_labels.shape)
    print("Unique classes:", np.unique(final_train_labels))

    del train_data, train_labels, adversarial_data, adversarial_labels
    # shuffle the set (sklearn.utils.shuffle returns shuffled copies, so reassign)
    final_train_data, final_train_labels = shuffle(final_train_data,
                                                   final_train_labels,
                                                   random_state=123)

    # train with the augmented dataset (adversarial examples belong to class '1')
    model = generate_neural_network(total_features, [200, 200], 0.2, 0.001,
                                    "glorot_uniform", "zeros", "relu", 2)
    train_neural_network(model,
                         epochs=30,
                         batch_size=150,
                         features=final_train_data,
                         labels=final_train_labels,
                         verbose=2,
                         validation=True,
                         val_data=final_train_data,
                         val_labels=final_train_labels,
                         callbacks=True,
                         path=dir_path + "logs/fit/",
                         model_name="external_detector_2")
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    model = GNB.train_gaussian_naive_bayes_classifier(
        final_train_data, final_train_labels)  # train Naive Bayes
    score_GNB = GNB.evaluate_gaussian_naive_bayes_classifier(
        model, final_train_data, final_train_labels)  # test performance
    print("GNB", score_GNB)

    model = MNB.train_multi_naive_bayes_classifier(final_train_data,
                                                   final_train_labels)
    score_MNB = MNB.evaluate_multi_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("MNB", score_MNB)

    model = CNB.train_complement_naive_bayes_classifier(
        final_train_data, final_train_labels)
    score_CNB = CNB.evaluate_complement_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("CNB", score_CNB)

    model = BNB.train_bernoulli_naive_bayes_classifier(final_train_data,
                                                       final_train_labels)
    score_BNB = BNB.evaluate_bernoulli_naive_bayes_classifier(
        model, test_data, test_labels)
    print("BNB", score_BNB)

    model = DT.train_decision_tree_classifier(
        final_train_data, final_train_labels)  # train Decision Tree Classifier
    score_dt = DT.evaluate_decision_tree_classifier(model, final_train_data,
                                                    final_train_labels)
    print("DT:", score_dt)

    model = LR.train_logistic_regression_classifier(
        final_train_data, final_train_labels)  # train logistic Regression
    score_lr = LR.evaluate_logistic_regression_classifier(
        model, final_train_data, final_train_labels)
    print("LR", score_lr)

    model = KNN.train_knn_classifier(
        final_train_data,
        final_train_labels)  # train k-Nearest Neighbors Classifier
    score_knn = KNN.evaluate_knn_classifier(model, final_train_data,
                                            final_train_labels)
    print("KNN", score_knn)

    model = SVM.train_svm_classifier(
        final_train_data, final_train_labels)  # train Support Vector Machines
    score_svm = SVM.evaluate_svm_classifier(model, final_train_data,
                                            final_train_labels)
    print("SVM", score_svm)

    model = RF.train_random_forest_classifier(
        final_train_data, final_train_labels)  # train Random Forest
    score_rf = RF.evaluate_random_forest_classifier(model, final_train_data,
                                                    final_train_labels)
    print("RF:", score_rf)
Example 12
def main():
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)
    discriminant_analysis = DiscriminantAnalysis()

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
    usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    # Discriminant Analysis
    IMAGE_SIZE = int(training_data_features.shape[-1]**0.5)
    discriminant_analysis.fit(
        training_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        training_data_labels)
    accuracy_mnist, confusion_mnist = discriminant_analysis.predict(
        'MNIST dataset',
        mnist_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        mnist_test_data_labels)
    accuracy_usps, confusion_usps = discriminant_analysis.predict(
        'USPS dataset',
        usps_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        usps_test_data_labels)
    accuracy_combined, confusion_combined = discriminant_analysis.predict(
        'Combined dataset',
        combined_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        combined_test_data_labels)
    print_and_plot('Bayesian Discriminant Analysis', accuracy_mnist,
                   accuracy_usps, accuracy_combined, confusion_mnist,
                   confusion_usps, confusion_combined)

    # Logistic Regression
    logistic_regression.fit(training_data_features,
                            training_data_labels,
                            learning_rate=0.01,
                            epochs=500)
    accuracy_mnist, confusion_mnist = logistic_regression.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = logistic_regression.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = logistic_regression.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Logistic Regression', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    accuracy_mnist, confusion_mnist = neural_network.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = neural_network.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = neural_network.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Neural Network', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = svm.predict(mnist_test_data_features,
                                                  mnist_test_data_labels)
    accuracy_usps, confusion_usps = svm.predict(usps_test_data_features,
                                                usps_test_data_labels)
    accuracy_combined, confusion_combined = svm.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('SVM', accuracy_mnist, accuracy_usps, accuracy_combined,
                   confusion_mnist, confusion_usps, confusion_combined)

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = random_forest.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = random_forest.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = random_forest.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Random Forest', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)