Exemple #1
0
def main():
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
    usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    # Logistic Regression
    logistic_regression.fit(training_data_features,
                            training_data_labels,
                            learning_rate=0.01,
                            epochs=500)
    accuracy_mnist, confusion_mnist = logistic_regression.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = logistic_regression.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = logistic_regression.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Logistic Regression', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    accuracy_mnist, confusion_mnist = neural_network.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = neural_network.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = neural_network.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Neural Network', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = svm.predict(mnist_test_data_features,
                                                  mnist_test_data_labels)
    accuracy_usps, confusion_usps = svm.predict(usps_test_data_features,
                                                usps_test_data_labels)
    accuracy_combined, confusion_combined = svm.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('SVM', accuracy_mnist, accuracy_usps, accuracy_combined,
                   confusion_mnist, confusion_usps, confusion_combined)

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = random_forest.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = random_forest.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = random_forest.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Random Forest', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)
Exemple #2
0
    #loaded_model = load("model_SVC.joblib")
    #SVM.test_svm_classifier(loaded_model, val_data, val_labels)

    loaded_model = load_model("models/best_model_DNN_Adam.h5")
    NN.test_neural_network(loaded_model, val_data, val_labels)


if __name__ == "__main__":
    total_features = 545333  # total unique features
    testing_set_size = 1500  # set site that will be used to create random test set
    malware_ratio = 0.3  # malware ratio in the set size

    print("Creating data-labels...")
    onehot.create_list_of_apps()  # function from set_one_encoding.py

    # initialize sklearn models
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    val_runs = 8

    #evaluate_models(val_runs)
    evaluate_on_test_set()
Exemple #3
0
def train_external_detector():

    train_data, train_labels, test_data, test_labels = create_sets()

    trained_model = tf.keras.models.load_model('best_model_Adam.h5')
    predict_original = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels,
                                 np.argmax(predict_original, axis=1))
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR_original = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print(confusion)
    print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:",
          FNR_original)
    average_changes = 0
    amount_malwares = 0
    averageChanges = 0

    # the numpy array will be filled dynamically
    adversarial_data = np.zeros((0, 3880), dtype=float)

    for i in range(len(train_data)):
        if train_labels[i] == 1:

            x = train_data[i:i + 1]
            # print("x: ", x)
            # print(x.shape)
            try:
                adv_x, changes = craft_adversarial_samples(
                    x, 0, trained_model, 1)
                # print(adv_x)
                # append the adversarial data to the numpy array
                adversarial_data = np.concatenate((adversarial_data, adv_x))
                if changes >= 0:
                    average_changes += changes
                    amount_malwares += 1
            except NameError:
                pass
            except ValueError:
                pass

    if amount_malwares > 0:
        averageChanges += (average_changes / float(amount_malwares))

    train_data, train_labels, test_data, test_labels = create_sets()
    predictions = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels, np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial  FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)
    predictions = trained_model.predict(adversarial_data)
    adversarial_labels = np.ones((len(adversarial_data), ), dtype=int)
    confusion = confusion_matrix(adversarial_labels,
                                 np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial  FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)
    print(changes_dict)
    del predict_original, FNR_original, predictions, confusion, TP, TN, FP, FN, FNR, FPR, accuracy

    # concatenate legit with produced adversarial input
    final_train_data = np.concatenate((train_data, adversarial_data))
    print("final train data shape:", final_train_data.shape)

    train_labels = np.zeros((len(train_labels), ),
                            dtype=int)  # fill with 0 (the original class)
    print("train labels shape:", train_labels.shape)

    adverarial_labels = np.ones(
        (len(adversarial_data), ),
        dtype=int)  # fill with 1 (the adversarial class)
    print("adversarial labels:", adverarial_labels.shape)

    final_train_labels = np.concatenate((train_labels, adverarial_labels))
    print("final labels shape:", final_train_labels.shape)
    print("Unique classes:", np.unique(final_train_labels))

    del train_data, train_labels, adversarial_data, adverarial_labels
    #shuffle the set
    shuffle(final_train_data, final_train_labels, random_state=123)

    # train with the augmented dataset (with adverarial examples belong to class '1')
    model = generate_neural_network(total_features, [200, 200], 0.2, 0.001,
                                    "glorot_uniform", "zeros", "relu", 2)
    train_neural_network(model,
                         epochs=30,
                         batch_size=150,
                         features=final_train_data,
                         labels=final_train_labels,
                         verbose=2,
                         validation=True,
                         val_data=final_train_data,
                         val_labels=final_train_labels,
                         callbacks=True,
                         path=dir_path + "logs/fit/",
                         model_name="external_detector_2")
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    model = GNB.train_gaussian_naive_bayes_classifier(
        final_train_data, final_train_labels)  # train Naive Bayes
    score_GNB = GNB.evaluate_gaussian_naive_bayes_classifier(
        model, final_train_data, final_train_labels)  # test performance
    print("GNB", score_GNB)

    model = MNB.train_multi_naive_bayes_classifier(final_train_data,
                                                   final_train_labels)
    score_MNB = MNB.evaluate_multi_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("MNB", score_MNB)

    model = CNB.train_complement_naive_bayes_classifier(
        final_train_data, final_train_labels)
    score_CNB = CNB.evaluate_complement_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("CNB", score_CNB)

    model = BNB.train_bernoulli_naive_bayes_classifier(final_train_data,
                                                       final_train_labels)
    score_BNB = BNB.evaluate_bernoulli_naive_bayes_classifier(
        model, test_data, test_labels)
    print("BNB", score_BNB)

    model = DT.train_decision_tree_classifier(
        final_train_data, final_train_labels)  # train Decision Tree Classifier
    score_dt = DT.evaluate_decision_tree_classifier(model, final_train_data,
                                                    final_train_labels)
    print("DT:", score_dt)

    model = LR.train_logistic_regression_classifier(
        final_train_data, final_train_labels)  # train logistic Regression
    score_lr = LR.evaluate_logistic_regression_classifier(
        model, final_train_data, final_train_labels)
    print("LR", score_lr)

    model = KNN.train_knn_classifier(
        final_train_data,
        final_train_labels)  # train k-Nearest Neighbors Classifier
    score_knn = KNN.evaluate_knn_classifier(model, final_train_data,
                                            final_train_labels)
    print("KNN", score_knn)

    model = SVM.train_svm_classifier(
        final_train_data, final_train_labels)  # train Support Vector Machines
    score_svm = SVM.evaluate_svm_classifier(model, final_train_data,
                                            final_train_labels)
    print("SVM", score_svm)

    model = RF.train_random_forest_classifier(
        final_train_data, final_train_labels)  # train Random Forest
    score_rf = RF.evaluate_random_forest_classifier(model, final_train_data,
                                                    final_train_labels)
    print("RF:", score_rf)
Exemple #4
0
def main():
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)
    discriminant_analysis = DiscriminantAnalysis()
    vaecnn = deep_learning_models.VAEConvolutionNeuralNet(
        input_data.read_data_sets("data", one_hot=True), (28, 28), (28, 28))

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
    usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    # Discriminant Analysis
    IMAGE_SIZE = int(training_data_features.shape[-1]**0.5)
    discriminant_analysis.fit(
        training_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        training_data_labels)
    accuracy_mnist, confusion_mnist = discriminant_analysis.predict(
        'MNIST dataset',
        mnist_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        mnist_test_data_labels)
    accuracy_usps, confusion_usps = discriminant_analysis.predict(
        'USPS dataset',
        usps_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        usps_test_data_labels)
    accuracy_combined, confusion_combined = discriminant_analysis.predict(
        'Combined dataset',
        combined_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        combined_test_data_labels)
    print_and_plot('Bayesian Discriminant Analysis', accuracy_mnist,
                   accuracy_usps, accuracy_combined, confusion_mnist,
                   confusion_usps, confusion_combined)

    # Logistic Regression
    logistic_regression.fit(training_data_features,
                            training_data_labels,
                            learning_rate=0.01,
                            epochs=500)
    accuracy_mnist, confusion_mnist = logistic_regression.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = logistic_regression.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = logistic_regression.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Logistic Regression', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    accuracy_mnist, confusion_mnist = neural_network.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = neural_network.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = neural_network.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Neural Network', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = svm.predict(mnist_test_data_features,
                                                  mnist_test_data_labels)
    accuracy_usps, confusion_usps = svm.predict(usps_test_data_features,
                                                usps_test_data_labels)
    accuracy_combined, confusion_combined = svm.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('SVM', accuracy_mnist, accuracy_usps, accuracy_combined,
                   confusion_mnist, confusion_usps, confusion_combined)

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = random_forest.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = random_forest.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = random_forest.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Random Forest', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Restricted Boltzmann Machine
    num_hidden_nodes_list = [20, 100, 500]
    for num_hidden_nodes in num_hidden_nodes_list:
        rbm = deep_learning_models.RBM(images=input_data.read_data_sets(
            "data", one_hot=True),
                                       n_components=num_hidden_nodes,
                                       learning_rate=0.02,
                                       batch_size=100,
                                       n_iter=1000,
                                       random_state=0)
        rbm.fit()
        rbm.gibbs_sampling(1000)
        rbm.generate_images(num_hidden_nodes)

    # Variational Auto Encoders
    code_unit_list = [2, 8, 16]
    for code_unit in code_unit_list:
        vae = deep_learning_models.VAE(
            input_data.read_data_sets("data", one_hot=True), code_unit)
        vae.generate_images(epochs=20)

    # Variational Auto Encoders with Convolutional Neural Networks
    vaecnn.encode()
    vaecnn.decode()
    vaecnn.compile_()
    vaecnn.train(epochs=10, batch_size=100)
Exemple #5
0
def main():
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)
    discriminant_analysis = DiscriminantAnalysis()

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
    usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    # Discriminant Analysis
    IMAGE_SIZE = int(training_data_features.shape[-1]**0.5)
    discriminant_analysis.fit(
        training_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        training_data_labels)
    accuracy_mnist, confusion_mnist = discriminant_analysis.predict(
        'MNIST dataset',
        mnist_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        mnist_test_data_labels)
    accuracy_usps, confusion_usps = discriminant_analysis.predict(
        'USPS dataset',
        usps_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        usps_test_data_labels)
    accuracy_combined, confusion_combined = discriminant_analysis.predict(
        'Combined dataset',
        combined_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        combined_test_data_labels)
    print_and_plot('Bayesian Discriminant Analysis', accuracy_mnist,
                   accuracy_usps, accuracy_combined, confusion_mnist,
                   confusion_usps, confusion_combined)

    # Logistic Regression
    logistic_regression.fit(training_data_features,
                            training_data_labels,
                            learning_rate=0.01,
                            epochs=500)
    accuracy_mnist, confusion_mnist = logistic_regression.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = logistic_regression.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = logistic_regression.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Logistic Regression', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    accuracy_mnist, confusion_mnist = neural_network.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = neural_network.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = neural_network.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Neural Network', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = svm.predict(mnist_test_data_features,
                                                  mnist_test_data_labels)
    accuracy_usps, confusion_usps = svm.predict(usps_test_data_features,
                                                usps_test_data_labels)
    accuracy_combined, confusion_combined = svm.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('SVM', accuracy_mnist, accuracy_usps, accuracy_combined,
                   confusion_mnist, confusion_usps, confusion_combined)

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    accuracy_mnist, confusion_mnist = random_forest.predict(
        mnist_test_data_features, mnist_test_data_labels)
    accuracy_usps, confusion_usps = random_forest.predict(
        usps_test_data_features, usps_test_data_labels)
    accuracy_combined, confusion_combined = random_forest.predict(
        combined_test_data_features, combined_test_data_labels)
    print_and_plot('Random Forest', accuracy_mnist, accuracy_usps,
                   accuracy_combined, confusion_mnist, confusion_usps,
                   confusion_combined)