Example #1
0
def main():
    """Load the TCGA breast-invasive-carcinoma dataset, preprocess it, and
    train a decision tree on the rows with a known survival time.

    Side effects: reads the CSV from the working directory and prints the
    dataframe shape before/after dropping rows missing 'OS MONTHS'.
    """
    cancer = pd.read_csv("Breast Invasive Carcinoma (TCGA, Provisional).csv")

    # Months-of-survival cutoff used to binarize the target column.
    target_threshold = 32
    cancer["survival"] = ""

    # Fix: pass the named threshold instead of repeating the magic number 32.
    cancer = prepro.setup_target_variable(cancer, 'OS MONTHS', 'survival',
                                          target_threshold)

    # Encode the categorical clinical columns.
    cancer = prepro.handle_categorical_variables(cancer, [
        'AJCC METASTASIS PATHOLOGIC PM', 'AJCC NODES PATHOLOGIC PN',
        'AJCC PATHOLOGIC TUMOR STAGE', 'AJCC TUMOR PATHOLOGIC PT',
        'DFS STATUS', 'ETHNICITY', 'GENDER'
    ])

    cancer = prepro.handle_na(cancer)

    # Fix: Python 3 print() calls, consistent with the rest of the file.
    print("Shape(Before removing NaN): ", cancer.shape)
    cancer_without_nan = cancer.dropna(subset=['OS MONTHS'])
    print("Shape(After removing NaN): ", cancer_without_nan.shape)

    models.DecisionTree(cancer_without_nan, [2])
import config
import models


def huber_approx_obj(preds, dtrain):
    """Gradient/Hessian pair of the pseudo-Huber loss, an xgboost-compatible
    smooth surrogate for mean absolute error.

    preds  -- model predictions (array-like)
    dtrain -- true labels (use dtrain.get_labels() when called via xgb.train())

    Returns (grad, hess) as expected by xgboost custom objectives.
    """
    h = 1  # the Huber delta parameter
    residual = preds - dtrain
    curvature = 1 + (residual / h) ** 2
    root = np.sqrt(curvature)
    # Gradient saturates toward +/-1 for large residuals, mimicking MAE.
    grad = residual / root
    hess = 1 / curvature / root
    return grad, hess


# Registry of candidate estimators keyed by a short code name.
# NOTE(review): this assignment rebinds the name `models` from the imported
# module (still used on the right-hand side while it is evaluated) to this
# dict, shadowing the module for any later code in this file — confirm that
# is intended.
models = {
    "dt": models.DecisionTree(),
    "rf": models.RandomForest(),
    "lr": models.LR(),
    "xgb": models.XGBoost(),
    "svm": models.SVM(),
    "lgb": models.LGB(),
    # "mlp": models.MLP(),
    "lstm": models.LSTM()
}

# To report the final accuracy, compute the mean across runs; express the
# mean absolute error as a percentage so overall performance is easy to read.
Example #3
0
    #loaded_model = load("model_SVC.joblib")
    #SVM.test_svm_classifier(loaded_model, val_data, val_labels)

    loaded_model = load_model("models/best_model_DNN_Adam.h5")
    NN.test_neural_network(loaded_model, val_data, val_labels)


if __name__ == "__main__":
    total_features = 545333  # total unique features
    testing_set_size = 1500  # set site that will be used to create random test set
    malware_ratio = 0.3  # malware ratio in the set size

    print("Creating data-labels...")
    onehot.create_list_of_apps()  # function from set_one_encoding.py

    # initialize sklearn models
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    val_runs = 8

    #evaluate_models(val_runs)
    evaluate_on_test_set()
Example #4
0
    print "question 12"
    print "Eout(g1) is " + str(g_err_list[0])
    utils.curve(range(1, 301), g_err_list, '12.png', 't', 'Ein')

    # question 13
    alpha_list, g_list, _, _ = models.adaboost(features_train, 300)
    G_err_list = models.predict_with_G(features_test, g_list, alpha_list)
    print "question 13"
    print "Eout(G) is " + str(G_err_list[299])
    utils.curve(range(1, 301), G_err_list, '13.png', 't', 'Eout')

    data_train = utils.load_data_tree('hw3_train.dat')
    data_test = utils.load_data_tree('hw3_test.dat')

    # question 15
    root, leaf_count = models.DecisionTree(data_train)
    models.DecisionTree_print(root)
    p_in = models.DecisionTree_predict(root, data_train)
    p_out = models.DecisionTree_predict(root, data_test)
    ans_in = [d[2] for d in data_train]
    ans_out = [d[2] for d in data_test]
    Ein = models.error_0_1(p_in, ans_in)
    Eout = models.error_0_1(p_out, ans_out)
    print "question 15"
    print "Ein: %f, Eout: %f" % (Ein, Eout)

    # question 16
    root, leaf_count = models.DecisionTree(data_train)
    ans_in = [d[2] for d in data_train]
    ans_out = [d[2] for d in data_test]
    Ein_list = []
Example #5
0
def train_external_detector():
    """Train an "external detector" that separates original samples (class 0)
    from adversarial samples (class 1).

    Visible pipeline:
      1. Evaluate a pre-trained Keras model on the clean training data and
         print its confusion-matrix statistics.
      2. Craft an adversarial variant of every malware sample (label == 1).
      3. Re-evaluate on clean and adversarial data and print the change in
         false-negative rate ("misclassification rate") and the average
         number of feature changes ("distortion").
      4. Build an augmented set (clean -> class 0, adversarial -> class 1),
         train a fresh neural network on it, and fit a battery of sklearn
         classifiers for comparison.

    Relies on module-level helpers/globals defined elsewhere in this project
    (create_sets, craft_adversarial_samples, generate_neural_network,
    train_neural_network, total_features, dir_path, changes_dict, ...).
    """

    train_data, train_labels, test_data, test_labels = create_sets()

    # --- baseline: pre-trained detector on the clean training set ---------
    trained_model = tf.keras.models.load_model('best_model_Adam.h5')
    predict_original = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels,
                                 np.argmax(predict_original, axis=1))
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR_original = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print(confusion)
    print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:",
          FNR_original)
    # Running totals for the average number of feature changes per
    # successfully perturbed malware sample.
    average_changes = 0
    amount_malwares = 0
    averageChanges = 0

    # the numpy array will be filled dynamically
    # NOTE(review): feature width 3880 is hard-coded — confirm it matches the
    # feature count the loaded model expects.
    adversarial_data = np.zeros((0, 3880), dtype=float)

    # --- craft an adversarial sample for every malware row ----------------
    for i in range(len(train_data)):
        if train_labels[i] == 1:

            # Keep the 2-D (1, n_features) shape via a one-row slice.
            x = train_data[i:i + 1]
            # print("x: ", x)
            # print(x.shape)
            try:
                adv_x, changes = craft_adversarial_samples(
                    x, 0, trained_model, 1)
                # print(adv_x)
                # append the adversarial data to the numpy array
                adversarial_data = np.concatenate((adversarial_data, adv_x))
                if changes >= 0:
                    average_changes += changes
                    amount_malwares += 1
            except NameError:
                # NOTE(review): silently skipping NameError can mask a missing
                # helper definition — confirm this is intentional.
                pass
            except ValueError:
                # Crafting can fail for individual samples; skip them.
                pass

    if amount_malwares > 0:
        averageChanges += (average_changes / float(amount_malwares))

    # --- re-evaluate on a fresh copy of the clean training set ------------
    train_data, train_labels, test_data, test_labels = create_sets()
    predictions = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels, np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial  FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)
    # --- evaluate the pre-trained model on the crafted adversarial set ----
    predictions = trained_model.predict(adversarial_data)
    adversarial_labels = np.ones((len(adversarial_data), ), dtype=int)
    confusion = confusion_matrix(adversarial_labels,
                                 np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial  FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)
    # NOTE(review): changes_dict is not defined in this function — presumably
    # a global populated by craft_adversarial_samples; verify it exists.
    print(changes_dict)
    del predict_original, FNR_original, predictions, confusion, TP, TN, FP, FN, FNR, FPR, accuracy

    # concatenate legit with produced adversarial input
    final_train_data = np.concatenate((train_data, adversarial_data))
    print("final train data shape:", final_train_data.shape)

    # NOTE(review): this overwrites the real class labels — every clean row
    # becomes class 0 for the detector task.
    train_labels = np.zeros((len(train_labels), ),
                            dtype=int)  # fill with 0 (the original class)
    print("train labels shape:", train_labels.shape)

    # NOTE(review): "adverarial_labels" is a typo (used consistently below);
    # it also duplicates adversarial_labels built earlier.
    adverarial_labels = np.ones(
        (len(adversarial_data), ),
        dtype=int)  # fill with 1 (the adversarial class)
    print("adversarial labels:", adverarial_labels.shape)

    final_train_labels = np.concatenate((train_labels, adverarial_labels))
    print("final labels shape:", final_train_labels.shape)
    print("Unique classes:", np.unique(final_train_labels))

    del train_data, train_labels, adversarial_data, adverarial_labels
    #shuffle the set
    # NOTE(review): if this is sklearn.utils.shuffle it returns shuffled
    # copies and the result is discarded here — confirm an in-place shuffle
    # was intended.
    shuffle(final_train_data, final_train_labels, random_state=123)

    # train with the augmented dataset (with adverarial examples belong to class '1')
    model = generate_neural_network(total_features, [200, 200], 0.2, 0.001,
                                    "glorot_uniform", "zeros", "relu", 2)
    # NOTE(review): validation uses the training set itself, so val metrics
    # will not reflect generalization — confirm this is deliberate.
    train_neural_network(model,
                         epochs=30,
                         batch_size=150,
                         features=final_train_data,
                         labels=final_train_labels,
                         verbose=2,
                         validation=True,
                         val_data=final_train_data,
                         val_labels=final_train_labels,
                         callbacks=True,
                         path=dir_path + "logs/fit/",
                         model_name="external_detector_2")
    # --- fit classical sklearn baselines on the same augmented data -------
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    model = GNB.train_gaussian_naive_bayes_classifier(
        final_train_data, final_train_labels)  # train Naive Bayes
    score_GNB = GNB.evaluate_gaussian_naive_bayes_classifier(
        model, final_train_data, final_train_labels)  # test performance
    print("GNB", score_GNB)

    model = MNB.train_multi_naive_bayes_classifier(final_train_data,
                                                   final_train_labels)
    score_MNB = MNB.evaluate_multi_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("MNB", score_MNB)

    model = CNB.train_complement_naive_bayes_classifier(
        final_train_data, final_train_labels)
    score_CNB = CNB.evaluate_complement_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("CNB", score_CNB)

    model = BNB.train_bernoulli_naive_bayes_classifier(final_train_data,
                                                       final_train_labels)
    # NOTE(review): BNB alone is evaluated on (test_data, test_labels) while
    # every other model is scored on the training set — confirm which was
    # intended.
    score_BNB = BNB.evaluate_bernoulli_naive_bayes_classifier(
        model, test_data, test_labels)
    print("BNB", score_BNB)

    model = DT.train_decision_tree_classifier(
        final_train_data, final_train_labels)  # train Decision Tree Classifier
    score_dt = DT.evaluate_decision_tree_classifier(model, final_train_data,
                                                    final_train_labels)
    print("DT:", score_dt)

    model = LR.train_logistic_regression_classifier(
        final_train_data, final_train_labels)  # train logistic Regression
    score_lr = LR.evaluate_logistic_regression_classifier(
        model, final_train_data, final_train_labels)
    print("LR", score_lr)

    model = KNN.train_knn_classifier(
        final_train_data,
        final_train_labels)  # train k-Nearest Neighbors Classifier
    score_knn = KNN.evaluate_knn_classifier(model, final_train_data,
                                            final_train_labels)
    print("KNN", score_knn)

    model = SVM.train_svm_classifier(
        final_train_data, final_train_labels)  # train Support Vector Machines
    score_svm = SVM.evaluate_svm_classifier(model, final_train_data,
                                            final_train_labels)
    print("SVM", score_svm)

    model = RF.train_random_forest_classifier(
        final_train_data, final_train_labels)  # train Random Forest
    score_rf = RF.evaluate_random_forest_classifier(model, final_train_data,
                                                    final_train_labels)
    print("RF:", score_rf)
import graphviz 
from sklearn import tree

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Load the feature subset chosen during feature selection.
with open('../cache/selected_features.json') as f:
    data = json.load(f)
features = data['diabetes']
print("Selected Features: ", features)

# Build the experiment-3 dataset from the selected feature columns.
x, y = exp.experiment_3(features)

# Hold out the final 30 rows for validation; train on everything before.
cutoff = x.shape[0] - 30
trainX = x[:cutoff]
trainY = y[:cutoff]
valx = x[cutoff:]
valy = y[cutoff:]

# Fit the project's decision-tree model (last arg presumably a depth/param
# setting — project-defined, confirm against model.DecisionTree).
temp = model.DecisionTree(valx, valy, trainX, trainY, 4)