import pandas as pd

import models
import prepro


def main():
    cancer = pd.read_csv("Breast Invasive Carcinoma (TCGA, Provisional).csv")
    target_threshold = 32  # survival cutoff in months
    cancer["survival"] = ""
    cancer = prepro.setup_target_variable(cancer, 'OS MONTHS', 'survival',
                                          target_threshold)
    cancer = prepro.handle_categorical_variables(cancer, [
        'AJCC METASTASIS PATHOLOGIC PM',
        'AJCC NODES PATHOLOGIC PN',
        'AJCC PATHOLOGIC TUMOR STAGE',
        'AJCC TUMOR PATHOLOGIC PT',
        'DFS STATUS',
        'ETHNICITY',
        'GENDER'
    ])
    cancer = prepro.handle_na(cancer)
    print("Shape (before removing NaN):", cancer.shape)
    cancer_without_nan = cancer.dropna(subset=['OS MONTHS'])
    print("Shape (after removing NaN):", cancer_without_nan.shape)
    models.DecisionTree(cancer_without_nan, [2])
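# prepro is a project-local module not shown here. A minimal sketch of what
# setup_target_variable is assumed to do (binarize overall survival at
# `threshold` months); the real implementation may differ:

import numpy as np
import pandas as pd


def setup_target_variable(df: pd.DataFrame, source_col: str, target_col: str,
                          threshold: float) -> pd.DataFrame:
    """Label rows with at least `threshold` months of survival as 1, else 0."""
    df[target_col] = np.where(df[source_col] >= threshold, 1, 0)
    return df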
import numpy as np

import config
import models


def huber_approx_obj(preds, dtrain):
    '''Pseudo-Huber objective for xgboost, approximating mean absolute error.'''
    d = preds - dtrain  # use dtrain.get_label() when called from xgb.train()
    h = 1  # h is the delta parameter of the Pseudo-Huber loss
    scale = 1 + (d / h) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1 / scale / scale_sqrt
    return grad, hess


# note: reusing the name `models` shadows the module from this point on
models = {
    "dt": models.DecisionTree(),
    "rf": models.RandomForest(),
    "lr": models.LR(),
    "xgb": models.XGBoost(),
    "svm": models.SVM(),
    "lgb": models.LGB(),
    # "mlp": models.MLP(),
    "lstm": models.LSTM()
}

# To get the final accuracy, average over runs and report the mean absolute
# error as the headline performance figure.
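# A minimal sketch of plugging a custom objective into xgboost's native API.
# The data and parameters below are placeholders; note that xgb.train() hands
# the objective a DMatrix, so labels must come from dtrain.get_label():

import numpy as np
import xgboost as xgb


def huber_approx_obj_dmatrix(preds, dtrain):
    d = preds - dtrain.get_label()  # DMatrix variant of the objective above
    h = 1
    scale = 1 + (d / h) ** 2
    scale_sqrt = np.sqrt(scale)
    return d / scale_sqrt, 1 / scale / scale_sqrt


X, y = np.random.rand(100, 5), np.random.rand(100)  # placeholder data
booster = xgb.train({"max_depth": 3}, xgb.DMatrix(X, label=y),
                    num_boost_round=50, obj=huber_approx_obj_dmatrix)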
from joblib import load
from tensorflow.keras.models import load_model

import models
import NN  # project-local module (assumed import path)
import set_one_encoding as onehot  # project-local module (assumed import path)

# loaded_model = load("model_SVC.joblib")
# SVM.test_svm_classifier(loaded_model, val_data, val_labels)
loaded_model = load_model("models/best_model_DNN_Adam.h5")
NN.test_neural_network(loaded_model, val_data, val_labels)


if __name__ == "__main__":
    total_features = 545333  # total number of unique features
    testing_set_size = 1500  # size of the randomly drawn test set
    malware_ratio = 0.3      # fraction of malware samples in the test set

    print("Creating data-labels...")
    onehot.create_list_of_apps()  # function from set_one_encoding.py

    # initialize the sklearn model wrappers
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    val_runs = 8
    # evaluate_models(val_runs)
    evaluate_on_test_set()
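# Hypothetical sketch of what NN.test_neural_network is assumed to do: run the
# loaded Keras model on held-out data and report accuracy plus the confusion
# matrix. The name and signature match the call above; the body is a guess.

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix


def test_neural_network(model, val_data, val_labels):
    preds = np.argmax(model.predict(val_data), axis=1)  # class with max score
    print("Accuracy:", accuracy_score(val_labels, preds))
    print(confusion_matrix(val_labels, preds))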
print "question 12" print "Eout(g1) is " + str(g_err_list[0]) utils.curve(range(1, 301), g_err_list, '12.png', 't', 'Ein') # question 13 alpha_list, g_list, _, _ = models.adaboost(features_train, 300) G_err_list = models.predict_with_G(features_test, g_list, alpha_list) print "question 13" print "Eout(G) is " + str(G_err_list[299]) utils.curve(range(1, 301), G_err_list, '13.png', 't', 'Eout') data_train = utils.load_data_tree('hw3_train.dat') data_test = utils.load_data_tree('hw3_test.dat') # question 15 root, leaf_count = models.DecisionTree(data_train) models.DecisionTree_print(root) p_in = models.DecisionTree_predict(root, data_train) p_out = models.DecisionTree_predict(root, data_test) ans_in = [d[2] for d in data_train] ans_out = [d[2] for d in data_test] Ein = models.error_0_1(p_in, ans_in) Eout = models.error_0_1(p_out, ans_out) print "question 15" print "Ein: %f, Eout: %f" % (Ein, Eout) # question 16 root, leaf_count = models.DecisionTree(data_train) ans_in = [d[2] for d in data_train] ans_out = [d[2] for d in data_test] Ein_list = []
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle


def train_external_detector():
    train_data, train_labels, test_data, test_labels = create_sets()

    trained_model = tf.keras.models.load_model('best_model_Adam.h5')
    predict_original = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels,
                                 np.argmax(predict_original, axis=1))
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR_original = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print(confusion)
    print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN:", TN)
    print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original)

    average_changes = 0
    amount_malwares = 0
    average_distortion = 0

    # the adversarial samples are appended to this array dynamically
    adversarial_data = np.zeros((0, 3880), dtype=float)
    for i in range(len(train_data)):
        if train_labels[i] == 1:  # craft adversarials from malware samples only
            x = train_data[i:i + 1]
            try:
                adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1)
                adversarial_data = np.concatenate((adversarial_data, adv_x))
                if changes >= 0:
                    average_changes += changes
                    amount_malwares += 1
            except (NameError, ValueError):
                pass
    if amount_malwares > 0:
        average_distortion += average_changes / float(amount_malwares)

    train_data, train_labels, test_data, test_labels = create_sets()
    predictions = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels, np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN:", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", average_distortion)

    predictions = trained_model.predict(adversarial_data)
    adversarial_labels = np.ones((len(adversarial_data),), dtype=int)
    confusion = confusion_matrix(adversarial_labels,
                                 np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN:", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", average_distortion)
    print(changes_dict)  # assumed to be populated globally by the crafting code

    del predict_original, FNR_original, predictions, confusion, TP, TN, FP, FN, FNR, FPR, accuracy

    # concatenate legitimate samples with the crafted adversarial input
    final_train_data = np.concatenate((train_data, adversarial_data))
    print("final train data shape:", final_train_data.shape)

    train_labels = np.zeros((len(train_labels),), dtype=int)  # class 0: original
    print("train labels shape:", train_labels.shape)
    adversarial_labels = np.ones((len(adversarial_data),),
                                 dtype=int)  # class 1: adversarial
    print("adversarial labels:", adversarial_labels.shape)
    final_train_labels = np.concatenate((train_labels, adversarial_labels))
    print("final labels shape:", final_train_labels.shape)
    print("Unique classes:", np.unique(final_train_labels))

    del train_data, train_labels, adversarial_data, adversarial_labels

    # shuffle the set (sklearn's shuffle returns copies, so reassign the result)
    final_train_data, final_train_labels = shuffle(
        final_train_data, final_train_labels, random_state=123)
    # train on the augmented dataset (adversarial examples belong to class '1')
    model = generate_neural_network(total_features, [200, 200], 0.2, 0.001,
                                    "glorot_uniform", "zeros", "relu", 2)
    train_neural_network(model, epochs=30, batch_size=150,
                         features=final_train_data, labels=final_train_labels,
                         verbose=2, validation=True,
                         val_data=final_train_data,
                         val_labels=final_train_labels,
                         callbacks=True, path=dir_path + "logs/fit/",
                         model_name="external_detector_2")

    # initialize the sklearn model wrappers
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    # train each classifier on the augmented set and report its score
    model = GNB.train_gaussian_naive_bayes_classifier(final_train_data,
                                                      final_train_labels)
    score_GNB = GNB.evaluate_gaussian_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("GNB", score_GNB)

    model = MNB.train_multi_naive_bayes_classifier(final_train_data,
                                                   final_train_labels)
    score_MNB = MNB.evaluate_multi_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("MNB", score_MNB)

    model = CNB.train_complement_naive_bayes_classifier(final_train_data,
                                                        final_train_labels)
    score_CNB = CNB.evaluate_complement_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("CNB", score_CNB)

    model = BNB.train_bernoulli_naive_bayes_classifier(final_train_data,
                                                       final_train_labels)
    score_BNB = BNB.evaluate_bernoulli_naive_bayes_classifier(
        model, test_data, test_labels)  # note: evaluated on the test split
    print("BNB", score_BNB)

    model = DT.train_decision_tree_classifier(final_train_data,
                                              final_train_labels)
    score_dt = DT.evaluate_decision_tree_classifier(model, final_train_data,
                                                    final_train_labels)
    print("DT:", score_dt)

    model = LR.train_logistic_regression_classifier(final_train_data,
                                                    final_train_labels)
    score_lr = LR.evaluate_logistic_regression_classifier(
        model, final_train_data, final_train_labels)
    print("LR", score_lr)

    model = KNN.train_knn_classifier(final_train_data, final_train_labels)
    score_knn = KNN.evaluate_knn_classifier(model, final_train_data,
                                            final_train_labels)
    print("KNN", score_knn)

    model = SVM.train_svm_classifier(final_train_data, final_train_labels)
    score_svm = SVM.evaluate_svm_classifier(model, final_train_data,
                                            final_train_labels)
    print("SVM", score_svm)

    model = RF.train_random_forest_classifier(final_train_data,
                                              final_train_labels)
    score_rf = RF.evaluate_random_forest_classifier(model, final_train_data,
                                                    final_train_labels)
    print("RF:", score_rf)
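# The TP/TN/FP/FN bookkeeping in train_external_detector repeats three times; a
# small helper (hypothetical, not part of the project) could compute the same
# percentages once per confusion matrix. It assumes binary labels with at least
# one positive example present:

import numpy as np
from sklearn.metrics import confusion_matrix


def confusion_metrics(labels, predictions):
    """Return (accuracy, FPR, FNR) in percent from labels and model scores."""
    cm = confusion_matrix(labels, np.argmax(predictions, axis=1), labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()  # row-major: [[TN, FP], [FN, TP]]
    fnr = fn / float(fn + tp) * 100
    fpr = fp / float(fp + tn) * 100
    acc = (tp + tn) / float(cm.sum()) * 100
    return acc, fpr, fnr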
import json
import warnings

import graphviz
from sklearn import tree

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Read the selected features
with open('../cache/selected_features.json') as f:
    data = json.load(f)
features = data['diabetes']
print("Selected Features:", features)

x, y = exp.experiment_3(features)

# hold out the last 30 rows for validation
cutoff = x.shape[0] - 30
valx = x[cutoff:]
valy = y[cutoff:]
trainX = x[:cutoff]
trainY = y[:cutoff]

temp = model.DecisionTree(valx, valy, trainX, trainY, 4)
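# graphviz and sklearn.tree are imported above but not used in this fragment;
# presumably the fitted tree gets rendered. A minimal sketch, assuming a fitted
# sklearn DecisionTreeClassifier is available as `clf` (hypothetical name):

dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=features,
                                filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph.render("diabetes_tree")  # writes diabetes_tree.pdf via Graphviz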