def test_file_full(file):
    """Train an EDBN model on the split of ``file`` and plot its scores.

    ``file`` is used as a path prefix: ``<file>_data.csv`` and
    ``<file>_labels.csv`` are handed to ``split_dataset`` together with the
    ``<file>_train.csv`` / ``<file>_test.csv`` targets and ``None`` as the
    last argument.  The test scores are written to ``<file>_output_full.csv``
    and the ROC and precision-recall plots are saved under
    ``../Data/Nolle_Graphs/`` using the last ``/``-separated part of ``file``.
    """
    train_csv = file + "_train.csv"
    test_csv = file + "_test.csv"
    result_csv = file + "_output_full.csv"

    split_dataset(file + "_data.csv", file + "_labels.csv", train_csv, test_csv, None)

    # The label column is removed from the training log before training.
    train_log = LogFile(train_csv, ",", 0, 1000000, None, "case_id", "name")
    train_log.remove_attributes(["label"])
    trained = edbn.train(train_log)

    # The test log reuses the value mapping of the training log.
    test_log = LogFile(test_csv, ",", 0, 1000000, None, "case_id", "name",
                       values=train_log.values)
    edbn.test(test_log, result_csv, trained, "label", "0", train_log)

    graph_base = "../Data/Nolle_Graphs/" + file.split("/")[-1]
    plot.plot_single_roc_curve(result_csv, file, save_file=graph_base + "_roc.png")
    plot.plot_single_prec_recall_curve(result_csv, file, save_file=graph_base + "_precrec.png")
def score_continuous_net(model, test, label_attr, output_file=None, title=None):
    """Score ``test`` with ``model``, dump the ranking to CSV and plot it.

    Args:
        model: object exposing ``test_parallel(test)``; the returned list is
            sorted in place by ``entry[0].get_total_score()``.
        test: data passed straight through to ``model.test_parallel``.
        label_attr: attribute name read from ``entry[1]``; a row counts as
            positive when that attribute is ``!= 0``.
        output_file: CSV path to write; defaults to ``"../output.csv"``.
        title: title forwarded to both plotting helpers.

    Each output line is ``Index,total_score,label_flag``.
    """
    import Utils.PlotResults as plot

    ranking = model.test_parallel(test)
    ranking.sort(key=lambda entry: entry[0].get_total_score())

    # One (index, score, is-anomalous) tuple per ranked row.  The original
    # also accumulated the scores in a second list that was never read.
    scores = [
        (
            getattr(entry[1], "Index"),
            entry[0].get_total_score(),
            getattr(entry[1], label_attr) != 0,
        )
        for entry in ranking
    ]
    print(len(scores))

    if output_file is None:
        output_file = "../output.csv"

    with open(output_file, "w") as fout:
        for row in scores:
            fout.write(",".join(str(field) for field in row))
            fout.write("\n")

    plot.plot_single_roc_curve(output_file, title)
    plot.plot_single_prec_recall_curve(output_file, title)
def test_file_bohmer(file):
    """Train the ``bohmer`` model on the split of ``file`` and plot its scores.

    ``file`` is used as a path prefix.  ``split_dataset`` receives the
    ``_data`` / ``_labels`` inputs, the ``_train`` / ``_test`` targets and
    ``10000`` as its last argument.  Scores are written to
    ``<file>_output_bohmer.csv``; ROC and precision-recall plots are saved
    under ``../Data/Nolle_Graphs/``.
    """
    train_csv = file + "_train.csv"
    test_csv = file + "_test.csv"
    result_csv = file + "_output_bohmer.csv"

    split_dataset(file + "_data.csv", file + "_labels.csv", train_csv, test_csv, 10000)

    # Both logs are loaded with convert=False for this model.
    train_log = LogFile(train_csv, ",", 0, 1000000, None, "case_id", "name",
                        convert=False)
    train_log.remove_attributes(["label"])
    trained = bohmer.train(train_log, 3, 4, 1)

    test_log = LogFile(test_csv, ",", 0, 1000000, None, "case_id", "name",
                       convert=False, values=train_log.values)
    bohmer.test(test_log, result_csv, trained, "label", 0)

    graph_base = "../Data/Nolle_Graphs/" + file.split("/")[-1]
    plot.plot_single_roc_curve(result_csv, file, save_file=graph_base + "_roc_bohmer.png")
    plot.plot_single_prec_recall_curve(result_csv, file, save_file=graph_base + "_precrec_bohmer.png")
def stephenRun():
    """Train EDBN on ``BPIC15_train_1.csv`` and score ``BPIC15_test_1.csv``.

    Scores are written to ``../Data/output.csv`` and a single ROC curve is
    plotted from that file.
    """
    # The train/test files are generated from BPIC15_x_sorted.csv with
    # anomalies introduced.  That preprocessing only has to run once:
    # preProcessData("../Data/")

    train_file = "../Data/BPIC15_train_1.csv"
    test_file = "../Data/BPIC15_test_1.csv"
    scores_file = "../Data/output.csv"

    # Training log, with the anomaly label removed before learning.
    training_log = LogFile(train_file, ",", header=0, rows=500000,
                           time_attr=None, trace_attr="Case")
    training_log.remove_attributes(["Anomaly"])

    model = edbn.train(training_log)

    # The test log reuses the value mapping of the training log.
    testing_log = LogFile(test_file, ",", header=0, rows=500000, time_attr=None,
                          trace_attr="Case", values=training_log.values)
    edbn.test(testing_log, scores_file, model, label="Anomaly", normal_val="0")

    plot.plot_single_roc_curve(scores_file)
def compare_bpic_total(path):
    """Compare the ``bmr`` model and EDBN on the combined BPIC15 data set.

    Args:
        path: directory prefix (string, concatenated directly with file
            names) containing ``BPIC15_train_total.csv`` and
            ``BPIC15_test_total.csv``.

    Score files and the comparison plots are written to ``<path>Output``,
    which is created when missing.
    """
    train = path + "BPIC15_train_total.csv"
    test = path + "BPIC15_test_total.csv"
    output = path + "Output/BPIC_15_output_total.csv"
    output_edbn = path + "Output/BPIC15_edbn_output_total.csv"
    prec_recall = path + "Output/prec_recall_total.png"
    roc = path + "Output/roc_total.png"

    # exist_ok avoids the check-then-create race of exists()+mkdir() and
    # makedirs also creates missing parent directories.
    os.makedirs(path + "Output", exist_ok=True)

    train_data = LogFile(train, ",", 0, 500000, "Time", "Case",
                         activity_attr="Activity", convert=False)
    train_data.remove_attributes(["Anomaly", "Type", "Time"])
    test_data = LogFile(test, ",", 0, 500000, "Time", "Case",
                        activity_attr="Activity", values=train_data.values,
                        convert=False)

    # The bmr model runs on the unconverted logs ...
    bohmer_model = bmr.train(train_data)
    bmr.test(test_data, output, bohmer_model, label="Anomaly", normal_val=0)

    # ... and EDBN runs after both logs have been converted to ints.
    train_data.convert2int()
    test_data.convert2int()

    edbn_model = edbn_train(train_data)
    edbn_test(test_data, output_edbn, edbn_model, label="Anomaly", normal_val="0")

    plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "EDBN"],
                                       save_file=prec_recall)
    plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "EDBN"], roc)
def duration_test():
    """Run the duration experiment for every train/test anomaly-rate pair.

    For each of the 4 x 8 rate combinations, ``RUNS`` data sets are generated
    with ``duration_generator.generate``, an EDBN model is trained on the
    k-context log with discretized ``duration_0``, and the ROC AUC of the
    test scores is collected.  The scores of each combination, together with
    their mean, median and variance, are appended to
    ``../Data/Experiments_Duration/results.txt``.
    """
    path = "../Data/Experiments_Duration/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]
    anoms_rates = [(train_rate, test_rate)
                   for train_rate in train_rates
                   for test_rate in test_rates]

    for i, rates in enumerate(anoms_rates):
        train_rate, test_rate = rates
        print(rates)

        # File names depend only on the combination, so every run of the
        # same combination overwrites the previous run's files.
        train_file = path + "%i_train_%i.csv" % (i, train_rate)
        test_file = path + "%i_test_%i.csv" % (i, test_rate)
        output_file = path + "Output_%i_%i.csv" % rates

        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            duration_generator.generate(10000, 10000, train_rate, test_rate,
                                        train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, "date", "trace")
            train_data.remove_attributes(["Anomaly"])
            test_data = LogFile(test_file, ",", 0, 1000000, "date", "trace",
                                values=train_data.values)

            train_data.keep_attributes(
                ["event", "date", "trace", "process", "resource", "random"])

            train_data.create_k_context()
            train_data.add_duration_to_k_context()
            bins = train_data.discretize("duration_0")

            # The test log is discretized with the bins returned for the
            # training log.
            test_data.create_k_context()
            test_data.add_duration_to_k_context()
            test_data.discretize("duration_0", bins)

            model = edbn.train(train_data)
            # NOTE(review): the label here is "anomaly" while the attribute
            # removed from the training log is "Anomaly" - confirm which
            # spelling the generated files use.
            edbn.test(test_data, output_file, model, "anomaly", "0")

            # Compute the AUC once; the original evaluated it twice per run.
            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" % (train_rate, test_rate))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
def compare_bpics(path):
    """Compare the ``bmr`` model and EDBN on BPIC15 data sets 1 through 5.

    For every data set the ``bmr`` model is trained and tested on the
    unconverted logs, then both logs are converted to ints and EDBN is
    trained and tested.  Score files and the comparison plots are written
    below ``<path>Output/``.
    """
    for dataset in range(1, 6):
        # Input and output locations for this data set.
        train_csv = path + "BPIC15_train_%i.csv" % (dataset)
        test_csv = path + "BPIC15_test_%i.csv" % (dataset)
        bohmer_scores = path + "Output/BPIC15_output_%i.csv" % (dataset)
        edbn_scores = path + "Output/BPIC15_edbn_output_%i.csv" % (dataset)
        prec_recall_png = path + "Output/prec_recall_%i.png" % (dataset)
        roc_png = path + "Output/roc_%i.png" % (dataset)

        train_log = LogFile(train_csv, ",", 0, 500000, "Time", "Case",
                            activity_attr="Activity", convert=False)
        train_log.remove_attributes(["Anomaly", "Type", "Time"])
        test_log = LogFile(test_csv, ",", 0, 500000, "Time", "Case",
                           activity_attr="Activity", values=train_log.values,
                           convert=False)

        # First model: runs before the logs are converted.
        likelihood_model = bmr.train(train_log)
        bmr.test(test_log, bohmer_scores, likelihood_model,
                 label="Anomaly", normal_val="0")

        # Second model: runs on the int-converted logs.
        train_log.convert2int()
        test_log.convert2int()
        edbn_model = edbn.train(train_log)
        edbn.test(test_log, edbn_scores, edbn_model,
                  label="Anomaly", normal_val="0")

        plt.plot_compare_prec_recall_curve(
            [bohmer_scores, edbn_scores], ["Likelihood Graph", "EDBN"],
            save_file=prec_recall_png)
        plt.plot_compare_roc_curve(
            [bohmer_scores, edbn_scores], ["Likelihood Graph", "EDBN"], roc_png)
def categorical_test():
    """Run the categorical experiment for every train/test anomaly-rate pair.

    For each of the 4 x 8 rate combinations, ``RUNS`` data sets are generated
    with ``generator.create_shipment_data``, an EDBN model is trained and
    tested, and the ROC AUC of the test scores is collected.  The scores of
    each combination, together with their mean, median and variance, are
    appended to ``../Data/Experiments/results.txt``.
    """
    path = "../Data/Experiments/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]
    anoms_rates = [(train_rate, test_rate)
                   for train_rate in train_rates
                   for test_rate in test_rates]

    for i, rates in enumerate(anoms_rates):
        train_rate, test_rate = rates
        print(rates)

        # File names depend only on the combination, so every run of the
        # same combination overwrites the previous run's files.
        train_file = path + "%i_train_%i.csv" % (i, train_rate)
        test_file = path + "%i_test_%i.csv" % (i, test_rate)
        output_file = path + "Output_%i_%i.csv" % rates

        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            generator.create_shipment_data(10000, 10000, train_rate, test_rate,
                                           train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, None, "Case")
            train_data.remove_attributes(["Anomaly"])
            test_data = LogFile(test_file, ",", 0, 1000000, None, "Case",
                                values=train_data.values)

            model = edbn.train(train_data)
            edbn.test(test_data, output_file, model, "Anomaly", "0")

            # Compute the AUC once; the original evaluated it twice per run.
            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" % (train_rate, test_rate))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
def breast_discrete_exec():
    """Run EDBN on the breast data set and plot ROC / precision-recall.

    The data and label CSVs (both read without header) are joined, every
    column is renamed with a ``V`` prefix and an ``ID`` column holding the
    row position is added.  The first 100 rows with ``VLabel == 0`` (label
    column dropped) form the training set; the remaining rows form the
    test set.
    """
    data_csv = "../Data/breast_data.csv"
    labels_csv = "../Data/breast_labels.csv"
    train_csv = "../Data/breast_train.csv"
    test_csv = "../Data/breast_test.csv"
    scores_csv = "../Data/breast_discrete_output.csv"

    log = pd.read_csv(data_csv, header=None)
    label_frame = pd.read_csv(labels_csv, header=None)
    log["Label"] = label_frame[0]

    # Prefix every column name (including "Label") with "V"; "ID" is added
    # afterwards and therefore keeps its plain name.
    log.columns = ["V" + str(column) for column in log.columns]
    log['ID'] = log.reset_index().index
    print(log)

    head = log[:100]
    test = log[100:]
    train = head[head.VLabel == 0].drop(columns=["VLabel"])

    train.to_csv(train_csv, index=False)
    test.to_csv(test_csv, index=False)

    train_log = LogFile(train_csv, ",", 0, 500000, None, "ID", activity_attr="Activity")
    train_log.k = 0
    model = edbn.train(train_log)

    test_log = LogFile(test_csv, ",", 0, 500000, None, "ID", activity_attr="Activity")
    test_log.k = 0
    print(test_log.data)
    edbn.test(test_log, scores_csv, model, "VLabel", "0")

    plot.plot_single_roc_curve(scores_csv, "breast_discrete")
    plot.plot_single_prec_recall_curve(scores_csv, "breast_discrete")
def compare(files, nolle_result, nolle_labels):
    """Plot comparison curves of the sample, full and Bohmer outputs.

    Args:
        files: iterable of path prefixes; for each one the files
            ``<prefix>_output_sample.csv``, ``<prefix>_output_full.csv`` and
            ``<prefix>_output_bohmer.csv`` are compared.
        nolle_result: passed as third positional argument to
            ``plot.plot_compare_prec_recall_curve``.
        nolle_labels: list of labels appended to
            ``["Sample", "Full", "Bohmer"]`` for the precision-recall plot.

    Plots are saved under ``../Data/Nolle_Graphs/`` using the last
    ``/``-separated part of each prefix.
    """
    base_labels = ["Sample", "Full", "Bohmer"]

    # The original kept a counter that was incremented but never read.
    for file in files:
        results = [
            file + "_output_sample.csv",
            file + "_output_full.csv",
            file + "_output_bohmer.csv",
        ]
        graph_base = "../Data/Nolle_Graphs/" + file.split("/")[-1]

        plot.plot_compare_prec_recall_curve(
            results, base_labels + nolle_labels, nolle_result, "Comparison",
            save_file=graph_base + "_compare_precrec.png")
        plot.plot_compare_roc_curve(
            results, list(base_labels), "Comparison",
            save_file=graph_base + "_compare_roc.png")
def compare_bpic_total(path):
    """Train and test EDBN on the combined BPIC15 set and plot a comparison.

    Only the EDBN score file is produced here; the "Likelihood Graph" score
    file is only read by the plotting helpers, because the calls that would
    generate it are commented out below.
    """
    train_csv = path + "BPIC15_train_total.csv"
    test_csv = path + "BPIC15_test_total.csv"
    likelihood_scores = path + "Output/BPIC_15_output_total.csv"
    edbn_scores = path + "Output/BPIC15_edbn_output_total.csv"
    prec_recall_png = path + "Output/prec_recall_total.png"
    roc_png = path + "Output/roc_total.png"

    # Disabled Bohmer run:
    # bohmer_model = bmr.train(train, header = 0, length = 5000000)
    # bmr.test(train, test, output, bohmer_model, ",", 5000000, skip=0)

    train_log = LogFile(train_csv, ",", 0, 500000, None, "Case")
    train_log.remove_attributes(["Anomaly"])

    # The test log shares the string/int mappings of the training log.
    test_log = LogFile(test_csv, ",", 0, 500000, None, "Case",
                       train_log.string_2_int, train_log.int_2_string)

    model = edbn.train(train_log)
    edbn.test(test_log, edbn_scores, model, "Anomaly", "0")

    plt.plot_compare_prec_recall_curve(
        [likelihood_scores, edbn_scores], ["Likelihood Graph", "eDBN"],
        save_file=prec_recall_png)
    plt.plot_compare_roc_curve(
        [likelihood_scores, edbn_scores], ["Likelihood Graph", "eDBN"], roc_png)
def compare_bpics(path):
    """Train and test EDBN on BPIC15 data sets 1 through 5 and plot comparisons.

    Only the EDBN score files are produced here; the "Likelihood Graph"
    score files are only read by the plotting helpers, because the calls
    that would generate them are commented out below.
    """
    for dataset in range(1, 6):
        # Input and output locations for this data set.
        train_csv = path + "BPIC15_train_%i.csv" % (dataset)
        test_csv = path + "BPIC15_test_%i.csv" % (dataset)
        likelihood_scores = path + "Output/BPIC15_output_%i.csv" % (dataset)
        edbn_scores = path + "Output/BPIC15_edbn_output_%i.csv" % (dataset)
        prec_recall_png = path + "Output/prec_recall_%i.png" % (dataset)
        roc_png = path + "Output/roc_%i.png" % (dataset)

        # Disabled Bohmer run:
        # bohmer_model = bmr.train(train + "_ints", header = 0, length = 500000)
        # bmr.test(train + "_ints", test + "_ints", output, bohmer_model, ",", 500000, skip=0)

        train_log = LogFile(train_csv, ",", 0, 500000, None, "Case")
        train_log.remove_attributes(["Anomaly"])

        # The test log shares the string/int mappings of the training log.
        test_log = LogFile(test_csv, ",", 0, 500000, None, "Case",
                           train_log.string_2_int, train_log.int_2_string)

        model = edbn.train(train_log)
        edbn.test(test_log, edbn_scores, model, "Anomaly", "0")

        plt.plot_compare_prec_recall_curve(
            [likelihood_scores, edbn_scores], ["Likelihood Graph", "eDBN"],
            save_file=prec_recall_png)
        plt.plot_compare_roc_curve(
            [likelihood_scores, edbn_scores], ["Likelihood Graph", "eDBN"], roc_png)
test_date = LogFile(test_file, ",", 0, 1000000, None, "Case", string_2_int=train_date.string_2_int, int_2_string=train_date.int_2_string) model = edbn.train(train_date) edbn.test(test_date, path + "Output_%i_%i.csv" % anoms_rates[i], model, "Anomaly", "0") output_file = path + "Output_%i_%i.csv" % anoms_rates[i] output_roc = path + "roc_%i_%i.png" % anoms_rates[i] output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i] score = plt.get_roc_auc(output_file) scores.append(plt.get_roc_auc(output_file)) print("Score = %f" % score) with open(path + "results.txt", "a") as fout: fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" % (anoms_rates[i][0], anoms_rates[i][1])) fout.write("Result: " + str(scores) + "\n") fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores))) fout.write("Variance: %f\n\n" % np.var(scores)) #plt.plot_single_roc_curve(output_file, output_roc) #plt.plot_single_prec_recall_curve(output_file, None, output_prec)
for edge in net.edges(): relations.append((edge[0], edge[1])) for relation in relations: # if relation not in mappings: edbn.get_variable(relation[1]).add_parent(edbn.get_variable(relation[0])) print(relation[0], "->", relation[1]) edbn.train(train, single=True) ranking = edbn.test(test) ranking.sort(key=lambda l: l[0].get_total_score()) scores = [] y = [] for r in ranking: scores.append((getattr(r[1], "Index"), r[0].get_total_score(), getattr(r[1], "Class") != 1)) y.append(r[0].get_total_score()) print(len(scores)) with open("../output.csv", "w") as fout: for s in scores: fout.write(",".join([str(i) for i in s])) fout.write("\n") plot.plot_single_roc_curve("../output.csv") plot.plot_single_prec_recall_curve("../output.csv") plt.plot(list(range(len(y))), y) plt.show()
def run_full():
    """Train and test EDBN on ``bpic15_1`` and plot per-set and comparison curves.

    The training loop covers ``range(1, 2)`` only, so a single score file
    (``../Data/output2_1.csv``) is written by this function.  The final
    comparison plots reference ``output2_1.csv`` through ``output2_5.csv``.
    """
    # The train/test files are generated from BPIC15_x_sorted.csv with
    # anomalies introduced.  That preprocessing only has to run once:
    # preProcessData("../Data/")

    for dataset in range(1, 2):
        train_file = "../Data/bpic15_%i_train.csv" % (dataset)
        test_file = "../Data/bpic15_%i_test.csv" % (dataset)
        scores_file = "../Data/output2_%i.csv" % (dataset)
        plot_title = "BPIC15_%i" % (dataset)

        # Training log; the listed attributes are removed one call at a time.
        training_log = LogFile(train_file, ",", 0, 500000,
                               time_attr="Complete_Timestamp",
                               trace_attr="Case_ID",
                               activity_attr="Activity")
        for attribute in ("Anomaly", "planned", "dueDate", "dateFinished"):
            training_log.remove_attributes([attribute])
        # Disabled alternatives:
        # train_data.keep_attributes(["Case_ID", "Complete_Timestamp", "Activity", "Resource", "case_termName"])
        # train_data.keep_attributes(["Case_ID", "Complete_Timestamp", "Activity", "Resource", "Weekday"])
        # train_data.create_k_context()
        # train_data.add_duration_to_k_context()

        model = edbn.train(training_log)

        # The test log reuses the value mapping of the training log.
        testing_log = LogFile(test_file, ",", header=0, rows=500000,
                              time_attr="Complete_Timestamp",
                              trace_attr="Case_ID",
                              values=training_log.values)
        # test_data.create_k_context()
        # test_data.add_duration_to_k_context()
        edbn.test(testing_log, scores_file, model, label="Anomaly",
                  normal_val="0", train_data=training_log)

        plot.plot_single_roc_curve(scores_file, title=plot_title)
        plot.plot_single_prec_recall_curve(scores_file, title=plot_title)

    # Comparison across score files 1..5.
    out_files = ["../Data/output2_%i.csv" % (number) for number in range(1, 6)]
    labels = ["MUNIS_%i" % (number) for number in range(1, 6)]
    plot.plot_compare_roc_curve(out_files, labels, "BPIC15 Comparison")
    plot.plot_compare_prec_recall_curve(out_files, labels, "BPIC15 Comparison")