def stephenRun():
    # Use the BPIC15_x_sorted.csv to generate new training and test datafiles with anomalies introduced
    # After running this once you can comment this line out
    # preProcessData("../Data/")

    # Indicate which are the training and test files
    train_file = "../Data/BPIC15_train_1.csv"
    test_file = "../Data/BPIC15_test_1.csv"

    # Load logfile to use as training data
    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly"])

    # Train the model
    model = edbn.train(train_data)

    # Test the model and save the scores in ../Data/output.csv
    test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr=None,
                        trace_attr="Case", values=train_data.values)
    edbn.test(test_data, "../Data/output.csv", model, label="Anomaly", normal_val="0")

    # Plot the ROC curve based on the results
    plot.plot_single_roc_curve("../Data/output.csv")
def train_vars_and_test(model, alias, filename, event_emit_obj):
    file = UPLOAD_FOLDER + "/" + alias + "/" + filename
    folder = UPLOAD_FOLDER + "/" + alias + "/"
    train_file = get_constructed_file(file)
    test_file = get_constructed_file(file, type="test")

    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])
    event_emit_obj('score_resp', {'step': 2, "msg": "Data loaded."})

    train_data.create_k_context()
    event_emit_obj('score_resp', {'step': 3, "msg": "Built k-context for data."})

    model_trained_on_data = edbn.train_seperate(train_data, model)
    event_emit_obj('score_resp', {'step': 4, "msg": "Finished training data."})

    test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr=None,
                        trace_attr="Case", values=train_data.values)
    edbn.test(test_data, folder + "output.csv", model_trained_on_data,
              label="Anomaly", normal_val="0")
    event_emit_obj('score_resp', {'step': 5, "msg": "Finished testing."})

    # # Plot the ROC curve based on the results
    # plot.plot_single_roc_curve(experiment_folder + "output.csv")

    event_emit_obj('score_resp', {'step': 6, "msg": "Preparing to score."})
    scores = get_event_scores(test_data.data, model_trained_on_data)

    # Print one randomly chosen case as a sanity check
    r = list(scores.keys())
    one = np.random.randint(0, len(r))
    random_key = r[one]
    print(random_key)
    print(test_data.convert_int2string('Case', int(random_key)))

    # results = plottable(scores)
    event_emit_obj('score_resp', {'step': 7, "msg": "Finished scoring!"})
    print("Finished scoring...")
    # plot_single_scores(scores)
    # r, ps = plot_pvalues(scores, 20)
    return scores
def test_file(file):
    split_dataset(file + "_data.csv", file + "_labels.csv",
                  file + "_train.csv", file + "_test.csv", 10000)

    train_data = LogFile(file + "_train.csv", ",", 0, 1000000, None, "case_id", "event")
    # train_data.remove_attributes(["event_position"])
    model = edbn.train(train_data)

    test_data = LogFile(file + "_test.csv", ",", 0, 1000000, None, "case_id", "event",
                        values=train_data.values)
    edbn.test(test_data, file + "_output.csv", model, "label", "0")

    plot.plot_single_roc_curve(file + "_output.csv", file,
                               save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_roc.png")
    plot.plot_single_prec_recall_curve(file + "_output.csv", file,
                                       save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_precrec.png")
def breast_discrete_exec():
    data = "../Data/breast_data.csv"
    labels = "../Data/breast_labels.csv"

    log = pd.read_csv(data, header=None)
    labels = pd.read_csv(labels, header=None)
    log["Label"] = labels[0]

    # Prefix all column names with "V" and add an ID column
    cols = []
    for c in log.columns:
        cols.append("V" + str(c))
    log.columns = cols
    log['ID'] = log.reset_index().index
    print(log)

    train = log[:100]
    test = log[100:]
    train = train[train.VLabel == 0].drop(columns=["VLabel"])
    train.to_csv("../Data/breast_train.csv", index=False)
    test.to_csv("../Data/breast_test.csv", index=False)

    train_data = LogFile("../Data/breast_train.csv", ",", 0, 500000, None, "ID",
                         activity_attr="Activity")
    train_data.k = 0
    model = edbn.train(train_data)

    test_data = LogFile("../Data/breast_test.csv", ",", 0, 500000, None, "ID",
                        activity_attr="Activity")
    test_data.k = 0
    print(test_data.data)
    edbn.test(test_data, "../Data/breast_discrete_output.csv", model, "VLabel", "0")

    plot.plot_single_roc_curve("../Data/breast_discrete_output.csv", "breast_discrete")
    plot.plot_single_prec_recall_curve("../Data/breast_discrete_output.csv", "breast_discrete")
def compare_bpics(path):
    for i in range(1, 6):
        # Input Files
        train = path + "BPIC15_train_%i.csv" % (i)
        test = path + "BPIC15_test_%i.csv" % (i)
        output = path + "Output/BPIC15_output_%i.csv" % (i)
        output_edbn = path + "Output/BPIC15_edbn_output_%i.csv" % (i)
        prec_recall = path + "Output/prec_recall_%i.png" % (i)
        roc = path + "Output/roc_%i.png" % (i)

        train_data = LogFile(train, ",", 0, 500000, "Time", "Case",
                             activity_attr="Activity", convert=False)
        train_data.remove_attributes(["Anomaly", "Type", "Time"])
        test_data = LogFile(test, ",", 0, 500000, "Time", "Case",
                            activity_attr="Activity", values=train_data.values, convert=False)

        # bohmer_model = bmr.train(train_data)
        # bmr.test(test_data, output, bohmer_model, label="Anomaly", normal_val="0")

        train_data.convert2int()
        test_data.convert2int()

        edbn_model = edbn.train(train_data)
        edbn.test(test_data, output_edbn, edbn_model, label="Anomaly", normal_val="0")

        plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "eDBN"],
                                           save_file=prec_recall)
        plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "eDBN"], roc)
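# Example invocation (illustrative assumption: `path` points at a data folder containing
# the BPIC15_train_%i.csv / BPIC15_test_%i.csv splits and an existing Output/ subfolder):
#
#     compare_bpics("../Data/")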
def run(default_dataset="edbn/Data/BPIC15_1_sorted.csv", default_alias="run/"):
    # Use the BPIC15_x_sorted.csv to generate new training and test datafiles with anomalies introduced
    # After running this once you can comment this line out
    # which_dataset = "edbn/Data/BPIC15_1_sorted.csv"
    # preprocess_folder = "run/"
    which_dataset = default_dataset
    preprocess_folder = default_alias

    train_file, test_file, experiment_folder = preProcessFile(which_dataset, preprocess_folder)

    # Indicate which are the training and test files
    # train_file = "../Data/{}BPIC15_train_1.csv".format(preprocess_folder)
    # test_file = "../Data/{}BPIC15_test_1.csv".format(preprocess_folder)

    # Load logfile to use as training data
    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])

    # Train the model
    model = edbn.train(train_data)

    # Test the model and save the scores in the experiment folder's output.csv
    test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr=None,
                        trace_attr="Case", values=train_data.values)
    edbn.test(test_data, experiment_folder + "output.csv", model, label="Anomaly", normal_val="0")

    # # Plot the ROC curve based on the results
    # plot.plot_single_roc_curve(experiment_folder + "output.csv")

    scores = get_event_scores(test_data.data, model)
    print("Finished scoring...")
    # plot_single_scores(scores)
    r, ps = plot_pvalues(scores, 20)
    return scores, (r, ps), model
def compare_bpic_total(path):
    train = path + "BPIC15_train_total.csv"
    test = path + "BPIC15_test_total.csv"
    output = path + "Output/BPIC_15_output_total.csv"
    output_edbn = path + "Output/BPIC15_edbn_output_total.csv"
    prec_recall = path + "Output/prec_recall_total.png"
    roc = path + "Output/roc_total.png"

    # bohmer_model = bmr.train(train, header=0, length=5000000)
    # bmr.test(train, test, output, bohmer_model, ",", 5000000, skip=0)

    train_data = LogFile(train, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly"])
    test_data = LogFile(test, ",", 0, 500000, None, "Case",
                        train_data.string_2_int, train_data.int_2_string)

    edbn_model = edbn.train(train_data)
    edbn.test(test_data, output_edbn, edbn_model, "Anomaly", "0")

    plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "eDBN"],
                                       save_file=prec_recall)
    plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "eDBN"], roc)
def compare_bpics(path):
    for i in range(1, 6):
        # Input Files
        train = path + "BPIC15_train_%i.csv" % (i)
        test = path + "BPIC15_test_%i.csv" % (i)
        output = path + "Output/BPIC15_output_%i.csv" % (i)
        output_edbn = path + "Output/BPIC15_edbn_output_%i.csv" % (i)
        prec_recall = path + "Output/prec_recall_%i.png" % (i)
        roc = path + "Output/roc_%i.png" % (i)

        # bohmer_model = bmr.train(train + "_ints", header=0, length=500000)
        # bmr.test(train + "_ints", test + "_ints", output, bohmer_model, ",", 500000, skip=0)

        train_data = LogFile(train, ",", 0, 500000, None, "Case")
        train_data.remove_attributes(["Anomaly"])
        test_data = LogFile(test, ",", 0, 500000, None, "Case",
                            train_data.string_2_int, train_data.int_2_string)

        edbn_model = edbn.train(train_data)
        edbn.test(test_data, output_edbn, edbn_model, "Anomaly", "0")

        plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "eDBN"],
                                           save_file=prec_recall)
        plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "eDBN"], roc)
def only_train(default_dataset="edbn/Data/BPIC15_1_sorted.csv", default_alias="run/"):
    which_dataset = default_dataset
    preprocess_folder = default_alias

    train_file, test_file, experiment_folder = preProcessFile(which_dataset, preprocess_folder)

    # Indicate which are the training and test files
    # train_file = "../Data/{}BPIC15_train_1.csv".format(preprocess_folder)
    # test_file = "../Data/{}BPIC15_test_1.csv".format(preprocess_folder)

    # Load logfile to use as training data
    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])

    # Train the model
    model = edbn.train(train_data)
    return model
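# Illustrative helper, not part of the original pipeline: a minimal sketch showing how
# only_train() and train_vars_and_test() can be chained, assuming both are in scope.
# The alias/filename values and the print-based emitter are placeholder assumptions for
# the real upload folder and socket callback.
def example_train_and_score(alias="demo_user", filename="upload.csv"):
    base_model = only_train()  # train the base eDBN model on the default BPIC15 split
    # train_vars_and_test expects a callable taking (event_name, payload) for progress updates
    emit = lambda event, payload: print(event, payload)
    return train_vars_and_test(base_model, alias, filename, emit)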
# Shipment-data experiment fragment: the loop header and the train_file line are
# reconstructed here (the fragment starts mid-loop); `path`, `anoms_rates` (a list of
# (train_anomaly_rate, test_anomaly_rate) tuples) and the `scores` list are assumed to be
# defined by the enclosing script.
for i in range(len(anoms_rates)):
    train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
    test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
    generator.create_shipment_data(10000, 10000, anoms_rates[i][0], anoms_rates[i][1],
                                   train_file, test_file)

    train_data = LogFile(train_file, ",", 0, 1000000, None, "Case")
    train_data.remove_attributes(["Anomaly"])
    test_data = LogFile(test_file, ",", 0, 1000000, None, "Case",
                        string_2_int=train_data.string_2_int,
                        int_2_string=train_data.int_2_string)

    model = edbn.train(train_data)
    edbn.test(test_data, path + "Output_%i_%i.csv" % anoms_rates[i], model, "Anomaly", "0")

    output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
    output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
    output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]

    score = plt.get_roc_auc(output_file)
    scores.append(score)
    print("Score = %f" % score)

    with open(path + "results.txt", "a") as fout:
        fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" %
                   (anoms_rates[i][0], anoms_rates[i][1]))
        fout.write("Result: " + str(scores) + "\n")
def run_full():
    # Use the BPIC15_x_sorted.csv to generate new training and test datafiles with anomalies introduced
    # After running this once you can comment this line out
    # preProcessData("../Data/")

    for i in range(1, 2):
        # Indicate which are the training and test files
        train_file = "../Data/bpic15_%i_train.csv" % (i)
        test_file = "../Data/bpic15_%i_test.csv" % (i)

        # Load logfile to use as training data
        train_data = LogFile(train_file, ",", 0, 500000, time_attr="Complete_Timestamp",
                             trace_attr="Case_ID", activity_attr="Activity")
        train_data.remove_attributes(["Anomaly"])
        # train_data.keep_attributes(["Case_ID", "Complete_Timestamp", "Activity", "Resource", "case_termName"])
        train_data.remove_attributes(["planned"])
        train_data.remove_attributes(["dueDate"])
        train_data.remove_attributes(["dateFinished"])
        # train_data.keep_attributes(["Case_ID", "Complete_Timestamp", "Activity", "Resource", "Weekday"])

        # train_data.create_k_context()
        # train_data.add_duration_to_k_context()

        # Train the model
        model = edbn.train(train_data)

        # Test the model and save the scores in ../Data/output2_%i.csv
        test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr="Complete_Timestamp",
                            trace_attr="Case_ID", values=train_data.values)
        # test_data.create_k_context()
        # test_data.add_duration_to_k_context()
        edbn.test(test_data, "../Data/output2_%i.csv" % (i), model, label="Anomaly",
                  normal_val="0", train_data=train_data)

        # Plot the ROC curve based on the results
        plot.plot_single_roc_curve("../Data/output2_%i.csv" % (i), title="BPIC15_%i" % (i))
        plot.plot_single_prec_recall_curve("../Data/output2_%i.csv" % (i), title="BPIC15_%i" % (i))

    out_files = []
    labels = []
    for i in range(1, 6):
        out_files.append("../Data/output2_%i.csv" % (i))
        labels.append("MUNIS_%i" % (i))
    plot.plot_compare_roc_curve(out_files, labels, "BPIC15 Comparison")
    plot.plot_compare_prec_recall_curve(out_files, labels, "BPIC15 Comparison")
# Older variant of the shipment experiment that uses the file-based eDBN API; as in the
# fragment above, the loop header is reconstructed and `path` / `anoms_rates` are assumed
# to be defined by the enclosing script.
for i in range(len(anoms_rates)):
    train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
    test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
    generator.create_shipment_data(10000, 10000, anoms_rates[i][0], anoms_rates[i][1],
                                   train_file, test_file)

    dict_dict = []
    utils.convert2ints(train_file, train_file + "_ints", True, dict_dict)
    test_length = utils.convert2ints(test_file, test_file + "_ints", True, dict_dict)

    model = edbn.train(train_file + "_ints", "Case", "Anomaly", 1, 0, 1000000, ignore=["Anomaly"])
    edbn.test(test_file + "_ints", path + "Output_%i_%i.csv" % anoms_rates[i], model,
              "Anomaly", 1, ",", test_length, skip=0)

    output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
    output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
    output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]
import pandas as pd

import eDBN.Execute as edbn
from LogFile import LogFile
import ConceptDrift as cd

# Concept-drift check on the BPIC 2019 log: train an eDBN model on the first 1000 rows,
# then score a larger window of the same log and plot the resulting scores and p-values.
train_data = LogFile("../Data/bpic2019.csv", ",", 0, 1000, "startTime", "case", "event")
model = edbn.train(train_data)

test_data = LogFile("../Data/bpic2019.csv", ",", 0, 100000, "startTime", "case", "event",
                    values=train_data.values)

scores = cd.get_event_scores(test_data, model)
cd.plot_single_scores(scores)
cd.plot_pvalues(scores, 400)