def _test1():
    """Smoke test: train an eDBN on the BPIC15_1 log and print next-event accuracy.

    Uses a 70/30 case-based train/test split and compares the plain
    predictor against the updating variant.
    """
    csv_path = "../Data/BPIC15_1_sorted_new.csv"

    log = LogFile(csv_path, ",", 0, None, None, "case",
                  activity_attr="event", convert=False, k=5)
    log.keep_attributes(["case", "event", "role"])
    log.convert2int()
    log.create_k_context()
    train_log, test_log = log.splitTrainTest(70, case=True, method="train-test")

    model = edbn_train(train_log)
    acc = predict_next_event(model, test_log)
    acc_update = predict_next_event_update(model, test_log)
    print("ACC:", acc, acc_update)
def train_lin(data_folder, model_folder):
    """Train the Lin model and store it in *model_folder*.

    The full log is loaded first so its value mapping covers every value;
    the train split is then encoded with that same mapping.
    """
    from RelatedMethods.Lin.model import train

    full_log = LogFile(data_folder + "full_log.csv", ",", 0, None, None, "case",
                       activity_attr="event", convert=False, k=0)
    full_log.add_end_events()
    full_log.convert2int()

    train_log = LogFile(data_folder + "train_log.csv", ",", 0, None, None, "case",
                        activity_attr="event", convert=False, k=0,
                        values=full_log.values)
    train_log.add_end_events()
    train_log.convert2int()

    train(full_log, train_log, model_folder)
def compare_bpic_total(path):
    """Score the combined BPIC15 log with Bohmer's Likelihood Graph and the eDBN.

    Writes both methods' outputs under <path>/Output and saves comparative
    precision-recall and ROC plots.
    """
    train_file = path + "BPIC15_train_total.csv"
    test_file = path + "BPIC15_test_total.csv"
    output = path + "Output/BPIC_15_output_total.csv"
    output_edbn = path + "Output/BPIC15_edbn_output_total.csv"
    prec_recall = path + "Output/prec_recall_total.png"
    roc = path + "Output/roc_total.png"

    if not os.path.exists(path + "Output"):
        os.mkdir(path + "Output")

    train_data = LogFile(train_file, ",", 0, 500000, "Time", "Case",
                         activity_attr="Activity", convert=False)
    train_data.remove_attributes(["Anomaly", "Type", "Time"])
    test_data = LogFile(test_file, ",", 0, 500000, "Time", "Case",
                        activity_attr="Activity", values=train_data.values,
                        convert=False)

    # Likelihood Graph is trained/tested before the integer conversion.
    bohmer_model = bmr.train(train_data)
    bmr.test(test_data, output, bohmer_model, label="Anomaly", normal_val=0)

    # eDBN is trained/tested on the integer-encoded logs.
    train_data.convert2int()
    test_data.convert2int()
    edbn_model = edbn_train(train_data)
    edbn_test(test_data, output_edbn, edbn_model, label="Anomaly", normal_val="0")

    plt.plot_compare_prec_recall_curve([output, output_edbn],
                                       ["Likelihood Graph", "EDBN"],
                                       save_file=prec_recall)
    plt.plot_compare_roc_curve([output, output_edbn],
                               ["Likelihood Graph", "EDBN"], roc)
def duration_test_discretize():
    """Grid experiment over (train, test) anomaly rates with discretized durations.

    For every combination of train/test anomaly rate, generates synthetic
    train and test logs, trains an eDBN using a discretized duration
    attribute, and scores the test log.  Each combination is repeated RUNS
    times; the per-run ROC-AUC scores plus their mean/median/variance are
    appended to <path>/results.txt.
    """
    path = "../Data/Experiments_Discretize/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]
    anoms_rates = [(train_rate, test_rate)
                   for train_rate in train_rates
                   for test_rate in test_rates]

    for i in range(len(anoms_rates)):
        print(anoms_rates[i])
        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            duration_generator.generate(10000, 10000,
                                        anoms_rates[i][0], anoms_rates[i][1],
                                        train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, "date", "trace",
                                 convert=False)
            train_data.remove_attributes(["Anomaly"])
            train_data.keep_attributes(["event", "date", "trace", "process",
                                        "resource", "random"])
            train_data.convert2int()
            train_data.create_k_context()
            train_data.add_duration_to_k_context()
            # Bins are learned on the training data only and reused on the
            # test data, so both logs share the same discretization.
            bins = train_data.discretize("duration_0", bins=10)

            test_data = LogFile(test_file, ",", 0, 1000000, "date", "trace",
                                values=train_data.values, convert=False)
            test_data.keep_attributes(["event", "date", "trace", "process",
                                       "resource", "random", "anomaly"])
            test_data.convert2int()
            test_data.create_k_context()
            test_data.add_duration_to_k_context()
            test_data.discretize("duration_0", bins)

            model = edbn.train(train_data)
            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            edbn.test(test_data, output_file, model, "anomaly", "0")

            score = plt.get_roc_auc(output_file)
            # BUG FIX: the original called plt.get_roc_auc(output_file) a
            # second time inside append(); reuse the value computed above.
            scores.append(score)
            print("Score = %f" % score)
            # (Removed unused output_roc / output_prec path variables.)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n"
                       % (anoms_rates[i][0], anoms_rates[i][1]))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
def compare_bpics(path):
    """Compare Bohmer's Likelihood Graph with the eDBN on each BPIC15 log (1-5).

    For every municipality log, both methods score the test split; their
    outputs are written under <path>/Output together with comparative
    precision-recall and ROC plots.
    """
    for i in range(1, 6):
        # Per-log input and output files.
        train_file = path + "BPIC15_train_%i.csv" % (i)
        test_file = path + "BPIC15_test_%i.csv" % (i)
        output = path + "Output/BPIC15_output_%i.csv" % (i)
        output_edbn = path + "Output/BPIC15_edbn_output_%i.csv" % (i)
        prec_recall = path + "Output/prec_recall_%i.png" % (i)
        roc = path + "Output/roc_%i.png" % (i)

        train_data = LogFile(train_file, ",", 0, 500000, "Time", "Case",
                             activity_attr="Activity", convert=False)
        train_data.remove_attributes(["Anomaly", "Type", "Time"])
        test_data = LogFile(test_file, ",", 0, 500000, "Time", "Case",
                            activity_attr="Activity", values=train_data.values,
                            convert=False)

        # Likelihood Graph runs before the integer conversion.
        bohmer_model = bmr.train(train_data)
        bmr.test(test_data, output, bohmer_model, label="Anomaly", normal_val="0")

        # eDBN runs on the integer-encoded logs.
        train_data.convert2int()
        test_data.convert2int()
        edbn_model = edbn.train(train_data)
        edbn.test(test_data, output_edbn, edbn_model, label="Anomaly", normal_val="0")

        plt.plot_compare_prec_recall_curve([output, output_edbn],
                                           ["Likelihood Graph", "EDBN"],
                                           save_file=prec_recall)
        plt.plot_compare_roc_curve([output, output_edbn],
                                   ["Likelihood Graph", "EDBN"], roc)
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage):
    """Train and evaluate the baseline predictor for one configuration.

    Loads *data*, applies the requested preprocessing (prefix size, optional
    end events), splits it, trains the baseline for up to 100 epochs
    (early stop 10) and appends configuration plus accuracy to
    Baseline/results.txt.
    """
    log = LogFile(data, ",", 0, None, None, "case",
                  activity_attr="event", convert=False, k=prefix_size)
    if add_end_event:
        log.add_end_events()
    log.keep_attributes(["case", "event", "role"])
    log.convert2int()
    log.create_k_context()
    train_log, test_log = log.splitTrainTest(train_percentage,
                                             case=split_cases,
                                             method=split_method)

    with open("Baseline/results.txt", "a") as fout:
        # Record the full configuration before the result so runs are
        # self-describing in the appended log file.
        fout.writelines([
            "Data: " + data,
            "\nPrefix Size: " + str(prefix_size),
            "\nEnd event: " + str(add_end_event),
            "\nSplit method: " + split_method,
            "\nSplit cases: " + str(split_cases),
            "\nTrain percentage: " + str(train_percentage),
            "\nDate: " + time.strftime("%d.%m.%y-%H.%M", time.localtime()),
            "\n------------------------------------",
        ])
        baseline_acc = test(test_log, train(train_log, epochs=100, early_stop=10))
        fout.write("\nBaseline: " + str(baseline_acc))
        fout.write("\n")
        fout.write("====================================\n\n")
from RelatedMethods.Lin.model import create_model, predict_next
from Utils.LogFile import LogFile


def train(log, epochs=200, early_stop=42):
    # Thin wrapper around create_model; "tmp" is the output folder passed through.
    return create_model(log, "tmp", epochs, early_stop)


def test(log, model):
    # Thin wrapper: predict the next event for log with the given model.
    return predict_next(log, model)


if __name__ == "__main__":
    # Script entry: evaluate a previously saved Lin model on BPIC15_5
    # using a 70/30 case-based train/test split (k=1 context).
    data = "../../Data/BPIC15_5_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, None, case_attr,
                      activity_attr=act_attr, convert=False, k=1)
    logfile.convert2int()
    logfile.create_k_context()
    train_log, test_log = logfile.splitTrainTest(70, case=True, method="train-test")

    # model = train(train_log, epochs=100, early_stop=5)
    # NOTE(review): `load_model` and `Modulator` are not imported in this view —
    # presumably keras' load_model and the project's Modulator layer; confirm
    # the imports exist elsewhere or this line raises NameError.
    model = load_model("../../Predictions/tmp/model_001-4.51.h5", custom_objects={'Modulator': Modulator})
    acc = test(test_log, model)
    print(acc)
def train_edbn(data_folder, model_folder, k=None, next_event=True):
    """Train an eDBN on <data_folder>/train_log.csv and persist it.

    When *k* is None, the best k in 1..5 is selected first via an 80/20
    validation split, scored with next-event accuracy (or suffix prediction
    when next_event is False).  The final model is pickled to
    <model_folder>/model and the chosen k written to <model_folder>/k.
    """
    from EDBN.Execute import train
    from Predictions.eDBN_Prediction import learn_duplicated_events, predict_next_event, predict_suffix

    if k is None:
        best = {}
        # Candidate loop: renamed loop variable so it no longer shadows
        # the k parameter (behavior unchanged).
        for cand_k in range(1, 6):
            log = LogFile(data_folder + "train_log.csv", ",", 0, None, None, "case",
                          activity_attr="event", convert=False, k=cand_k)
            sub_train, sub_val = log.splitTrainTest(80)

            sub_train.add_end_events()
            sub_train.convert2int()
            sub_train.create_k_context()

            # Validation split reuses the training value mapping.
            sub_val.values = sub_train.values
            sub_val.add_end_events()
            sub_val.convert2int()
            sub_val.create_k_context()

            candidate = train(sub_train)
            # Train average number of duplicated events
            candidate.duplicate_events = learn_duplicated_events(sub_train)

            if next_event:
                acc = predict_next_event(candidate, sub_val)
            else:
                acc = predict_suffix(candidate, sub_val)
            print("Testing k=", cand_k, " | Validation acc:", acc)

            if "Acc" not in best or best["Acc"] < acc:
                best = {"Acc": acc, "Model": candidate, "k": cand_k}

        print("Best k value:", best["k"], " | Validation acc of", best["Acc"])
        k = best["k"]

    # Retrain on the full training log with the chosen k.
    full_log = LogFile(data_folder + "train_log.csv", ",", 0, None, None, "case",
                       activity_attr="event", convert=False, k=k)
    full_log.add_end_events()
    full_log.convert2int()
    full_log.create_k_context()

    model = train(full_log)
    # Train average number of duplicated events
    model.duplicate_events = learn_duplicated_events(full_log)

    with open(os.path.join(model_folder, "model"), "wb") as pickle_file:
        pickle.dump(model, pickle_file)
    with open(os.path.join(model_folder, "k"), "w") as outfile:
        outfile.write(str(k))
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases,
                   train_percentage, filename="results.txt"):
    """Run all related next-event prediction methods on one configuration.

    Each method is executed in its own process; every process appends its
    result to *filename*.  When *prefix_size* is None it defaults to the
    longest case length, capped at 40.
    """
    data = DATA_FOLDER + data
    log = LogFile(data, ",", 0, None, "completeTime", "case",
                  activity_attr="event", convert=False, k=prefix_size)

    if prefix_size is None:
        # Default to the longest case, capped at 40 events.
        prefix_size = min(max(log.data.groupby(log.trace).size()), 40)
        log.k = prefix_size

    if add_end_event:
        log.add_end_events()
    # log.keep_attributes(["case", "event", "role", "completeTime"])
    log.keep_attributes(["case", "event", "role"])
    log.convert2int()
    log.create_k_context()
    train_log, test_log = log.splitTrainTest(train_percentage,
                                             case=split_cases,
                                             method=split_method)

    with open(filename, "a") as fout:
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: " + str(prefix_size))
        fout.write("\nEnd event: " + str(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: " + str(split_cases))
        fout.write("\nTrain percentage: " + str(train_percentage))
        fout.write("\nDate: " + time.strftime("%d.%m.%y-%H.%M", time.localtime()))
        fout.write("\n------------------------------------\n")

    # One subprocess per method; all share the same train/test split.
    method_table = [
        (execute_tax, "Tax"),
        (execute_taymouri, "Taymouri"),
        (execute_camargo, "Camargo"),
        (execute_lin, "Lin"),
        (execute_dimauro, "Di Mauro"),
        (execute_pasquadibisceglie, "Pasquadibisceglie"),
        (execute_edbn, "EDBN"),
        (execute_baseline, "Baseline"),
        # (execute_new_method, "New Method"),
    ]
    processes = [Process(target=func, args=(train_log, test_log, filename), name=proc_name)
                 for func, proc_name in method_table]

    print("Starting Processes")
    for p in processes:
        p.start()
        print(p.name, "started")
    print("All processes running")

    for p in processes:
        p.join()
        print(p.name, "stopped")

    with open(filename, "a") as fout:
        fout.write("====================================\n\n")
    print("All processes stopped")