def _test1():
    """Train on BPIC15_1 and print both next-event prediction results.

    Loads the BPIC15_1 CSV with ``k=5``, keeps only the ``case``, ``event``
    and ``role`` attributes, converts the log to integers and builds the
    k-context. The log is split 70 with the "train-test" method on cases, a
    model is trained with ``edbn_train`` and the values returned by
    ``predict_next_event`` and ``predict_next_event_update`` are printed.
    """
    logfile = LogFile(
        "../Data/BPIC15_1_sorted_new.csv",
        ",",
        0,
        None,
        None,
        "case",
        activity_attr="event",
        convert=False,
        k=5,
    )
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(70, case=True, method="train-test")
    model = edbn_train(train_log)

    plain_result = predict_next_event(model, test_log)
    updated_result = predict_next_event_update(model, test_log)
    print("ACC:", plain_result, updated_result)
def duration_test():
    """Run the duration experiment for every train/test anomaly-rate pair.

    Every combination of a train rate and a test rate is evaluated ``RUNS``
    times. Each run generates fresh train and test files with
    ``duration_generator.generate``, prepares both logs (k-context, duration
    added to the k-context, ``duration_0`` discretized with the bins obtained
    from the training log), trains a model with ``edbn.train``, writes the
    test output with ``edbn.test`` and collects the value returned by
    ``plt.get_roc_auc`` for that output file.

    After the runs of one combination, the collected scores together with
    their mean, median and variance are appended to ``results.txt`` in the
    experiment folder.
    """
    path = "../Data/Experiments_Duration/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]
    anoms_rates = [
        (train_rate, test_rate)
        for train_rate in train_rates
        for test_rate in test_rates
    ]

    for i, rates in enumerate(anoms_rates):
        train_rate, test_rate = rates
        print(rates)
        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            train_file = path + "%i_train_%i.csv" % (i, train_rate)
            test_file = path + "%i_test_%i.csv" % (i, test_rate)
            duration_generator.generate(10000, 10000, train_rate, test_rate,
                                        train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, "date", "trace")
            train_data.remove_attributes(["Anomaly"])
            # The test log reuses the value mapping of the training log.
            test_data = LogFile(test_file, ",", 0, 1000000, "date", "trace",
                                values=train_data.values)

            train_data.keep_attributes(
                ["event", "date", "trace", "process", "resource", "random"])
            train_data.create_k_context()
            train_data.add_duration_to_k_context()
            bins = train_data.discretize("duration_0")

            test_data.create_k_context()
            test_data.add_duration_to_k_context()
            # Reuse the training bins so both logs share one discretization.
            test_data.discretize("duration_0", bins)

            model = edbn.train(train_data)

            # NOTE(review): the label passed here is "anomaly" while the
            # attribute removed from the training log is "Anomaly" - confirm
            # the differing case is intended.
            output_file = path + "Output_%i_%i.csv" % rates
            edbn.test(test_data, output_file, model, "anomaly", "0")

            # Compute the ROC AUC once and reuse it for both the list and the log line.
            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" % (train_rate, test_rate))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
def train_vars_and_test(model, alias, filename, event_emit_obj):
    """Train ``model`` on an uploaded log, test it and return the event scores.

    Progress is reported through ``event_emit_obj``, which is called with the
    event name ``'score_resp'`` and a dict holding a ``step`` number and a
    ``msg`` text after each stage.

    Args:
        model: Model passed to ``edbn.train_seperate`` together with the
            training log.
        alias: Sub-folder of ``UPLOAD_FOLDER`` that holds the uploaded file
            and receives ``output.csv``.
        filename: Name of the uploaded file inside the alias folder.
        event_emit_obj: Callable ``(event_name, payload)`` used for progress
            notifications.

    Returns:
        The value returned by ``get_event_scores`` for the test data and the
        trained model.
    """
    folder = UPLOAD_FOLDER + "/" + alias + "/"
    log_path = folder + filename

    train_file = get_constructed_file(log_path)
    test_file = get_constructed_file(log_path, type="test")

    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])
    event_emit_obj('score_resp', {'step': 2, "msg": "Data loaded."})

    train_data.create_k_context()
    event_emit_obj('score_resp', {
        'step': 3,
        "msg": "Build K-Context for data."
    })

    model_trained_on_data = edbn.train_seperate(train_data, model)
    event_emit_obj('score_resp', {'step': 4, "msg": "Finished training data."})

    # The test log reuses the value mapping of the training log.
    test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr=None,
                        trace_attr="Case", values=train_data.values)
    edbn.test(test_data, folder + "output.csv", model_trained_on_data,
              label="Anomaly", normal_val="0")
    event_emit_obj('score_resp', {'step': 5, "msg": "Finished testing"})

    event_emit_obj('score_resp', {'step': 6, "msg": "Preparing to score."})
    scores = get_event_scores(test_data.data, model_trained_on_data)

    # Debug output: print one randomly chosen score key and its 'Case' string.
    # Skipped when there are no scores, because randint(0, 0) raises
    # ValueError and would discard the finished run.
    keys = list(scores.keys())
    if keys:
        random_key = keys[np.random.randint(0, len(keys))]
        print(random_key)
        print(test_data.convert_int2string('Case', int(random_key)))

    event_emit_obj('score_resp', {'step': 7, "msg": "Finished scoring!"})
    print("Finished scoring...")
    return scores
def test_edbn(dataset_folder, model_folder, k):
    """Evaluate a stored model on the test log and append the result to a log file.

    The pickled model is read from ``<model_folder>/model``. When ``k`` is
    ``None`` it is read from the first line of ``<model_folder>/k``. The
    training log is loaded only to obtain its value mapping, which is passed
    to the test log. The value returned by ``predict_next_event`` is appended,
    with a timestamp, to ``<model_folder>/results_next_event.log``.

    Args:
        dataset_folder: Folder containing ``train_log.csv`` and
            ``test_log.csv``; works with or without a trailing separator.
        model_folder: Folder containing the ``model`` and ``k`` files; also
            receives the results log.
        k: Value passed as ``k`` to both logs, or ``None`` to read it from
            the model folder.
    """
    from eDBN_Prediction import predict_next_event

    # NOTE: pickle.load can execute arbitrary code; only load model files
    # from a trusted source.
    with open(os.path.join(model_folder, "model"), "rb") as pickle_file:
        model = pickle.load(pickle_file)
    model.print_parents()

    if k is None:
        with open(os.path.join(model_folder, "k")) as k_file:
            k = int(k_file.readline())
        print("K=", k)

    # os.path.join keeps dataset paths consistent with the model paths and
    # does not depend on dataset_folder ending with a separator.
    train_log = LogFile(os.path.join(dataset_folder, "train_log.csv"), ",", 0,
                        None, None, "case", activity_attr="event",
                        convert=True, k=k)
    test_log = LogFile(os.path.join(dataset_folder, "test_log.csv"), ",", 0,
                       None, None, "case", activity_attr="event",
                       convert=True, k=k, values=train_log.values)
    test_log.create_k_context()

    acc = predict_next_event(model, test_log)
    with open(os.path.join(model_folder, "results_next_event.log"), "a") as fout:
        fout.write("Accuracy: (%s) %s\n"
                   % (time.strftime("%d-%m-%y %H:%M:%S", time.localtime()), acc))
def run_sdl():
    """Train the SDL method on the labeled BPIC15_1 log and print its test result.

    The log is loaded with ``label`` as activity attribute and ``k=10``,
    reduced to the selected columns and extended with a k-context. It is then
    split 80 with the "train-test" method, ``label`` is added to the ignored
    history attributes of both parts, a model is trained with
    ``train(train_log, 200, 42)`` and the value returned by ``test`` is
    printed.
    """
    from Methods.SDL.sdl import train, test

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"
    log = LogFile(labeled_logfile, ";", 0, None, "time_timestamp", "Case_ID",
                  activity_attr="label", convert=True, k=10)

    columns = [
        "label", "Case_ID", "Activity", "monitoringResource", "question",
        "org_resource", "Responsible_actor", "SUMleges"
    ]
    log.keep_attributes(columns)
    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")
    train_log.ignoreHistoryAttributes.add("label")
    test_log.ignoreHistoryAttributes.add("label")

    model = train(train_log, 200, 42)
    print(test(test_log, model))
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage):
    """Run the baseline on one log and append settings and result to the results file.

    The log is loaded with ``k=prefix_size``, optionally extended with end
    events, reduced to ``case``/``event``/``role``, converted to integers and
    given a k-context before it is split. The experiment settings, the
    baseline result and a closing separator are appended to
    ``Baseline/results.txt``.

    Args:
        data: Path of the CSV log; also written to the results file.
        prefix_size: Value passed as ``k`` to ``LogFile``.
        add_end_event: When truthy, ``add_end_events`` is called on the log.
        split_method: ``method`` argument of ``splitTrainTest`` (a string).
        split_cases: ``case`` argument of ``splitTrainTest``.
        train_percentage: First argument of ``splitTrainTest``.
    """
    logfile = LogFile(
        data, ",", 0, None, None, "case",
        activity_attr="event", convert=False, k=prefix_size,
    )
    if add_end_event:
        logfile.add_end_events()
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(
        train_percentage, case=split_cases, method=split_method
    )

    with open("Baseline/results.txt", "a") as fout:
        timestamp = time.strftime("%d.%m.%y-%H.%M", time.localtime())
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: {}".format(prefix_size))
        fout.write("\nEnd event: {}".format(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: {}".format(split_cases))
        fout.write("\nTrain percentage: {}".format(train_percentage))
        fout.write("\nDate: " + timestamp)
        fout.write("\n------------------------------------")

        # Training and testing run while the results file is still open, so
        # the header is written out even if this step raises.
        trained_model = train(train_log, epochs=100, early_stop=10)
        baseline_acc = test(test_log, trained_model)

        fout.write("\nBaseline: {}".format(baseline_acc))
        fout.write("\n====================================\n\n")
from RelatedMethods.Lin.model import create_model, predict_next
from Utils.LogFile import LogFile


def train(log, epochs=200, early_stop=42):
    """Create a model for ``log`` by delegating to ``create_model``.

    ``create_model`` receives the log, the literal ``"tmp"``, ``epochs`` and
    ``early_stop`` in that order; its return value is passed through.
    """
    model = create_model(log, "tmp", epochs, early_stop)
    return model


def test(log, model):
    """Return the result of ``predict_next`` for ``log`` and ``model``."""
    outcome = predict_next(log, model)
    return outcome


if __name__ == "__main__":
    data = "../../Data/BPIC15_5_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(
        data, ",", 0, None, None, case_attr,
        activity_attr=act_attr, convert=False, k=1,
    )
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(70, case=True, method="train-test")

    # Alternative to loading a stored model: train from scratch with
    #   model = train(train_log, epochs=100, early_stop=5)
    # NOTE(review): load_model and Modulator are not imported in the visible
    # part of this file - confirm they are in scope before running the script.
    model = load_model(
        "../../Predictions/tmp/model_001-4.51.h5",
        custom_objects={'Modulator': Modulator},
    )

    acc = test(test_log, model)
    print(acc)
def train_edbn(data_folder, model_folder, k=None, next_event=True):
    """Train a model on the training log and store it together with ``k``.

    When ``k`` is ``None``, the values 1 to 5 are tried: for each candidate
    the training log is split (``splitTrainTest(80)``), a model is trained on
    the first part and evaluated on the second, and the candidate with the
    highest result is kept. The final model is always trained on the complete
    training log with the chosen ``k``. It is pickled to
    ``<model_folder>/model`` and ``k`` is written to ``<model_folder>/k``.

    Args:
        data_folder: Folder containing ``train_log.csv``; works with or
            without a trailing separator.
        model_folder: Folder that receives the ``model`` and ``k`` files.
        k: Value passed as ``k`` to ``LogFile``, or ``None`` to select it by
            validation.
        next_event: Selects the validation function: ``predict_next_event``
            when truthy, otherwise ``predict_suffix``. Only used when ``k`` is
            ``None``.
    """
    from EDBN.Execute import train
    from Predictions.eDBN_Prediction import learn_duplicated_events, predict_next_event, predict_suffix

    # os.path.join keeps the dataset path consistent with the model paths and
    # does not depend on data_folder ending with a separator.
    train_path = os.path.join(data_folder, "train_log.csv")

    if k is None:
        best_k = None
        best_acc = None
        # A separate loop name avoids shadowing the ``k`` parameter.
        for candidate_k in range(1, 6):
            train_log = LogFile(train_path, ",", 0, None, None, "case",
                                activity_attr="event", convert=False, k=candidate_k)
            train_train_log, train_test_log = train_log.splitTrainTest(80)

            train_train_log.add_end_events()
            train_train_log.convert2int()
            train_train_log.create_k_context()

            # The validation part reuses the value mapping of the training part.
            train_test_log.values = train_train_log.values
            train_test_log.add_end_events()
            train_test_log.convert2int()
            train_test_log.create_k_context()

            model = train(train_train_log)
            model.duplicate_events = learn_duplicated_events(train_train_log)

            if next_event:
                acc = predict_next_event(model, train_test_log)
            else:
                acc = predict_suffix(model, train_test_log)
            print("Testing k=", candidate_k, " | Validation acc:", acc)

            # Only k and its score are kept; the final model is retrained below.
            if best_acc is None or best_acc < acc:
                best_acc = acc
                best_k = candidate_k

        print("Best k value:", best_k, " | Validation acc of", best_acc)
        k = best_k

    train_log = LogFile(train_path, ",", 0, None, None, "case",
                        activity_attr="event", convert=False, k=k)
    train_log.add_end_events()
    train_log.convert2int()
    train_log.create_k_context()

    model = train(train_log)
    model.duplicate_events = learn_duplicated_events(train_log)

    with open(os.path.join(model_folder, "model"), "wb") as pickle_file:
        pickle.dump(model, pickle_file)

    with open(os.path.join(model_folder, "k"), "w") as outfile:
        outfile.write(str(k))
def run_edbn():
    """Predict the ``label`` of each test case and print per-label hit rates.

    The labeled BPIC15_1 log is loaded with ``label`` as activity attribute
    and ``k=1``, reduced to the selected columns, given a k-context and split
    80 with the "train-test" method. A model is trained on the training part
    with ``label`` added to its ignored history attributes.

    For every test case the probabilities returned by ``get_probabilities``
    for labels 1 and 2 are accumulated over all rows, starting from 1 for
    each label. The case counts as a hit when the accumulated value of the
    label found in its last row is strictly larger than the other one. For
    both labels the number of cases and the hit rate are printed; the rate is
    ``nan`` when no test case carries that label.
    """
    from eDBN_Prediction import get_probabilities
    from Methods.EDBN.Train import train

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"
    log = LogFile(labeled_logfile, ";", 0, None, "time_timestamp", "Case_ID",
                  activity_attr="label", convert=True, k=1)

    columns = [
        "label", "Case_ID", "time_timestamp", "Activity", "monitoringResource",
        "question", "org_resource", "Responsible_actor", "SUMleges"
    ]
    log.keep_attributes(columns)
    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")
    train_log.ignoreHistoryAttributes.add("label")

    model = train(train_log)

    # The label variable and its parents do not change per row; look them up once.
    label_var = model.variables["label"]
    parents = label_var.conditional_table.parents

    results1 = []
    results2 = []
    for case in test_log.get_cases():
        case_df = case[1]
        case_probs = {1: 1, 2: 1}
        ground = 0
        for _, row in case_df.iterrows():
            ground = getattr(row, "label")
            tuple_val = tuple(getattr(row, parent.attr_name) for parent in parents)
            probs, _ = get_probabilities(label_var, tuple_val, parents)
            case_probs[1] += probs.get(1, 0)
            case_probs[2] += probs.get(2, 0)

        # ``ground`` holds the label of the last row of the case.
        if ground == 1:
            results1.append(1 if case_probs[1] > case_probs[2] else 0)
        elif ground == 2:
            results2.append(1 if case_probs[2] > case_probs[1] else 0)

    for results in (results1, results2):
        # Avoid ZeroDivisionError when a label never occurs in the test cases.
        rate = sum(results) / len(results) if results else float("nan")
        print(len(results), rate)
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage, filename="results.txt"):
    """Run all prediction methods on one log, each in its own process.

    The log is read from ``DATA_FOLDER + data``. When ``prefix_size`` is
    ``None`` it is set to the largest group size of the log grouped by its
    trace attribute, capped at 40. The log is optionally extended with end
    events, reduced to ``case``/``event``/``role``, converted to integers and
    given a k-context before it is split.

    The experiment settings are appended to ``filename`` and the file is
    closed before any process starts. Each process receives
    ``(train_log, test_log, filename)``. After all processes have been
    joined, a closing separator is appended to ``filename``.

    Args:
        data: File name of the log relative to ``DATA_FOLDER``.
        prefix_size: Value passed as ``k`` to ``LogFile``, or ``None`` to
            derive it from the log.
        add_end_event: When truthy, ``add_end_events`` is called on the log.
        split_method: ``method`` argument of ``splitTrainTest`` (a string).
        split_cases: ``case`` argument of ``splitTrainTest``.
        train_percentage: First argument of ``splitTrainTest``.
        filename: Results file that is appended to.
    """
    data = DATA_FOLDER + data
    logfile = LogFile(
        data, ",", 0, None, "completeTime", "case",
        activity_attr="event", convert=False, k=prefix_size,
    )

    if prefix_size is None:
        longest_trace = max(logfile.data.groupby(logfile.trace).size())
        prefix_size = min(longest_trace, 40)
    logfile.k = prefix_size

    if add_end_event:
        logfile.add_end_events()
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(
        train_percentage, case=split_cases, method=split_method
    )

    with open(filename, "a") as fout:
        timestamp = time.strftime("%d.%m.%y-%H.%M", time.localtime())
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: {}".format(prefix_size))
        fout.write("\nEnd event: {}".format(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: {}".format(split_cases))
        fout.write("\nTrain percentage: {}".format(train_percentage))
        fout.write("\nDate: " + timestamp)
        fout.write("\n------------------------------------\n")

    # One (target, process name) entry per method, in start order.
    methods = (
        (execute_tax, "Tax"),
        (execute_taymouri, "Taymouri"),
        (execute_camargo, "Camargo"),
        (execute_lin, "Lin"),
        (execute_dimauro, "Di Mauro"),
        (execute_pasquadibisceglie, "Pasquadibisceglie"),
        (execute_edbn, "EDBN"),
        (execute_baseline, "Baseline"),
        # (execute_new_method, "New Method"),
    )
    processes = [
        Process(target=target, args=(train_log, test_log, filename), name=label)
        for target, label in methods
    ]

    print("Starting Processes")
    for proc in processes:
        proc.start()
        print(proc.name, "started")

    print("All processes running")
    for proc in processes:
        proc.join()
        print(proc.name, "stopped")

    with open(filename, "a") as fout:
        fout.write("====================================\n\n")

    print("All processes stopped")