Example #1
0
def stephenRun():
    """Train an EDBN on the first BPIC15 split and score its test split.

    The anomaly-injected train/test CSVs are expected to exist already; they
    can be (re)generated once with ``preProcessData("../Data/")``. The scores
    are written to ``../Data/output.csv`` and then plotted as a ROC curve.
    """
    data_dir = "../Data/"
    scores_path = data_dir + "output.csv"

    # Training log: the "Anomaly" label column is removed before training.
    training_log = LogFile(data_dir + "BPIC15_train_1.csv", ",", 0, 500000,
                           None, "Case")
    training_log.remove_attributes(["Anomaly"])
    trained_model = edbn.train(training_log)

    # The test log is built with the ``values`` of the training log.
    testing_log = LogFile(data_dir + "BPIC15_test_1.csv", ",",
                          header=0, rows=500000,
                          time_attr=None, trace_attr="Case",
                          values=training_log.values)
    edbn.test(testing_log, scores_path, trained_model,
              label="Anomaly", normal_val="0")

    # Plot the ROC curve based on the results
    plot.plot_single_roc_curve(scores_path)
Example #2
0
def compare_bpic_total(path):
    """Compare the Likelihood Graph (bmr) and EDBN detectors on the total BPIC15 log.

    Args:
        path: Directory prefix. It is combined with file names by plain string
            concatenation, so it has to end with a path separator. It must
            contain ``BPIC15_train_total.csv`` and ``BPIC15_test_total.csv``.

    The score files and the precision/recall and ROC figures are written to
    ``<path>Output/``, which is created when it does not exist yet.
    """
    train = path + "BPIC15_train_total.csv"
    test = path + "BPIC15_test_total.csv"
    output = path + "Output/BPIC_15_output_total.csv"
    output_edbn = path + "Output/BPIC15_edbn_output_total.csv"
    prec_recall = path + "Output/prec_recall_total.png"
    roc = path + "Output/roc_total.png"

    # exist_ok avoids the check-then-create race of exists()+mkdir(), and
    # makedirs also creates missing parent directories.
    os.makedirs(path + "Output", exist_ok=True)

    # Both logs are loaded unconverted; the label-related columns and the
    # time column are removed from the training log only.
    train_data = LogFile(train, ",", 0, 500000, "Time", "Case",
                         activity_attr="Activity", convert=False)
    train_data.remove_attributes(["Anomaly", "Type", "Time"])
    test_data = LogFile(test, ",", 0, 500000, "Time", "Case",
                        activity_attr="Activity", values=train_data.values,
                        convert=False)

    # NOTE(review): bmr is scored with normal_val=0 (int) while EDBN below
    # uses "0" (str) -- confirm both match the label encoding of each log.
    bohmer_model = bmr.train(train_data)
    bmr.test(test_data, output, bohmer_model, label="Anomaly", normal_val=0)

    # The logs are converted to integers only after the bmr run, before EDBN.
    train_data.convert2int()
    test_data.convert2int()

    edbn_model = edbn_train(train_data)
    edbn_test(test_data, output_edbn, edbn_model, label="Anomaly",
              normal_val="0")

    plt.plot_compare_prec_recall_curve([output, output_edbn],
                                       ["Likelihood Graph", "EDBN"],
                                       save_file=prec_recall)
    plt.plot_compare_roc_curve([output, output_edbn],
                               ["Likelihood Graph", "EDBN"], roc)
Example #3
0
def test_file_full(file):
    """Split a dataset, train an EDBN on it and plot ROC / precision-recall.

    Args:
        file: Path prefix of the dataset. ``<file>_data.csv`` and
            ``<file>_labels.csv`` are passed to ``split_dataset`` which is
            given ``<file>_train.csv`` and ``<file>_test.csv`` as targets.

    Scores go to ``<file>_output_full.csv``; the figures are saved under
    ``../Data/Nolle_Graphs/`` using the last ``/``-separated part of ``file``.
    """
    train_csv = file + "_train.csv"
    test_csv = file + "_test.csv"
    scores_csv = file + "_output_full.csv"

    split_dataset(file + "_data.csv", file + "_labels.csv",
                  train_csv, test_csv, None)

    # Train on the training split with the label column removed.
    training_log = LogFile(train_csv, ",", 0, 1000000, None, "case_id", "name")
    training_log.remove_attributes(["label"])
    trained_model = edbn.train(training_log)

    # Score the test split; the training log is handed to edbn.test as well.
    testing_log = LogFile(test_csv, ",", 0, 1000000, None, "case_id", "name",
                          values=training_log.values)
    edbn.test(testing_log, scores_csv, trained_model, "label", "0",
              training_log)

    graph_prefix = "../Data/Nolle_Graphs/" + file.split("/")[-1]
    plot.plot_single_roc_curve(scores_csv, file,
                               save_file=graph_prefix + "_roc.png")
    plot.plot_single_prec_recall_curve(scores_csv, file,
                                       save_file=graph_prefix + "_precrec.png")
Example #4
0
def test_lin(dataset_folder, model_folder):
    """Evaluate the Lin next-event model and append its accuracy to a log file.

    Args:
        dataset_folder: Prefix (concatenated as-is) of the folder holding
            ``full_log.csv`` and ``test_log.csv``.
        model_folder: Folder with the ``.h5`` model files; the file whose name
            sorts last is evaluated and ``results_next_event.log`` is
            appended to in this folder.
    """
    from RelatedMethods.Lin.model import predict_next

    log_options = dict(activity_attr="event", convert=True, k=0)

    # The full log is only needed for its ``values``, reused by the test log.
    full_log = LogFile(dataset_folder + "full_log.csv", ",", 0, None, None,
                       "case", **log_options)
    held_out_log = LogFile(dataset_folder + "test_log.csv", ",", 0, None, None,
                           "case", values=full_log.values, **log_options)

    # Pick the model file that comes last in lexicographic order.
    checkpoints = [name for name in os.listdir(model_folder)
                   if name.endswith(".h5")]
    checkpoints.sort()
    latest_checkpoint = checkpoints[-1]

    accuracy = predict_next(os.path.join(model_folder, latest_checkpoint),
                            held_out_log.data, held_out_log.trace,
                            held_out_log.activity)

    results_path = os.path.join(model_folder, "results_next_event.log")
    with open(results_path, "a") as fout:
        timestamp = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
        fout.write("Accuracy: (%s) %s\n" % (timestamp, accuracy))
Example #5
0
def test_file_bohmer(file):
    """Split a dataset, run the bohmer detector on it and plot the results.

    Args:
        file: Path prefix of the dataset. ``<file>_data.csv`` and
            ``<file>_labels.csv`` are passed to ``split_dataset`` together
            with the train/test target names and the value 10000.

    Scores go to ``<file>_output_bohmer.csv``; the figures are saved under
    ``../Data/Nolle_Graphs/`` using the last ``/``-separated part of ``file``.
    """
    train_csv = file + "_train.csv"
    test_csv = file + "_test.csv"
    scores_csv = file + "_output_bohmer.csv"

    split_dataset(file + "_data.csv", file + "_labels.csv",
                  train_csv, test_csv, 10000)

    # Both logs are loaded with convert=False for this method.
    training_log = LogFile(train_csv, ",", 0, 1000000, None, "case_id", "name",
                           convert=False)
    training_log.remove_attributes(["label"])
    trained_model = bohmer.train(training_log, 3, 4, 1)

    testing_log = LogFile(test_csv, ",", 0, 1000000, None, "case_id", "name",
                          convert=False, values=training_log.values)
    bohmer.test(testing_log, scores_csv, trained_model, "label", 0)

    graph_prefix = "../Data/Nolle_Graphs/" + file.split("/")[-1]
    plot.plot_single_roc_curve(scores_csv, file,
                               save_file=graph_prefix + "_roc_bohmer.png")
    plot.plot_single_prec_recall_curve(
        scores_csv, file, save_file=graph_prefix + "_precrec_bohmer.png")
Example #6
0
def train_vars_and_test(model, alias, filename, event_emit_obj):
    """Train ``model`` on an uploaded log, test it and return the event scores.

    Progress is reported by calling ``event_emit_obj('score_resp', {...})``
    with increasing ``step`` numbers (2 through 7).

    Args:
        model: Passed to ``edbn.train_seperate`` together with the train log.
        alias: Sub-folder of ``UPLOAD_FOLDER`` that holds the upload.
        filename: Name of the uploaded file inside that sub-folder.
        event_emit_obj: Callable taking an event name and a payload dict.

    Returns:
        The result of ``get_event_scores`` on the test data.
    """

    def report(step, msg):
        event_emit_obj('score_resp', {'step': step, "msg": msg})

    folder = UPLOAD_FOLDER + "/" + alias + "/"
    upload_path = folder + filename

    train_path = get_constructed_file(upload_path)
    test_path = get_constructed_file(upload_path, type="test")

    training_log = LogFile(train_path, ",", 0, 500000, None, "Case")
    training_log.remove_attributes(["Anomaly", "time"])
    report(2, "Data loaded.")

    training_log.create_k_context()
    report(3, "Build K-Context for data.")

    fitted_model = edbn.train_seperate(training_log, model)
    report(4, "Finished training data.")

    testing_log = LogFile(test_path, ",",
                          header=0, rows=500000,
                          time_attr=None, trace_attr="Case",
                          values=training_log.values)
    edbn.test(testing_log, folder + "output.csv", fitted_model,
              label="Anomaly", normal_val="0")
    report(5, "Finished testing")

    report(6, "Preparing to score.")
    scores = get_event_scores(testing_log.data, fitted_model)

    # Print one randomly chosen score key and its 'Case' string as a sample.
    score_keys = list(scores.keys())
    picked_index = np.random.randint(0, len(score_keys))
    random_key = score_keys[picked_index]

    print(random_key)
    print(testing_log.convert_int2string('Case', int(random_key)))

    report(7, "Finished scoring!")

    print("Finished scoring...")

    return scores
Example #7
0
def compare_bpics(path):
    """Compare the Likelihood Graph (bmr) and EDBN detectors on BPIC15 logs 1-5.

    Args:
        path: Directory prefix, concatenated as-is with the file names. It
            must hold ``BPIC15_train_<i>.csv`` and ``BPIC15_test_<i>.csv``.

    For every ``i`` the score files and the comparison figures are written
    below ``<path>Output/``.
    """
    log_options = {"activity_attr": "Activity", "convert": False}

    for i in range(1, 6):
        # Input and output locations for this municipality index.
        train = path + "BPIC15_train_{}.csv".format(i)
        test = path + "BPIC15_test_{}.csv".format(i)
        output = path + "Output/BPIC15_output_{}.csv".format(i)
        output_edbn = path + "Output/BPIC15_edbn_output_{}.csv".format(i)
        prec_recall = path + "Output/prec_recall_{}.png".format(i)
        roc = path + "Output/roc_{}.png".format(i)

        train_data = LogFile(train, ",", 0, 500000, "Time", "Case",
                             **log_options)
        train_data.remove_attributes(["Anomaly", "Type", "Time"])
        test_data = LogFile(test, ",", 0, 500000, "Time", "Case",
                            values=train_data.values, **log_options)

        # Likelihood Graph runs on the unconverted logs.
        bohmer_model = bmr.train(train_data)
        bmr.test(test_data, output, bohmer_model,
                 label="Anomaly", normal_val="0")

        # EDBN runs after both logs were converted to integers.
        train_data.convert2int()
        test_data.convert2int()

        edbn_model = edbn.train(train_data)
        edbn.test(test_data, output_edbn, edbn_model,
                  label="Anomaly", normal_val="0")

        score_files = [output, output_edbn]
        plt.plot_compare_prec_recall_curve(score_files,
                                           ["Likelihood Graph", "EDBN"],
                                           save_file=prec_recall)
        plt.plot_compare_roc_curve([output, output_edbn],
                                   ["Likelihood Graph", "EDBN"], roc)
Example #8
0
def _test1():
    """Train an EDBN on BPIC15_1 and print two next-event accuracies.

    The log is reduced to the case, event and role attributes, converted to
    integers, given a k-context (k=5) and split 70 / rest by case. The printed
    values come from ``predict_next_event`` and ``predict_next_event_update``.
    """
    log = LogFile("../Data/BPIC15_1_sorted_new.csv", ",", 0, None, None,
                  "case", activity_attr="event", convert=False, k=5)
    log.keep_attributes(["case", "event", "role"])
    log.convert2int()
    log.create_k_context()

    train_part, test_part = log.splitTrainTest(70, case=True,
                                               method="train-test")

    trained_model = edbn_train(train_part)
    plain_accuracy = predict_next_event(trained_model, test_part)
    updating_accuracy = predict_next_event_update(trained_model, test_part)
    print("ACC:", plain_accuracy, updating_accuracy)
Example #9
0
def categorical_test():
    """Measure EDBN ROC-AUC on generated shipment data for many anomaly rates.

    Every combination of a training rate and a test rate is evaluated
    ``RUNS`` times: fresh train/test files are generated, an EDBN is trained
    and tested, and the ROC-AUC of the output file is collected. After all
    runs of a combination, the scores and their mean, median and variance are
    appended to ``../Data/Experiments/results.txt``.
    """
    path = "../Data/Experiments/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]
    anoms_rates = [(train_rate, test_rate)
                   for train_rate in train_rates
                   for test_rate in test_rates]

    for i, (train_rate, test_rate) in enumerate(anoms_rates):
        print((train_rate, test_rate))

        # The file names carry the combination index plus the relevant rate,
        # so they are the same for every run of a combination.
        train_file = path + "%i_train_%i.csv" % (i, train_rate)
        test_file = path + "%i_test_%i.csv" % (i, test_rate)
        output_file = path + "Output_%i_%i.csv" % (train_rate, test_rate)

        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            generator.create_shipment_data(10000, 10000, train_rate,
                                           test_rate, train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, None, "Case")
            train_data.remove_attributes(["Anomaly"])
            test_data = LogFile(test_file, ",", 0, 1000000, None, "Case",
                                values=train_data.values)

            model = edbn.train(train_data)
            edbn.test(test_data, output_file, model, "Anomaly", "0")

            # Compute the AUC once and reuse it for the list and the printout.
            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" %
                       (train_rate, test_rate))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" %
                       (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
Example #10
0
def experiment_standard():
    """Score the BPIC 2018 integer log with a pickled model and plot the results.

    The model is read from the file ``model_30000b`` in the working directory.
    The integer-encoded log ``../Data/bpic2018_ints.csv`` is filtered with
    ``filter_attributes`` and scored with ``get_event_scores``. The scores are
    plotted, and a Gaussian KDE of the log10 of the mean score per key is
    shown (keys whose scores sum to zero are left out).
    """
    # The earlier LogFile load of "../Data/bpic2018.csv" was dropped: its
    # result was overwritten below before ever being used. Judging by the
    # previously commented-out code, it was only needed while the model was
    # created with create_model and pickled to "model_30000b".

    # NOTE: pickle.load can execute arbitrary code; only load a model file
    # that was produced locally and is trusted.
    with open("model_30000b", "rb") as fin:
        model = pickle.load(fin)

    data = pd.read_csv("../Data/bpic2018_ints.csv",
                       delimiter=",",
                       header=0,
                       dtype=int)
    data = filter_attributes(data, [
        "eventid", "identity_id", "event_identity_id", "year", "penalty_",
        "amount_applied", "payment_actual", "penalty_amount", "risk_factor",
        "cross_compliance", "selected_random", "selected_risk",
        "selected_manually", "rejected"
    ])
    print("Get Scores")
    scores = get_event_scores(data, model)
    plot_single_scores(scores)
    plot_pvalues(scores, 800)

    y = []
    for key in sorted(scores):
        total = sum(scores[key])
        # A zero total cannot be passed to log10, so such keys are skipped.
        if total != 0:
            y.append(math.log10(total / len(scores[key])))

    kernel = stats.gaussian_kde(y)
    # Evaluate the density on one shared grid instead of building it twice.
    grid = np.linspace(0, max(y), 1000)
    plt.plot(grid, kernel(grid))
    plt.show()
Example #11
0
def run(default_dataset="edbn/Data/BPIC15_1_sorted.csv", default_alias="run/"):
    """Preprocess a dataset, train and test an EDBN, and compute event scores.

    Args:
        default_dataset: Dataset path handed to ``preProcessFile``.
        default_alias: Folder/alias handed to ``preProcessFile``.

    Returns:
        A tuple ``(scores, (r, ps), model)`` where ``scores`` comes from
        ``get_event_scores``, ``r`` and ``ps`` from ``plot_pvalues(scores, 20)``
        and ``model`` is the trained EDBN.
    """
    # preProcessFile yields the train file, the test file and the folder the
    # test output is written to.
    train_path, test_path, experiment_folder = preProcessFile(
        default_dataset, default_alias)

    # Training log without the label and time columns.
    training_log = LogFile(train_path, ",", 0, 500000, None, "Case")
    training_log.remove_attributes(["Anomaly", "time"])
    model = edbn.train(training_log)

    # Test the model and save the scores in <experiment_folder>output.csv
    testing_log = LogFile(test_path, ",",
                          header=0, rows=500000,
                          time_attr=None, trace_attr="Case",
                          values=training_log.values)
    edbn.test(testing_log, experiment_folder + "output.csv", model,
              label="Anomaly", normal_val="0")

    scores = get_event_scores(testing_log.data, model)
    print("Finished scoring...")

    r, ps = plot_pvalues(scores, 20)
    return scores, (r, ps), model
Example #12
0
def train_camargo(data_folder, model_folder, architecture):
    """Train the Camargo embedding and prediction models.

    Args:
        data_folder: Prefix (concatenated as-is) of the folder containing
            ``full_log.csv``, ``train_log.csv`` and ``test_log.csv``.
        model_folder: Passed to both training routines.
        architecture: Stored as ``model_type``; choose from 'joint',
            'shared', 'concatenated', 'specialized', 'shared_cat'.
    """
    import RelatedMethods.Camargo.embedding_training as em
    import RelatedMethods.Camargo.model_training as mo

    def load_log(csv_name):
        # All three logs are read with identical settings.
        return LogFile(data_folder + csv_name, ",", 0, None, None, "case",
                       activity_attr="event", convert=False, k=0)

    logfile = load_log("full_log.csv")
    train_log = load_log("train_log.csv")
    test_log = load_log("test_log.csv")

    args = {
        "file_name": "data",
        "model_type": architecture,
        "norm_method": "lognorm",  # Choose from 'lognorm' or 'max'
        "n_size": 5,  # n-gram size
        'lstm_act': None,  # see keras doc
        'l_size': 100,  # LSTM layer sizes
        'imp': 1,  # keras lstm implementation 1 cpu, 2 gpu
        'dense_act': None,  # see keras doc
        'optim': 'Nadam',  # optimization function see keras doc
    }

    em.training_model(logfile, model_folder)
    mo.training_model(logfile, train_log, test_log, model_folder, args)
Example #13
0
def compare_bpic_total(path):
    """Run EDBN on the total BPIC15 log and plot it next to the Likelihood Graph scores.

    Args:
        path: Directory prefix, concatenated as-is with the file names.

    NOTE(review): nothing in this function writes the Likelihood Graph score
    file (``Output/BPIC_15_output_total.csv``); it is only handed to the plot
    functions, so presumably it has to exist from an earlier run.
    """
    out_dir = path + "Output/"
    output = out_dir + "BPIC_15_output_total.csv"
    output_edbn = out_dir + "BPIC15_edbn_output_total.csv"
    prec_recall = out_dir + "prec_recall_total.png"
    roc = out_dir + "roc_total.png"

    training_log = LogFile(path + "BPIC15_train_total.csv", ",", 0, 500000,
                           None, "Case")
    training_log.remove_attributes(["Anomaly"])
    # The two conversion mappings of the training log are passed positionally.
    testing_log = LogFile(path + "BPIC15_test_total.csv", ",", 0, 500000,
                          None, "Case",
                          training_log.string_2_int, training_log.int_2_string)

    edbn_model = edbn.train(training_log)
    edbn.test(testing_log, output_edbn, edbn_model, "Anomaly", "0")

    plt.plot_compare_prec_recall_curve([output, output_edbn],
                                       ["Likelihood Graph", "eDBN"],
                                       save_file=prec_recall)
    plt.plot_compare_roc_curve([output, output_edbn],
                               ["Likelihood Graph", "eDBN"], roc)
Example #14
0
def only_train(default_dataset="edbn/Data/BPIC15_1_sorted.csv",
               default_alias="run/"):
    """Preprocess a dataset and return an EDBN trained on its training file.

    Args:
        default_dataset: Dataset path handed to ``preProcessFile``.
        default_alias: Folder/alias handed to ``preProcessFile``.

    Returns:
        The model returned by ``edbn.train``.
    """
    # Only the first of the three values returned by preProcessFile is used.
    train_path, _test_path, _experiment_folder = preProcessFile(
        default_dataset, default_alias)

    # Training log without the label and time columns.
    training_log = LogFile(train_path, ",", 0, 500000, None, "Case")
    training_log.remove_attributes(["Anomaly", "time"])

    return edbn.train(training_log)
Example #15
0
def test_edbn(dataset_folder, model_folder, k):
    """Evaluate a pickled eDBN on the test log and append the accuracy to a log file.

    Args:
        dataset_folder: Prefix (concatenated as-is) of the folder holding
            ``train_log.csv`` and ``test_log.csv``.
        model_folder: Folder containing the pickled ``model`` file and, when
            ``k`` is None, a ``k`` file whose first line holds the value.
        k: Value passed as ``k`` to ``LogFile``; read from disk when None.
    """
    from eDBN_Prediction import predict_next_event

    with open(os.path.join(model_folder, "model"), "rb") as pickle_file:
        model = pickle.load(pickle_file)
    model.print_parents()

    if k is None:
        with open(os.path.join(model_folder, "k")) as k_file:
            k = int(k_file.readline())
            print("K=", k)

    log_options = dict(activity_attr="event", convert=True, k=k)

    # The training log is loaded for its ``values``, reused by the test log.
    reference_log = LogFile(dataset_folder + "train_log.csv", ",", 0, None,
                            None, "case", **log_options)
    evaluation_log = LogFile(dataset_folder + "test_log.csv", ",", 0, None,
                             None, "case", values=reference_log.values,
                             **log_options)
    evaluation_log.create_k_context()

    accuracy = predict_next_event(model, evaluation_log)

    results_path = os.path.join(model_folder, "results_next_event.log")
    with open(results_path, "a") as fout:
        timestamp = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
        fout.write("Accuracy: (%s) %s\n" % (timestamp, accuracy))
Example #16
0
def compare_bpics(path):
    """Run EDBN on BPIC15 logs 1-5 and plot it next to the Likelihood Graph scores.

    Args:
        path: Directory prefix, concatenated as-is with the file names.

    NOTE(review): nothing in this function writes the Likelihood Graph score
    files (``Output/BPIC15_output_<i>.csv``); they are only handed to the plot
    functions, so presumably they have to exist from an earlier run.
    """
    for i in range(1, 6):
        # File locations for this index.
        out_dir = path + "Output/"
        output = out_dir + "BPIC15_output_{}.csv".format(i)
        output_edbn = out_dir + "BPIC15_edbn_output_{}.csv".format(i)
        prec_recall = out_dir + "prec_recall_{}.png".format(i)
        roc = out_dir + "roc_{}.png".format(i)

        training_log = LogFile(path + "BPIC15_train_{}.csv".format(i), ",", 0,
                               500000, None, "Case")
        training_log.remove_attributes(["Anomaly"])
        # The two conversion mappings of the training log are passed
        # positionally.
        testing_log = LogFile(path + "BPIC15_test_{}.csv".format(i), ",", 0,
                              500000, None, "Case",
                              training_log.string_2_int,
                              training_log.int_2_string)

        edbn_model = edbn.train(training_log)
        edbn.test(testing_log, output_edbn, edbn_model, "Anomaly", "0")

        plt.plot_compare_prec_recall_curve([output, output_edbn],
                                           ["Likelihood Graph", "eDBN"],
                                           save_file=prec_recall)
        plt.plot_compare_roc_curve([output, output_edbn],
                                   ["Likelihood Graph", "eDBN"], roc)
Example #17
0
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage):
    """Run one baseline experiment and append its settings and result to a file.

    Args:
        data: Path of the CSV log (also written to the results file).
        prefix_size: Passed as ``k`` to ``LogFile``.
        add_end_event: When truthy, ``add_end_events`` is called on the log.
        split_method: ``method`` argument of ``splitTrainTest``.
        split_cases: ``case`` argument of ``splitTrainTest``.
        train_percentage: First argument of ``splitTrainTest``.

    The record is appended to ``Baseline/results.txt``. The settings are
    written before training starts, the accuracy afterwards.
    """
    log = LogFile(data, ",", 0, None, None, "case",
                  activity_attr="event", convert=False, k=prefix_size)
    if add_end_event:
        log.add_end_events()
    log.keep_attributes(["case", "event", "role"])
    log.convert2int()
    log.create_k_context()
    train_part, test_part = log.splitTrainTest(train_percentage,
                                               case=split_cases,
                                               method=split_method)

    with open("Baseline/results.txt", "a") as fout:
        # Experiment settings.
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: {}".format(prefix_size))
        fout.write("\nEnd event: {}".format(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: {}".format(split_cases))
        fout.write("\nTrain percentage: {}".format(train_percentage))
        started = time.strftime("%d.%m.%y-%H.%M", time.localtime())
        fout.write("\nDate: " + started)
        fout.write("\n------------------------------------")

        # Train, evaluate and record the baseline accuracy.
        baseline_model = train(train_part, epochs=100, early_stop=10)
        baseline_acc = test(test_part, baseline_model)
        fout.write("\nBaseline: {}".format(baseline_acc))
        fout.write("\n")
        fout.write("====================================\n\n")
Example #18
0
def run_sdl():
    """Train and evaluate the SDL method on a labeled BPIC15_1 outcome log.

    The log is reduced to a fixed set of columns, given a k-context (k=10)
    and split 80 / rest. "label" is added to ``ignoreHistoryAttributes`` of
    both parts before training. The result of ``test`` is printed.
    """
    from Methods.SDL.sdl import train, test

    source_csv = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"
    kept_columns = [
        "label", "Case_ID", "Activity", "monitoringResource", "question",
        "org_resource", "Responsible_actor", "SUMleges"
    ]

    log = LogFile(source_csv, ";", 0, None, "time_timestamp", "Case_ID",
                  activity_attr="label", convert=True, k=10)
    log.keep_attributes(kept_columns)
    log.create_k_context()

    train_part, test_part = log.splitTrainTest(80, True, "train-test")
    for part in (train_part, test_part):
        part.ignoreHistoryAttributes.add("label")

    model = train(train_part, 200, 42)
    evaluation = test(test_part, model)
    print(evaluation)

    # Placeholders: the lists are never filled and the loop body is empty.
    results1 = []
    results2 = []

    for case in test_part.get_cases():
        pass
0
    model = edbn_train(train_log)
    acc = predict_next_event(model, test_log)
    acc_update = predict_next_event_update(model, test_log)
    print("ACC:", acc, acc_update)


if __name__ == "__main__":
    # Script entry point: load the BPIC15_1 log and train one EDBN per entry
    # of the split returned by LogFile.split_days.
    data = "../Data/BPIC15_1_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    # Load the log unconverted, with "completeTime" as time attribute and k=5.
    logfile = LogFile(data,
                      ",",
                      0,
                      None,
                      "completeTime",
                      case_attr,
                      activity_attr=act_attr,
                      convert=False,
                      k=5)
    # Keep three attributes, convert to integers and build the k-context.
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    # NOTE(review): the variables are called "weeks" but the method is
    # split_days -- confirm the granularity of the returned split.
    weeks = logfile.split_days("%Y-%m-%d %H:%M:%S")
    weeks_sorted = sorted(weeks.keys())
    num_weeks = len(weeks_sorted)

    # Train a model on the "data" entry of every period, in sorted key order,
    # and store it next to the data under the "model" key.
    for i in range(num_weeks):
        weeks[weeks_sorted[i]]["model"] = edbn_train(
            weeks[weeks_sorted[i]]["data"])
Example #20
0
    for train_rate in train_rates:
        for test_rate in test_rates:
            anoms_rates.append((train_rate, test_rate))

    for i in range(len(anoms_rates)):
        print(anoms_rates[i])
        scores = []
        for run in range(1):
            print("Run %i" % run)
            train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            generator.create_shipment_data(10000, 10000, anoms_rates[i][0],
                                           anoms_rates[i][1], train_file,
                                           test_file)

            train_date = LogFile(train_file, ",", 0, 1000000, None, "Case")
            train_date.remove_attributes(["Anomaly"])
            test_date = LogFile(test_file,
                                ",",
                                0,
                                1000000,
                                None,
                                "Case",
                                string_2_int=train_date.string_2_int,
                                int_2_string=train_date.int_2_string)
            model = edbn.train(train_date)
            edbn.test(test_date, path + "Output_%i_%i.csv" % anoms_rates[i],
                      model, "Anomaly", "0")

            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
Example #21
0
                  validation_split=0.2,
                  batch_size=train_log.k)

    return results


if __name__ == "__main__":
    # Script entry point: train on a random 80 / rest case split of BPIC12W
    # and print the accuracy returned by test().
    # Alternative dataset: "../../Data/Helpdesk.csv"
    data = "../../Data/BPIC12W.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None,
                      time_attr="completeTime", trace_attr=case_attr,
                      activity_attr=act_attr, convert=False, k=5)
    # Convert to integers first, then build the k-context.
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(80, case=True, method="random")

    model = train(train_log, epochs=100, early_stop=10)
    acc = test(test_log, model)
    print(acc)
Example #22
0
def train(log):
    """Train a model on ``log`` by delegating to ``edbn_train``."""
    model = edbn_train(log)
    return model

def test(log, model):
    """Return the result of ``predict_next_event`` for ``model`` on ``log``.

    Note the argument order: the model is passed first to the predictor.
    """
    result = predict_next_event(model, log)
    return result


if __name__ == "__main__":
    # Script entry point: evaluate the EDBN wrappers above and the
    # base_adapter implementation on the same split of BPIC12W.
    # Alternative datasets: "../Data/Helpdesk.csv",
    # "../../Data/Taymouri_bpi_12_w.csv"
    data = "../Data/BPIC12W.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, None, case_attr,
                      activity_attr=act_attr, convert=False, k=4)
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    # NOTE(review): "test-train" differs from the more common "train-test"
    # spelling -- confirm it is a value splitTrainTest accepts.
    train_log, test_log = logfile.splitTrainTest(70,
                                                 case=True,
                                                 method="test-train")

    # First method: the train/test wrappers of this module.
    model = train(train_log)
    acc = test(test_log, model)
    print(acc)

    # Second method: base_adapter, imported only at this point.
    import base_adapter
    model2 = base_adapter.train(train_log, 100, 10)
    acc2 = base_adapter.test(test_log, model2)
    print(acc2)
Example #23
0
def train_edbn(data_folder, model_folder, k=None, next_event=True):
    """Train an eDBN on the training log and store the model and its ``k``.

    Args:
        data_folder: Prefix (concatenated as-is) of the folder holding
            ``train_log.csv``.
        model_folder: Folder in which the pickled ``model`` file and the
            text file ``k`` are written.
        k: Value passed as ``k`` to ``LogFile``. When None, the values 1 to 5
            are tried on an 80 / rest split of the training log and the one
            with the highest validation accuracy is used.
        next_event: Selects the validation measure during the search:
            ``predict_next_event`` when truthy, ``predict_suffix`` otherwise.
    """
    from EDBN.Execute import train
    from Predictions.eDBN_Prediction import learn_duplicated_events, predict_next_event, predict_suffix

    def load_train_log(history_size):
        return LogFile(data_folder + "train_log.csv", ",", 0, None, None,
                       "case", activity_attr="event", convert=False,
                       k=history_size)

    def prepare(log):
        # End events, integer conversion and k-context, in this order.
        log.add_end_events()
        log.convert2int()
        log.create_k_context()

    if k is None:
        evaluate = predict_next_event if next_event else predict_suffix
        best = {}
        for candidate_k in range(1, 6):
            fit_log, validation_log = load_train_log(candidate_k).splitTrainTest(80)

            prepare(fit_log)
            # The validation part takes over the values of the fit part
            # before it is prepared itself.
            validation_log.values = fit_log.values
            prepare(validation_log)

            candidate = train(fit_log)
            # Train average number of duplicated events
            candidate.duplicate_events = learn_duplicated_events(fit_log)

            acc = evaluate(candidate, validation_log)
            print("Testing k=", candidate_k, " | Validation acc:", acc)
            if "Acc" not in best or best["Acc"] < acc:
                best["Acc"] = acc
                best["Model"] = candidate
                best["k"] = candidate_k
        print("Best k value:", best["k"], " | Validation acc of",
              best["Acc"])
        k = best["k"]

    # Final model: trained on the complete training log with the chosen k.
    full_log = load_train_log(k)
    prepare(full_log)

    final_model = train(full_log)
    # Train average number of duplicated events
    final_model.duplicate_events = learn_duplicated_events(full_log)

    with open(os.path.join(model_folder, "model"), "wb") as pickle_file:
        pickle.dump(final_model, pickle_file)

    with open(os.path.join(model_folder, "k"), "w") as outfile:
        outfile.write(str(k))
Example #24
0
def get_data(dataset, dataset_size, k, add_end, reduce_tasks, resource_pools, remove_resource):
    """Return the preprocessed log for ``dataset``, using a pickle cache on disk.

    The cache name is ``dataset``, ``dataset_size``, ``k`` and the four
    preprocessing flags (each written as ``1`` or ``0``) joined with ``_``.
    When a file with that name exists under ``LOGFILE_PATH`` it is unpickled
    and returned.  Otherwise the dataset's CSV is loaded, reduced to three
    columns, optionally filtered on a minimum case length, run through
    ``preprocess``, given its k-context, and written to the cache.

    Args:
        dataset: One of the dataset constants handled below.  It is also the
            first component of the cache name, so it must be a string.
        dataset_size: Passed to ``LogFile`` as its fourth positional argument
            (the row limit).
        k: Passed to ``LogFile`` as ``k``.
        add_end: Forwarded to ``preprocess``; its truthiness is part of the
            cache name.
        reduce_tasks: Forwarded to ``preprocess``; part of the cache name.
        resource_pools: Forwarded to ``preprocess``; part of the cache name.
        remove_resource: Forwarded to ``preprocess``; part of the cache name.

    Returns:
        A ``(preprocessed_log, cache_name)`` tuple.  For an unrecognised
        dataset a bare ``None`` is returned instead (kept for backward
        compatibility), so callers must check before unpacking.
    """
    flags = (add_end, reduce_tasks, resource_pools, remove_resource)
    filename_parts = [dataset, str(dataset_size), str(k)]
    filename_parts.extend("1" if flag else "0" for flag in flags)
    print(filename_parts)
    cache_name = "_".join(filename_parts)
    cache_file = LOGFILE_PATH + "/" + cache_name

    if os.path.exists(cache_file):
        print("Loading file from cache")
        # NOTE: unpickling runs arbitrary code from the file.  This is only
        # acceptable because the cache is written by this same function.
        with open(cache_file, "rb") as pickle_file:
            return pickle.load(pickle_file), cache_name

    # One row per supported dataset:
    # (matching constants, csv path, time attr, case attr, activity attr,
    #  resource attr handed to preprocess, third kept column,
    #  minimum case length or None for "do not filter")
    bpic15_layout = ("Complete Timestamp", "Case ID", "Activity", "Resource", "Resource", 5)
    bpic12_layout = ("completeTime", "case", "event", "org:resource", "org:resource", 5)
    known_datasets = (
        ((BPIC15_1, BPIC15), "../Data/BPIC15_1_sorted_new.csv") + bpic15_layout,
        ((BPIC15_2,), "../Data/BPIC15_2_sorted_new.csv") + bpic15_layout,
        ((BPIC15_3,), "../Data/BPIC15_3_sorted_new.csv") + bpic15_layout,
        ((BPIC15_4,), "../Data/BPIC15_4_sorted_new.csv") + bpic15_layout,
        ((BPIC15_5,), "../Data/BPIC15_5_sorted_new.csv") + bpic15_layout,
        ((BPIC12,), "../Data/BPIC12.csv") + bpic12_layout,
        ((BPIC12W,), "../Data/BPIC12W.csv") + bpic12_layout,
        ((HELPDESK,), "../Data/Helpdesk.csv",
         "completeTime", "case", "event", "Resource", "Resource", 3),
        ((BPIC18,), "../Data/bpic2018.csv",
         "startTime", "case", "event", None, "subprocess", None),
    )

    # First matching row wins, mirroring the order of the former elif chain.
    spec = next(
        (entry for entry in known_datasets
         if any(dataset == key for key in entry[0])),
        None,
    )
    if spec is None:
        print("Unknown Dataset")
        return None

    (_, csv_path, time_attr, case_attr, activity_attr,
     resource_attr, third_column, min_case_length) = spec

    logfile = LogFile(csv_path, ",", 0, dataset_size, time_attr, case_attr,
                      activity_attr=activity_attr, convert=False, k=k)
    logfile.keep_attributes([case_attr, activity_attr, third_column])
    if min_case_length is not None:
        logfile.filter_case_length(min_case_length)

    preprocessed_log = preprocess(logfile, add_end, reduce_tasks, resource_pools,
                                  resource_attr, remove_resource)
    preprocessed_log.create_k_context()

    with open(cache_file, "wb") as pickle_file:
        pickle.dump(preprocessed_log, pickle_file)
    return preprocessed_log, cache_name
Example #25
0
                                hidden_size=2 * len(selected_columns), num_layers=2, num_directions=1)
    optimizerD = torch.optim.Adam(rnnD.parameters(), lr=0.0002, betas=(0.5, 0.999))

    # Training and testing
    ep.train(rnnD=rnnD, rnnG=rnnG, optimizerD=optimizerD, optimizerG=optimizerG, obj=train_data, epoch=epoch)

    return rnnG

def test(model, test_log, batch_size=5):
    """Evaluate ``model`` on ``test_log`` and return its evaluation result.

    The log is wrapped in an ``adapted_Input`` first.  The generator stored as
    ``rnnG(validation).m`` under that input's ``path`` is loaded and evaluated
    as well, but its result is discarded; only the result obtained for
    ``model`` is returned.

    Args:
        model: Generator passed to ``ep.model_eval_test`` as ``modelG``.
        test_log: Log handed to ``adapted_Input.run``.
        batch_size: Second argument of ``adapted_Input.run``.

    Returns:
        Whatever ``ep.model_eval_test`` returns for ``model``.
    """
    eval_input = adapted_Input()
    eval_input.run(test_log, batch_size, False)

    # The saved generator is loaded up front, before anything is evaluated.
    validation_path = eval_input.path + "/rnnG(validation).m"
    validation_generator = torch.load(validation_path)

    print("EVAL model")
    result = ep.model_eval_test(modelG=model, mode='test', obj=eval_input)

    print("EVAL model from validation")
    ep.model_eval_test(modelG=validation_generator, mode='test', obj=eval_input)

    return result

if __name__ == "__main__":
    # Manual run: train on the Helpdesk log and report test accuracy.
    from data import Data
    import setting
    from metric import ACCURACY

    helpdesk_log = LogFile("../../Data/Helpdesk.csv", ",", 0, None,
                           "completeTime", "case",
                           activity_attr="event", convert=False)
    helpdesk = Data("Helpdesk", helpdesk_log)

    # Only these three attributes are kept before the data is prepared.
    helpdesk.logfile.keep_attributes(["event", "role", "completeTime"])
    helpdesk.prepare(setting.STANDARD)

    generator = train(helpdesk.train)
    results = test(generator, helpdesk.test_orig)
    print("Accuracy:", ACCURACY.calculate(results))
Example #26
0
def duration_test_discretize():
    """Run the discretized-duration anomaly experiment over a grid of rates.

    Every combination of a training rate and a test rate is evaluated
    ``RUNS`` times.  Each run generates fresh train/test CSV files with
    ``duration_generator``, adds the duration to the k-context, discretizes
    ``duration_0`` into 10 bins on the training data, reuses those bins for
    the test data, trains an EDBN model, scores the test log and reads the
    ROC AUC from the score file.  After the runs of one combination, the
    scores plus their mean, median and variance are appended to
    ``results.txt`` in the experiment folder.

    Generated files and the score file are named per combination only, so
    every run overwrites the files of the previous run.
    """
    path = "../Data/Experiments_Discretize/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]
    anoms_rates = [(train_rate, test_rate)
                   for train_rate in train_rates
                   for test_rate in test_rates]

    for i, rates in enumerate(anoms_rates):
        train_rate, test_rate = rates
        print(rates)

        # File names depend on the combination only, not on the run.
        train_file = path + "%i_train_%i.csv" % (i, train_rate)
        test_file = path + "%i_test_%i.csv" % (i, test_rate)
        output_file = path + "Output_%i_%i.csv" % rates

        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            duration_generator.generate(10000, 10000, train_rate, test_rate,
                                        train_file, test_file)

            train_data = LogFile(train_file,
                                 ",",
                                 0,
                                 1000000,
                                 "date",
                                 "trace",
                                 convert=False)
            # NOTE(review): this removes "Anomaly" while the test log keeps
            # "anomaly" (lower case) as its label column - confirm the
            # casing used in the generated files.
            train_data.remove_attributes(["Anomaly"])
            train_data.keep_attributes(
                ["event", "date", "trace", "process", "resource", "random"])
            train_data.convert2int()

            train_data.create_k_context()
            train_data.add_duration_to_k_context()
            bins = train_data.discretize("duration_0", bins=10)

            test_data = LogFile(test_file,
                                ",",
                                0,
                                1000000,
                                "date",
                                "trace",
                                values=train_data.values,
                                convert=False)
            test_data.keep_attributes([
                "event", "date", "trace", "process", "resource", "random",
                "anomaly"
            ])
            test_data.convert2int()

            test_data.create_k_context()
            test_data.add_duration_to_k_context()
            # Reuse the bins obtained from the training data.
            test_data.discretize("duration_0", bins)

            model = edbn.train(train_data)
            edbn.test(test_data, output_file, model, "anomaly", "0")

            # Compute the AUC once and reuse it for storing and printing.
            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" %
                       (train_rate, test_rate))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" %
                       (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
Example #27
0
def train_lin(data_folder, model_folder):
    """Train the Lin model from the logs found in ``data_folder``.

    ``full_log.csv`` is loaded first and converted to integers.  Its value
    mapping is then handed to ``train_log.csv`` so both logs share one
    encoding.  End events are added to both logs before conversion.

    Args:
        data_folder: Path prefix prepended directly to ``full_log.csv`` and
            ``train_log.csv`` (no separator is inserted).
        model_folder: Passed unchanged as the third argument of the Lin
            ``train`` function.
    """
    from RelatedMethods.Lin.model import train

    # Keyword options shared by both LogFile instances.
    shared_options = dict(activity_attr="event", convert=False, k=0)

    full_log = LogFile(data_folder + "full_log.csv", ",", 0, None, None,
                       "case", **shared_options)
    full_log.add_end_events()
    full_log.convert2int()

    # The values are read only after the full log has been converted.
    training_log = LogFile(data_folder + "train_log.csv", ",", 0, None, None,
                           "case", values=full_log.values, **shared_options)
    training_log.add_end_events()
    training_log.convert2int()

    train(full_log, training_log, model_folder)
Example #28
0
def run_experiment(data,
                   prefix_size,
                   add_end_event,
                   split_method,
                   split_cases,
                   train_percentage,
                   filename="results.txt"):
    """Run all prediction methods on one dataset, each in its own process.

    The log is loaded from ``DATA_FOLDER + data``, optionally extended with
    end events, reduced to the ``case``, ``event`` and ``role`` attributes,
    converted to integers, given its k-context and split into a train and
    test log.  A header describing the settings is appended to ``filename``.
    One process per method is then started with ``(train_log, test_log,
    filename)`` as arguments.  Once all processes have been joined, a
    closing separator is appended to ``filename``.

    Args:
        data: File name of the dataset, relative to ``DATA_FOLDER``.
        prefix_size: Value for ``k``.  ``None`` selects the size of the
            largest case, capped at 40.
        add_end_event: When truthy, ``add_end_events`` is called on the log.
        split_method: Passed to ``splitTrainTest`` as ``method`` and written
            to the header, so it must be a string.
        split_cases: Passed to ``splitTrainTest`` as ``case``.
        train_percentage: First argument of ``splitTrainTest``.
        filename: File the header and closing separator are appended to.
    """
    data = DATA_FOLDER + data
    logfile = LogFile(data, ",", 0, None, "completeTime", "case",
                      activity_attr="event", convert=False, k=prefix_size)

    if prefix_size is None:
        longest_case = max(logfile.data.groupby(logfile.trace).size())
        prefix_size = 40 if longest_case > 40 else longest_case
    logfile.k = prefix_size

    if add_end_event:
        logfile.add_end_events()
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()
    train_log, test_log = logfile.splitTrainTest(train_percentage,
                                                 case=split_cases,
                                                 method=split_method)

    with open(filename, "a") as report:
        header_fields = (
            ("Data: ", data),
            ("\nPrefix Size: ", str(prefix_size)),
            ("\nEnd event: ", str(add_end_event)),
            ("\nSplit method: ", split_method),
            ("\nSplit cases: ", str(split_cases)),
            ("\nTrain percentage: ", str(train_percentage)),
            ("\nDate: ", time.strftime("%d.%m.%y-%H.%M", time.localtime())),
        )
        for label, value in header_fields:
            report.write(label + value)
        report.write("\n------------------------------------\n")

    # Each method is paired with the name of the process that runs it.
    methods = (
        (execute_tax, "Tax"),
        (execute_taymouri, "Taymouri"),
        (execute_camargo, "Camargo"),
        (execute_lin, "Lin"),
        (execute_dimauro, "Di Mauro"),
        (execute_pasquadibisceglie, "Pasquadibisceglie"),
        (execute_edbn, "EDBN"),
        (execute_baseline, "Baseline"),
    )
    processes = [
        Process(target=runner,
                args=(train_log, test_log, filename),
                name=label)
        for runner, label in methods
    ]

    print("Starting Processes")
    for worker in processes:
        worker.start()
        print(worker.name, "started")

    print("All processes running")

    for worker in processes:
        worker.join()
        print(worker.name, "stopped")

    with open(filename, "a") as report:
        report.write("====================================\n\n")

    print("All processes stopped")
Example #29
0
from RelatedMethods.Lin.model import create_model, predict_next
from Utils.LogFile import LogFile


def train(log, epochs=200, early_stop=42):
    """Create a Lin model for ``log`` through ``create_model``.

    Args:
        log: Log passed as the first argument of ``create_model``.
        epochs: Passed as the third argument of ``create_model``.
        early_stop: Passed as the fourth argument of ``create_model``.

    Returns:
        The value returned by ``create_model``.
    """
    # "tmp" is the fixed second argument (presumably the output folder -
    # confirm against create_model).
    model = create_model(log, "tmp", epochs, early_stop)
    return model


def test(log, model):
    """Evaluate ``model`` on ``log`` and return the result of ``predict_next``.

    Args:
        log: Log passed as the first argument of ``predict_next``.
        model: Model passed as the second argument of ``predict_next``.
    """
    result = predict_next(log, model)
    return result


if __name__ == "__main__":
    # Manual run: evaluate a previously saved model on a 70/30 case split.
    csv_path = "../../Data/BPIC15_5_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(csv_path, ",", 0, None, None, case_attr,
                      activity_attr=act_attr, convert=False, k=1)
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(70, case=True,
                                                 method="train-test")

    # To retrain instead of loading a saved model:
    #     model = train(train_log, epochs=100, early_stop=5)
    # NOTE(review): load_model and Modulator are not imported in the visible
    # part of this file - confirm they are in scope before running.
    saved_model_path = "../../Predictions/tmp/model_001-4.51.h5"
    model = load_model(saved_model_path,
                       custom_objects={'Modulator': Modulator})

    acc = test(test_log, model)
    print(acc)

Example #30
0
def run_edbn():
    """Predict the outcome label of each test case with an EDBN model.

    The labelled BPIC15 log is loaded, reduced to the listed columns, given
    its k-context and split 80/20 into train and test logs.  ``label`` is
    added to the training log's ``ignoreHistoryAttributes`` before the model
    is trained.

    For every test case the probabilities returned by ``get_probabilities``
    for label values 1 and 2 are added up over all rows of the case (both
    sums start at 1).  The case counts as correct when the sum for its
    ground-truth label, taken from the last row, is strictly larger than the
    other sum.  The number of cases and the fraction of correct cases are
    printed per ground-truth label.  A label without any test case prints
    ``nan`` instead of raising ``ZeroDivisionError``.
    """
    from eDBN_Prediction import get_probabilities
    from Methods.EDBN.Train import train

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"

    log = LogFile(labeled_logfile,
                  ";",
                  0,
                  None,
                  "time_timestamp",
                  "Case_ID",
                  activity_attr="label",
                  convert=True,
                  k=1)

    columns = [
        "label", "Case_ID", "time_timestamp", "Activity", "monitoringResource",
        "question", "org_resource", "Responsible_actor", "SUMleges"
    ]
    log.keep_attributes(columns)

    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")

    train_log.ignoreHistoryAttributes.add("label")

    model = train(train_log)

    # The label variable and its parents are the same for every row, so they
    # are looked up once instead of once per row.
    activity_var = model.variables["label"]
    parents = activity_var.conditional_table.parents

    results1 = []
    results2 = []

    for case in test_log.get_cases():
        case_df = case[1]
        case_probs = {1: 1, 2: 1}
        ground = 0
        for _, row in case_df.iterrows():
            # After the loop this holds the label of the last row.
            ground = getattr(row, "label")

            tuple_val = tuple(
                getattr(row, parent.attr_name) for parent in parents)
            probs, _unknown = get_probabilities(activity_var, tuple_val,
                                                parents)
            case_probs[1] += probs.get(1, 0)
            case_probs[2] += probs.get(2, 0)

        if ground == 1:
            results1.append(1 if case_probs[1] > case_probs[2] else 0)

        if ground == 2:
            results2.append(1 if case_probs[2] > case_probs[1] else 0)

    for results in (results1, results2):
        # Guard against a label that never occurs in the test log.
        accuracy = sum(results) / len(results) if results else float("nan")
        print(len(results), accuracy)