Esempio n. 1
0
def stephenRun():
    """Train an eDBN model on the BPIC15_1 training log and score the test log.

    The scores are written to ``../Data/output.csv`` and a ROC curve is
    plotted from that file.
    """
    # The train/test files (with anomalies introduced) are generated from
    # BPIC15_x_sorted.csv; this only has to be done once, so it is disabled.
    # preProcessData("../Data/")

    data_dir = "../Data/"
    train_path = data_dir + "BPIC15_train_1.csv"
    test_path = data_dir + "BPIC15_test_1.csv"
    scores_path = data_dir + "output.csv"

    # Drop the "Anomaly" column before training; it is used as the label
    # when testing below.
    training_log = LogFile(train_path, ",", 0, 500000, None, "Case")
    training_log.remove_attributes(["Anomaly"])

    trained_model = edbn.train(training_log)

    # The test log is built with the `values` of the training log.
    testing_log = LogFile(
        test_path,
        ",",
        header=0,
        rows=500000,
        time_attr=None,
        trace_attr="Case",
        values=training_log.values,
    )
    edbn.test(
        testing_log,
        scores_path,
        trained_model,
        label="Anomaly",
        normal_val="0",
    )

    # Plot the ROC curve from the scores that were just written.
    plot.plot_single_roc_curve(scores_path)
Esempio n. 2
0
def train_vars_and_test(model, alias, filename, event_emit_obj):
    """Train ``model`` on an uploaded dataset, test it and return event scores.

    Progress is reported through ``event_emit_obj`` as ``'score_resp'``
    events carrying a ``step`` number (2 through 7) and a ``msg``.

    Args:
        model: Passed to ``edbn.train_seperate`` together with the training
            log.
        alias: Sub-folder of ``UPLOAD_FOLDER`` that holds the dataset; the
            test output ``output.csv`` is written into the same folder.
        filename: Name of the dataset file inside that folder.
        event_emit_obj: Callable invoked as
            ``event_emit_obj(event_name, payload_dict)``.

    Returns:
        Whatever ``get_event_scores`` returns for the test data and the
        trained model.
    """
    folder = UPLOAD_FOLDER + "/" + alias + "/"
    file = folder + filename

    train_file = get_constructed_file(file)
    test_file = get_constructed_file(file, type="test")

    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])

    event_emit_obj('score_resp', {'step': 2, "msg": "Data loaded."})

    train_data.create_k_context()
    event_emit_obj('score_resp', {
        'step': 3,
        "msg": "Build K-Context for data."
    })

    model_trained_on_data = edbn.train_seperate(train_data, model)

    event_emit_obj('score_resp', {'step': 4, "msg": "Finished training data."})

    test_data = LogFile(test_file,
                        ",",
                        header=0,
                        rows=500000,
                        time_attr=None,
                        trace_attr="Case",
                        values=train_data.values)

    edbn.test(test_data,
              folder + "output.csv",
              model_trained_on_data,
              label="Anomaly",
              normal_val="0")

    event_emit_obj('score_resp', {'step': 5, "msg": "Finished testing"})

    event_emit_obj('score_resp', {'step': 6, "msg": "Preparing to score."})
    scores = get_event_scores(test_data.data, model_trained_on_data)

    # Print one randomly chosen key and its string form as a sanity check.
    # np.random.randint(0, 0) raises ValueError, so skip the sample when
    # there are no scores instead of failing after all the work is done.
    score_keys = list(scores.keys())
    if score_keys:
        random_key = score_keys[np.random.randint(0, len(score_keys))]
        print(random_key)
        print(test_data.convert_int2string('Case', int(random_key)))

    event_emit_obj('score_resp', {'step': 7, "msg": "Finished scoring!"})

    print("Finished scoring...")

    return scores
Esempio n. 3
0
def test_file(file):
    """Split, train, test and plot for the dataset with path prefix ``file``.

    ``<file>_data.csv`` and ``<file>_labels.csv`` are handed to
    ``split_dataset`` together with the train/test output paths. A model is
    trained on ``<file>_train.csv`` and tested on ``<file>_test.csv``; the
    scores go to ``<file>_output.csv``. ROC and precision/recall plots are
    saved under ``../Data/Nolle_Graphs/`` using the last path component of
    ``file`` as the base name.
    """
    train_csv = file + "_train.csv"
    test_csv = file + "_test.csv"
    output_csv = file + "_output.csv"

    split_dataset(file + "_data.csv", file + "_labels.csv", train_csv,
                  test_csv, 10000)

    train_log = LogFile(train_csv, ",", 0, 1000000, None, "case_id", "event")
    model = edbn.train(train_log)

    test_log = LogFile(test_csv, ",", 0, 1000000, None, "case_id", "event",
                       values=train_log.values)
    edbn.test(test_log, output_csv, model, "label", "0")

    graph_base = "../Data/Nolle_Graphs/" + file.split("/")[-1]
    plot.plot_single_roc_curve(output_csv, file,
                               save_file=graph_base + "_roc.png")
    plot.plot_single_prec_recall_curve(output_csv, file,
                                       save_file=graph_base + "_precrec.png")
Esempio n. 4
0
def breast_discrete_exec():
    """Run the eDBN train/test/plot pipeline on the breast dataset.

    Reads ``../Data/breast_data.csv`` and ``../Data/breast_labels.csv``,
    writes ``breast_train.csv`` / ``breast_test.csv`` next to them, trains on
    the label-0 rows among the first 100 rows, tests on the remaining rows
    and plots ROC and precision/recall curves from
    ``../Data/breast_discrete_output.csv``.
    """
    train_csv = "../Data/breast_train.csv"
    test_csv = "../Data/breast_test.csv"
    output_csv = "../Data/breast_discrete_output.csv"

    frame = pd.read_csv("../Data/breast_data.csv", header=None)
    label_frame = pd.read_csv("../Data/breast_labels.csv", header=None)
    frame["Label"] = label_frame[0]

    # Prefix every column name with "V" (so "Label" becomes "VLabel"), then
    # add a running row number as the "ID" column.
    frame.columns = ["V" + str(name) for name in frame.columns]
    frame['ID'] = frame.reset_index().index
    print(frame)

    # First 100 rows are the training pool, the rest is the test set.
    train_part = frame[:100]
    test_part = frame[100:]
    # Train only on rows labelled 0 and hide the label column from training.
    is_normal = train_part.VLabel == 0
    train_part = train_part[is_normal].drop(columns=["VLabel"])

    train_part.to_csv(train_csv, index=False)
    test_part.to_csv(test_csv, index=False)

    train_log = LogFile(train_csv, ",", 0, 500000, None, "ID",
                        activity_attr="Activity")
    train_log.k = 0
    model = edbn.train(train_log)

    test_log = LogFile(test_csv, ",", 0, 500000, None, "ID",
                       activity_attr="Activity")
    test_log.k = 0
    print(test_log.data)
    edbn.test(test_log, output_csv, model, "VLabel", "0")

    plot.plot_single_roc_curve(output_csv, "breast_discrete")
    plot.plot_single_prec_recall_curve(output_csv, "breast_discrete")
Esempio n. 5
0
def compare_bpics(path):
    """Train and test eDBN on BPIC15 logs 1-5 and draw comparison plots.

    For each index, ``BPIC15_train_<i>.csv`` and ``BPIC15_test_<i>.csv`` are
    read from ``path``; eDBN scores, a precision/recall plot and a ROC plot
    are written below ``<path>Output/``.

    Args:
        path: Directory prefix (string concatenation is used, so it should
            end with a path separator).
    """
    out_dir = path + "Output/"
    for idx in range(1, 6):
        # Input files
        train_csv = path + "BPIC15_train_%i.csv" % (idx)
        test_csv = path + "BPIC15_test_%i.csv" % (idx)
        # Output files
        output = out_dir + "BPIC15_output_%i.csv" % (idx)
        output_edbn = out_dir + "BPIC15_edbn_output_%i.csv" % (idx)
        prec_recall = out_dir + "prec_recall_%i.png" % (idx)
        roc = out_dir + "roc_%i.png" % (idx)

        # Both logs are loaded with convert=False and converted explicitly
        # further down.
        train_log = LogFile(train_csv, ",", 0, 500000, "Time", "Case",
                            activity_attr="Activity", convert=False)
        train_log.remove_attributes(["Anomaly", "Type", "Time"])
        test_log = LogFile(test_csv, ",", 0, 500000, "Time", "Case",
                           activity_attr="Activity",
                           values=train_log.values, convert=False)

        # NOTE(review): the Likelihood Graph run that would write `output`
        # is disabled, so that file is presumably expected to exist already.
        #bohmer_model = bmr.train(train_data)
        #bmr.test(test_data, output, bohmer_model, label = "Anomaly", normal_val = "0")

        train_log.convert2int()
        test_log.convert2int()

        edbn_model = edbn.train(train_log)
        edbn.test(test_log, output_edbn, edbn_model,
                  label="Anomaly", normal_val="0")

        plt.plot_compare_prec_recall_curve(
            [output, output_edbn],
            ["Likelihood Graph", "eDBN"],
            save_file=prec_recall,
        )
        plt.plot_compare_roc_curve(
            [output, output_edbn],
            ["Likelihood Graph", "eDBN"],
            roc,
        )
Esempio n. 6
0
def run(default_dataset="edbn/Data/BPIC15_1_sorted.csv", default_alias="run/"):
    """Preprocess a dataset, train and test an eDBN model, and score events.

    Args:
        default_dataset: Dataset path handed to ``preProcessFile``.
        default_alias: Folder handed to ``preProcessFile``.

    Returns:
        A tuple ``(scores, (r, ps), model)`` where ``scores`` comes from
        ``get_event_scores``, ``(r, ps)`` from ``plot_pvalues(scores, 20)``
        and ``model`` is the trained eDBN model.
    """
    # preProcessFile yields the train file, the test file and the folder in
    # which this experiment's output is stored.
    train_path, test_path, out_dir = preProcessFile(default_dataset,
                                                    default_alias)

    # Load the training log and drop the columns excluded from training.
    train_log = LogFile(train_path, ",", 0, 500000, None, "Case")
    train_log.remove_attributes(["Anomaly", "time"])

    model = edbn.train(train_log)

    # Test the model; the scores are saved as output.csv in the experiment
    # folder.
    test_log = LogFile(
        test_path,
        ",",
        header=0,
        rows=500000,
        time_attr=None,
        trace_attr="Case",
        values=train_log.values,
    )
    edbn.test(
        test_log,
        out_dir + "output.csv",
        model,
        label="Anomaly",
        normal_val="0",
    )

    scores = get_event_scores(test_log.data, model)

    print("Finished scoring...")

    r, ps = plot_pvalues(scores, 20)
    return scores, (r, ps), model
Esempio n. 7
0
def compare_bpic_total(path):
    """Train and test eDBN on the combined BPIC15 log and draw comparison plots.

    Reads ``BPIC15_train_total.csv`` / ``BPIC15_test_total.csv`` from
    ``path`` and writes the eDBN scores and both plots below
    ``<path>Output/``.

    Args:
        path: Directory prefix (string concatenation is used, so it should
            end with a path separator).
    """
    out_dir = path + "Output/"
    train_csv = path + "BPIC15_train_total.csv"
    test_csv = path + "BPIC15_test_total.csv"
    output = out_dir + "BPIC_15_output_total.csv"
    output_edbn = out_dir + "BPIC15_edbn_output_total.csv"
    prec_recall = out_dir + "prec_recall_total.png"
    roc = out_dir + "roc_total.png"

    # NOTE(review): the Likelihood Graph run that would write `output` is
    # disabled, so that file is presumably expected to exist already.
    #bohmer_model = bmr.train(train, header = 0, length = 5000000)
    #bmr.test(train, test, output, bohmer_model, ",", 5000000, skip=0)

    train_log = LogFile(train_csv, ",", 0, 500000, None, "Case")
    train_log.remove_attributes(["Anomaly"])
    # The test log receives the string<->int mappings of the training log.
    test_log = LogFile(test_csv, ",", 0, 500000, None, "Case",
                       train_log.string_2_int, train_log.int_2_string)

    edbn_model = edbn.train(train_log)
    edbn.test(test_log, output_edbn, edbn_model, "Anomaly", "0")

    plt.plot_compare_prec_recall_curve(
        [output, output_edbn],
        ["Likelihood Graph", "eDBN"],
        save_file=prec_recall,
    )
    plt.plot_compare_roc_curve(
        [output, output_edbn],
        ["Likelihood Graph", "eDBN"],
        roc,
    )
Esempio n. 8
0
def compare_bpics(path):
    """Train and test eDBN on BPIC15 logs 1-5 and draw comparison plots.

    For each index, ``BPIC15_train_<i>.csv`` and ``BPIC15_test_<i>.csv`` are
    read from ``path``; eDBN scores, a precision/recall plot and a ROC plot
    are written below ``<path>Output/``.

    Args:
        path: Directory prefix (string concatenation is used, so it should
            end with a path separator).
    """
    out_dir = path + "Output/"
    for idx in range(1, 6):
        # Input files
        train_csv = path + "BPIC15_train_%i.csv" % (idx)
        test_csv = path + "BPIC15_test_%i.csv" % (idx)
        # Output files
        output = out_dir + "BPIC15_output_%i.csv" % (idx)
        output_edbn = out_dir + "BPIC15_edbn_output_%i.csv" % (idx)
        prec_recall = out_dir + "prec_recall_%i.png" % (idx)
        roc = out_dir + "roc_%i.png" % (idx)

        # NOTE(review): the Likelihood Graph run that would write `output`
        # is disabled, so that file is presumably expected to exist already.
        #bohmer_model = bmr.train(train + "_ints", header = 0, length = 500000)
        #bmr.test(train + "_ints", test + "_ints", output, bohmer_model, ",", 500000, skip=0)

        train_log = LogFile(train_csv, ",", 0, 500000, None, "Case")
        train_log.remove_attributes(["Anomaly"])
        # The test log receives the string<->int mappings of the training log.
        test_log = LogFile(test_csv, ",", 0, 500000, None, "Case",
                           train_log.string_2_int, train_log.int_2_string)

        edbn_model = edbn.train(train_log)
        edbn.test(test_log, output_edbn, edbn_model, "Anomaly", "0")

        plt.plot_compare_prec_recall_curve(
            [output, output_edbn],
            ["Likelihood Graph", "eDBN"],
            save_file=prec_recall,
        )
        plt.plot_compare_roc_curve(
            [output, output_edbn],
            ["Likelihood Graph", "eDBN"],
            roc,
        )
Esempio n. 9
0
def only_train(default_dataset="edbn/Data/BPIC15_1_sorted.csv",
               default_alias="run/"):
    """Preprocess a dataset and return an eDBN model trained on it.

    Args:
        default_dataset: Dataset path handed to ``preProcessFile``.
        default_alias: Folder handed to ``preProcessFile``.

    Returns:
        The model returned by ``edbn.train``.
    """
    # Only the training file is needed; the test file and experiment folder
    # that preProcessFile also returns are ignored.
    train_path, _test_path, _out_dir = preProcessFile(default_dataset,
                                                      default_alias)

    # Load the training log and drop the columns excluded from training.
    train_log = LogFile(train_path, ",", 0, 500000, None, "Case")
    train_log.remove_attributes(["Anomaly", "time"])

    return edbn.train(train_log)
Esempio n. 10
0
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            generator.create_shipment_data(10000, 10000, anoms_rates[i][0],
                                           anoms_rates[i][1], train_file,
                                           test_file)

            train_date = LogFile(train_file, ",", 0, 1000000, None, "Case")
            train_date.remove_attributes(["Anomaly"])
            test_date = LogFile(test_file,
                                ",",
                                0,
                                1000000,
                                None,
                                "Case",
                                string_2_int=train_date.string_2_int,
                                int_2_string=train_date.int_2_string)
            model = edbn.train(train_date)
            edbn.test(test_date, path + "Output_%i_%i.csv" % anoms_rates[i],
                      model, "Anomaly", "0")

            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
            output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]

            score = plt.get_roc_auc(output_file)
            scores.append(plt.get_roc_auc(output_file))
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" %
                       (anoms_rates[i][0], anoms_rates[i][1]))
            fout.write("Result: " + str(scores) + "\n")
Esempio n. 11
0
def run_full():
    """Train/test eDBN on BPIC15 data, plot per-log curves and a comparison.

    For every processed index the scores are written to
    ``../Data/output2_<i>.csv`` and ROC / precision-recall curves are
    plotted. Afterwards comparison plots are drawn over
    ``output2_1.csv`` .. ``output2_5.csv``.
    """
    # The train/test files (with anomalies introduced) are generated from
    # BPIC15_x_sorted.csv; this only has to be done once, so it is disabled.
    #preProcessData("../Data/")

    # NOTE(review): only index 1 is processed here, while the comparison at
    # the end reads outputs 1..5 - confirm whether this is intentional.
    for idx in range(1, 2):
        train_path = "../Data/bpic15_%i_train.csv" % (idx)
        test_path = "../Data/bpic15_%i_test.csv" % (idx)
        scores_path = "../Data/output2_%i.csv" % (idx)

        # Load the training log.
        train_log = LogFile(
            train_path,
            ",",
            0,
            500000,
            time_attr="Complete_Timestamp",
            trace_attr="Case_ID",
            activity_attr="Activity",
        )
        # Remove the label and the date-like attributes, one call each.
        for attribute in ("Anomaly", "planned", "dueDate", "dateFinished"):
            train_log.remove_attributes([attribute])

        model = edbn.train(train_log)

        # Test the model and save the scores.
        test_log = LogFile(
            test_path,
            ",",
            header=0,
            rows=500000,
            time_attr="Complete_Timestamp",
            trace_attr="Case_ID",
            values=train_log.values,
        )
        edbn.test(
            test_log,
            scores_path,
            model,
            label="Anomaly",
            normal_val="0",
            train_data=train_log,
        )

        # Plot the curves based on the results.
        plot.plot_single_roc_curve(scores_path, title="BPIC15_%i" % (idx))
        plot.plot_single_prec_recall_curve(scores_path,
                                           title="BPIC15_%i" % (idx))

    out_files = ["../Data/output2_%i.csv" % (n) for n in range(1, 6)]
    labels = ["MUNIS_%i" % (n) for n in range(1, 6)]
    plot.plot_compare_roc_curve(out_files, labels, "BPIC15 Comparison")
    plot.plot_compare_prec_recall_curve(out_files, labels, "BPIC15 Comparison")
Esempio n. 12
0
            train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            generator.create_shipment_data(10000, 10000, anoms_rates[i][0],
                                           anoms_rates[i][1], train_file,
                                           test_file)

            dict_dict = []
            utils.convert2ints(train_file, train_file + "_ints", True,
                               dict_dict)
            test_length = utils.convert2ints(test_file, test_file + "_ints",
                                             True, dict_dict)

            model = edbn.train(train_file + "_ints",
                               "Case",
                               "Anomaly",
                               1,
                               0,
                               1000000,
                               ignore=["Anomaly"])
            edbn.test(test_file + "_ints",
                      path + "Output_%i_%i.csv" % anoms_rates[i],
                      model,
                      "Anomaly",
                      1,
                      ",",
                      test_length,
                      skip=0)

            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
            output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]
Esempio n. 13
0
import pandas as pd
import eDBN.Execute as edbn
from LogFile import LogFile
import ConceptDrift as cd

# Train on a small prefix of the BPIC 2019 log (the fourth argument, 1000, is
# presumably the row limit - confirm against LogFile).
train_data = LogFile(
    "../Data/bpic2019.csv", ",", 0, 1000, "startTime", "case", "event"
)
model = edbn.train(train_data)

# Load a larger portion of the same file for scoring, built with the
# `values` of the training log.
test_data = LogFile(
    "../Data/bpic2019.csv", ",", 0, 100000, "startTime", "case", "event",
    values=train_data.values,
)

# Score the events and plot the scores and p-values.
scores = cd.get_event_scores(test_data, model)
cd.plot_single_scores(scores)
cd.plot_pvalues(scores, 400)