Example #1
def _test1():
    # Train an eDBN on the BPIC15_1 log and compare the accuracy of
    # predict_next_event and predict_next_event_update on the test split.
    data = "../Data/BPIC15_1_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data,
                      ",",
                      0,
                      None,
                      None,
                      case_attr,
                      activity_attr=act_attr,
                      convert=False,
                      k=5)
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    # logfile.filter_case_length(5)

    logfile.create_k_context()
    train_log, test_log = logfile.splitTrainTest(70,
                                                 case=True,
                                                 method="train-test")

    model = edbn_train(train_log)
    acc = predict_next_event(model, test_log)
    acc_update = predict_next_event_update(model, test_log)
    print("ACC:", acc, acc_update)
Example #2
def duration_test():
    # Generate synthetic logs for every (train, test) anomaly-rate combination,
    # train an eDBN on each, and record the ROC-AUC scores over several runs.
    path = "../Data/Experiments_Duration/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]
    anoms_rates = []
    for train_rate in train_rates:
        for test_rate in test_rates:
            anoms_rates.append((train_rate, test_rate))

    for i in range(len(anoms_rates)):
        print(anoms_rates[i])
        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            duration_generator.generate(10000, 10000, anoms_rates[i][0],
                                        anoms_rates[i][1], train_file,
                                        test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, "date", "trace")
            train_data.remove_attributes(["Anomaly"])
            test_data = LogFile(test_file,
                                ",",
                                0,
                                1000000,
                                "date",
                                "trace",
                                values=train_data.values)

            train_data.keep_attributes(
                ["event", "date", "trace", "process", "resource", "random"])

            train_data.create_k_context()
            train_data.add_duration_to_k_context()
            bins = train_data.discretize("duration_0")
            test_data.create_k_context()
            test_data.add_duration_to_k_context()
            test_data.discretize("duration_0", bins)

            model = edbn.train(train_data)
            edbn.test(test_data, path + "Output_%i_%i.csv" % anoms_rates[i],
                      model, "anomaly", "0")

            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
            output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]

            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" %
                       (anoms_rates[i][0], anoms_rates[i][1]))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" %
                       (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
Example #3
def run_sdl():
    # Outcome prediction on the labelled BPIC15_1 log with the SDL method:
    # train on 80% of the cases and print the score on the held-out split.
    from Methods.SDL.sdl import train, test

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"

    log = LogFile(labeled_logfile,
                  ";",
                  0,
                  None,
                  "time_timestamp",
                  "Case_ID",
                  activity_attr="label",
                  convert=True,
                  k=10)

    columns = [
        "label", "Case_ID", "Activity", "monitoringResource", "question",
        "org_resource", "Responsible_actor", "SUMleges"
    ]
    log.keep_attributes(columns)

    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")

    train_log.ignoreHistoryAttributes.add("label")
    test_log.ignoreHistoryAttributes.add("label")

    model = train(train_log, 200, 42)

    print(test(test_log, model))

    results1 = []
    results2 = []

    for case in test_log.get_cases():
        pass  # per-case evaluation is not implemented here; see the sketch below
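The per-case loop above is left empty; Example #6 below shows the pattern it would typically follow. A sketch only, assuming get_cases() yields (case id, events DataFrame) pairs as in Example #6:

for case in test_log.get_cases():
    case_df = case[1]                        # DataFrame with the events of one case
    for row in case_df.iterrows():
        label = getattr(row[1], "label")     # per-event attribute access, as in Example #6
        # ... accumulate per-case predictions into results1 / results2 here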
Example #4
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage):
    # Run the baseline next-event predictor with the given preprocessing options
    # and append the configuration and resulting accuracy to Baseline/results.txt.
    logfile = LogFile(data, ",", 0, None, None, "case",
                      activity_attr="event", convert=False, k=prefix_size)
    if add_end_event:
        logfile.add_end_events()
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()
    train_log, test_log = logfile.splitTrainTest(train_percentage, case=split_cases, method=split_method)

    with open("Baseline/results.txt", "a") as fout:
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: " + str(prefix_size))
        fout.write("\nEnd event: " + str(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: " + str(split_cases))
        fout.write("\nTrain percentage: " + str(train_percentage))
        fout.write("\nDate: " + time.strftime("%d.%m.%y-%H.%M", time.localtime()))
        fout.write("\n------------------------------------")

        baseline_acc = test(test_log, train(train_log, epochs=100, early_stop=10))
        fout.write("\nBaseline: " + str(baseline_acc))
        fout.write("\n")
        fout.write("====================================\n\n")
Example #5
if __name__ == "__main__":
    data = "../Data/BPIC15_1_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data,
                      ",",
                      0,
                      None,
                      "completeTime",
                      case_attr,
                      activity_attr=act_attr,
                      convert=False,
                      k=5)
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    weeks = logfile.split_days("%Y-%m-%d %H:%M:%S")
    weeks_sorted = sorted(weeks.keys())
    num_weeks = len(weeks_sorted)

    for i in range(num_weeks):
        weeks[weeks_sorted[i]]["model"] = edbn_train(
            weeks[weeks_sorted[i]]["data"])
    #
    # accs1 = []
    # for i in range(1, num_weeks):
    #     accs1.append(predict_next_event_multi([weeks[w]["model"] for w in weeks_sorted[:i]], weeks[weeks_sorted[i]]["data"]))
    #
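The commented block hints at an incremental evaluation in which each week is predicted from the models of all earlier weeks. A sketch of that loop, assuming predict_next_event_multi is imported and takes a list of models plus a log, as the commented call suggests:

accs = []
for i in range(1, num_weeks):
    # Predict week i using the models trained on all preceding weeks.
    earlier_models = [weeks[w]["model"] for w in weeks_sorted[:i]]
    accs.append(predict_next_event_multi(earlier_models, weeks[weeks_sorted[i]]["data"]))
print("Weekly accuracies:", accs)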
Example #6
def run_edbn():
    # Outcome prediction on the labelled BPIC15_1 log with an eDBN: sum the
    # per-event probabilities of both outcome labels over each case and report
    # the accuracy separately for ground-truth classes 1 and 2.
    from eDBN_Prediction import get_probabilities
    from Methods.EDBN.Train import train

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"

    log = LogFile(labeled_logfile,
                  ";",
                  0,
                  None,
                  "time_timestamp",
                  "Case_ID",
                  activity_attr="label",
                  convert=True,
                  k=1)

    columns = [
        "label", "Case_ID", "time_timestamp", "Activity", "monitoringResource",
        "question", "org_resource", "Responsible_actor", "SUMleges"
    ]
    log.keep_attributes(columns)

    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")

    train_log.ignoreHistoryAttributes.add("label")

    model = train(train_log)

    results1 = []
    results2 = []

    for case in test_log.get_cases():
        case_df = case[1]
        case_probs = {1: 1, 2: 1}
        ground = 0
        for row in case_df.iterrows():
            ground = getattr(row[1], "label")

            parents = model.variables["label"].conditional_table.parents

            value = []
            for parent in parents:
                value.append(getattr(row[1], parent.attr_name))
            tuple_val = tuple(value)

            activity_var = model.variables["label"]
            probs, unknown = get_probabilities(activity_var, tuple_val,
                                               parents)
            case_probs[1] += probs.get(1, 0)
            case_probs[2] += probs.get(2, 0)

        # correct_prob = sum(case_probs) / len(case_probs)
        if ground == 1:
            if case_probs[1] > case_probs[2]:
                results1.append(1)
            else:
                results1.append(0)

        if ground == 2:
            if case_probs[2] > case_probs[1]:
                results2.append(1)
            else:
                results2.append(0)

    print(len(results1), sum(results1) / len(results1))
    print(len(results2), sum(results2) / len(results2))
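As a purely illustrative follow-up, the two per-class result lists can also be pooled into a single overall accuracy:

all_results = results1 + results2
if all_results:
    print("Overall:", len(all_results), sum(all_results) / len(all_results))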
Example #7

if __name__ == "__main__":
    data = "../../Data/Helpdesk.csv"
    # data = "../../Data/BPIC15_1_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data,
                      ",",
                      0,
                      None,
                      "completeTime",
                      case_attr,
                      activity_attr=act_attr,
                      convert=False,
                      k=5)
    logfile.keep_attributes(["case", "event", "role", "completeTime"])
    logfile.convert2int()

    logfile.create_k_context()
    train_log, test_log = logfile.splitTrainTest(70,
                                                 case=True,
                                                 method="train-test")

    create_data(train_log, test_log, "helpdesk/")

    # model = train(train_log, 5, 20)
    # model = keras.models.load_model("premiere_model")
    # print("Accuracy:", test(test_log, model))
Example #8
def run_experiment(data,
                   prefix_size,
                   add_end_event,
                   split_method,
                   split_cases,
                   train_percentage,
                   filename="results.txt"):
    # Preprocess the given log once, then evaluate all prediction methods
    # (Tax, Taymouri, Camargo, Lin, Di Mauro, Pasquadibisceglie, EDBN, Baseline)
    # in parallel processes, appending the configuration and results to filename.
    data = DATA_FOLDER + data
    logfile = LogFile(data,
                      ",",
                      0,
                      None,
                      "completeTime",
                      "case",
                      activity_attr="event",
                      convert=False,
                      k=prefix_size)

    if prefix_size is None:
        prefix_size = max(logfile.data.groupby(logfile.trace).size())
        if prefix_size > 40:
            prefix_size = 40
    logfile.k = prefix_size

    if add_end_event:
        logfile.add_end_events()
    # logfile.keep_attributes(["case", "event", "role", "completeTime"])
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()
    train_log, test_log = logfile.splitTrainTest(train_percentage,
                                                 case=split_cases,
                                                 method=split_method)

    with open(filename, "a") as fout:
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: " + str(prefix_size))
        fout.write("\nEnd event: " + str(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: " + str(split_cases))
        fout.write("\nTrain percentage: " + str(train_percentage))
        fout.write("\nDate: " +
                   time.strftime("%d.%m.%y-%H.%M", time.localtime()))
        fout.write("\n------------------------------------\n")

    processes = []
    processes.append(
        Process(target=execute_tax,
                args=(train_log, test_log, filename),
                name="Tax"))
    processes.append(
        Process(target=execute_taymouri,
                args=(train_log, test_log, filename),
                name="Taymouri"))
    processes.append(
        Process(target=execute_camargo,
                args=(train_log, test_log, filename),
                name="Camargo"))
    processes.append(
        Process(target=execute_lin,
                args=(train_log, test_log, filename),
                name="Lin"))
    processes.append(
        Process(target=execute_dimauro,
                args=(train_log, test_log, filename),
                name="Di Mauro"))
    processes.append(
        Process(target=execute_pasquadibisceglie,
                args=(train_log, test_log, filename),
                name="Pasquadibisceglie"))
    processes.append(
        Process(target=execute_edbn,
                args=(train_log, test_log, filename),
                name="EDBN"))
    processes.append(
        Process(target=execute_baseline,
                args=(train_log, test_log, filename),
                name="Baseline"))
    # processes.append(Process(target=execute_new_method, args=(train_log, test_log, filename), name="New Method"))

    print("Starting Processes")
    for p in processes:
        p.start()
        print(p.name, "started")

    print("All processes running")

    for p in processes:
        p.join()
        print(p.name, "stopped")

    with open(filename, "a") as fout:
        fout.write("====================================\n\n")

    print("All processes stopped")
Example #9
def get_data(dataset, dataset_size, k, add_end, reduce_tasks, resource_pools, remove_resource):
    # Load and preprocess the requested dataset; the result is cached on disk so
    # repeated calls with the same settings reuse the pickled preprocessed log.
    filename_parts = [dataset, str(dataset_size), str(k)]
    for v in [add_end, reduce_tasks, resource_pools, remove_resource]:
        if v:
            filename_parts.append(str(1))
        else:
            filename_parts.append(str(0))
    print(filename_parts)
    cache_file = LOGFILE_PATH + "/" + "_".join(filename_parts)

    colTitles = []

    if os.path.exists(cache_file):
        print("Loading file from cache")
        with open(cache_file, "rb") as pickle_file:
            preprocessed_log = pickle.load(pickle_file)
    else:
        resource_attr = None
        if dataset == BPIC15_1 or dataset == BPIC15:
            logfile = LogFile("../Data/BPIC15_1_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp", "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_2:
            logfile = LogFile("../Data/BPIC15_2_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp", "Case ID",
                              activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_3:
            logfile = LogFile("../Data/BPIC15_3_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp", "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_4:
            logfile = LogFile("../Data/BPIC15_4_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp", "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_5:
            logfile = LogFile("../Data/BPIC15_5_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp", "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC12:
            logfile = LogFile("../Data/BPIC12.csv", ",", 0, dataset_size, "completeTime", "case", activity_attr="event", convert=False, k=k)
            resource_attr = "org:resource"
            colTitles = ["case", "event", "org:resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC12W:
            logfile = LogFile("../Data/BPIC12W.csv", ",", 0, dataset_size, "completeTime", "case", activity_attr="event", convert=False, k=k)
            resource_attr = "org:resource"
            colTitles = ["case", "event", "org:resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == HELPDESK:
            logfile = LogFile("../Data/Helpdesk.csv", ",", 0, dataset_size, "completeTime", "case", activity_attr="event", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["case", "event", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(3)
        elif dataset == BPIC18:
            logfile = LogFile("../Data/bpic2018.csv", ",", 0, dataset_size, "startTime", "case", activity_attr="event", convert=False, k=k)
            colTitles = ["case", "event", "subprocess"]
            logfile.keep_attributes(colTitles)
        else:
            print("Unknown Dataset")
            return None

        preprocessed_log = preprocess(logfile, add_end, reduce_tasks, resource_pools, resource_attr, remove_resource)

        preprocessed_log.create_k_context()
        with open(cache_file, "wb") as pickle_file:
            pickle.dump(preprocessed_log, pickle_file)
    return preprocessed_log, "_".join(filename_parts)
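The if/elif ladder repeats the same LogFile / keep_attributes / filter_case_length calls with different constants. A hedged sketch of a table-driven variant; the DATASETS mapping and load_dataset helper are introduced here for illustration and are not part of the original module:

# Hypothetical lookup table: path, time/case/activity columns, resource column,
# kept attributes and minimum case length (None = no length filter, as for BPIC18).
DATASETS = {
    HELPDESK: ("../Data/Helpdesk.csv", "completeTime", "case", "event",
               "Resource", ["case", "event", "Resource"], 3),
    BPIC12:   ("../Data/BPIC12.csv", "completeTime", "case", "event",
               "org:resource", ["case", "event", "org:resource"], 5),
    # ... remaining datasets follow the same pattern
}

def load_dataset(dataset, dataset_size, k):
    path, time_attr, case_attr, act_attr, resource_attr, cols, min_len = DATASETS[dataset]
    logfile = LogFile(path, ",", 0, dataset_size, time_attr, case_attr,
                      activity_attr=act_attr, convert=False, k=k)
    logfile.keep_attributes(cols)
    if min_len is not None:
        logfile.filter_case_length(min_len)
    return logfile, resource_attr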