Example #1
0
from grm import GRM, preprocessing
from pm4py.objects.log.util import sampling

# Load the raw event log from a CSV export.
event_log = preprocessing.import_data("data",
                                      "BPI2020_PermitLog.csv",
                                      separator=";",
                                      quote='',
                                      case_id="Case ID",
                                      activity="Activity",
                                      time_stamp="Complete Timestamp",
                                      target="(case) Overspent")

# Build a fresh GRM (gated graph neural network) model on the full log.
train_params = {'num_epochs': 1}
model = GRM.GRM(event_log, params=train_params)

# Fit the GGNN.
model.train()

# Evaluate the trained model and report its metrics.
metrics = model.testing_log()
print(metrics)

# Render a directly-follows graph from a 100-trace sample of the log.
sample_log = sampling.sample(event_log, n=100)
model.visualize_dfg(save_file=False,
                    log=sample_log,
                    file_name="multi")
Example #2
0
# Hyper-parameter setup; k is a cross-validation fold count (unused below).
params = {'num_epochs': 1000}
k = 10

# Read the event log from disk; column names come from variables defined
# elsewhere in the surrounding script.
event_log = preprocessing.import_data("../data",
                                      log_file,
                                      separator=";",
                                      quote='"',
                                      case_id=name_of_case_id,
                                      activity=name_of_activity,
                                      time_stamp=name_of_timestamp,
                                      target=name_of_label)

# Re-instantiate the GGNN model from a previously persisted checkpoint.
model_file = "../predictive_quality/logged_models/2020-07-11-08-12_best_model.pickle"
restored_model = GRM.GRM(event_log,
                         get_activities(event_log),
                         restore_file=model_file,
                         params=params)

# Full process model as a directly-follows graph.
restored_model.visualize_dfg(save_file=True,
                             log=event_log,
                             file_name="bpi2020_all_",
                             variant="all")

# Same graph, restricted to the five most relevant activities.
restored_model.visualize_dfg(save_file=True,
                             log=event_log,
                             file_name="bpi2020_5_",
                             variant="all",
                             topK=5)
# create process model with filter (top 10 most relevant)
# Hyper-parameters for the GRM model; k looks like a CV fold count but is
# not used in this snippet.
hyper_params = {'num_epochs': 1000}
k = 10

# Import the event log; `logfile` and the column-name variables are defined
# elsewhere in the surrounding script.
log = preprocessing.import_data("../data",
                                logfile,
                                separator=";",
                                quote='',
                                case_id=name_of_case_id,
                                activity=name_of_activity,
                                time_stamp=name_of_timestamp,
                                target=name_of_label)

# Restore a trained GRM model from a pickled checkpoint so each trace's
# most relevant activity can be identified and filtered out below.
model_path = '../best_models/sp2020/2020-05-05-14-59_best_model.pickle'
activities = get_activities(log)
grm_model = GRM.GRM(log, activities, restore_file=model_path)

# Accumulator for the traces produced after filtering.
filtered_log = EventLog()
for trace in log:
    case_id, pred, rel_scores = grm_model.predict(trace)
    if len(rel_scores) > 1:
        most_relevant = max(rel_scores.items(), key=operator.itemgetter(1))[0]
        log_trace = attributes_filter.apply_events(
            log, [case_id],
            parameters={
                attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
                name_of_case_id,
                "positive": True
            })

        trace_without_most = attributes_filter.apply_events(
Example #4
0
def run_experiment(data_raw,
                   hyper_params=None,
                   k=10,
                   ml_flow_uri="databricks",
                   ml_flow_exp="/Shared/grm-review",
                   ml_flow_run_name_prefix="Experiment",
                   save_artifact=True):
    """
    Run a k-fold cross-validated GRM training experiment, logged to MLflow.

    :param data_raw: raw event log (an indexable sequence of traces).
    :param hyper_params: dict of hyper-parameters forwarded to GRM.GRM;
        each entry is also logged as an MLflow param. May be None.
    :param k: number of folds for cross-validation.
    :param ml_flow_uri: MLflow tracking server URI (e.g. "databricks").
    :param ml_flow_exp: MLflow experiment name/path to log runs under.
    :param ml_flow_run_name_prefix: prefix for the parent run's name; a
        UUID is appended to keep run names unique.
    :param save_artifact: set False to skip uploading the trained model
        file (useful when the artifact would exceed ~1 GB).
    :return: None.
    """

    # init ml flow
    mlflow.set_tracking_uri(ml_flow_uri)
    mlflow.set_experiment(ml_flow_exp)

    # The distinct activities of the log define the model's node set.
    activities = get_activities(data_raw)
    num_activities = len(activities)

    with mlflow.start_run(run_name=ml_flow_run_name_prefix + "_" +
                          str(uuid.uuid1())) as run:
        if hyper_params:
            for key, value in hyper_params.items():
                log_param(key, value)

        log_param("k", k)
        log_metric("number of activities", num_activities)

        # Per-metric lists of fold results, keyed by metric name.
        results_measures = {}
        i = 0

        # Perform k-fold cross-validation; shuffle so folds are unordered.
        kf = KFold(n_splits=k, shuffle=True)
        for train_idx, test_idx in kf.split(data_raw):
            i += 1
            data_training = [data_raw[j] for j in train_idx]
            data_testing = [data_raw[j] for j in test_idx]

            with mlflow.start_run(nested=True,
                                  run_name="run_%d" % i) as run_cv:
                print("Starting Run " + str(i))

                # Fresh GGNN model per fold, trained from scratch.
                grm_model = GRM.GRM(data_training,
                                    activities,
                                    restore_file=None,
                                    params=hyper_params)
                grm_model.train()

                # Evaluate on the held-out fold and log every metric.
                measures = grm_model.testing_log(data_testing)
                for key, value in measures.items():
                    log_metric(key, value, i)
                    # setdefault replaces the membership-test/pass/else
                    # accumulation idiom of the original.
                    results_measures.setdefault(key, []).append(value)
                    print(key + " of run " + str(i) + ": " +
                          str(round(value, 3)))

                if save_artifact:
                    log_artifact(grm_model.best_model_file)
                log_artifact('../results/cm.pdf')

        # Aggregate per-fold metrics into overall means.
        for key, values in results_measures.items():
            overall_measure = mean(values)
            log_metric(key, overall_measure)
            print("Overall " + key + ": " + str(overall_measure))

        # NOTE(review): assumes testing_log() always reports an "accuracy"
        # metric — this raises KeyError otherwise; confirm against GRM.
        overall_st_dev = stdev(results_measures["accuracy"])
        log_metric("st_dev", overall_st_dev)
        print("Standard deviation: " + str(overall_st_dev))

        """ Relevance visualisation for one instance """
        # NOTE(review): grm_model is the model from the LAST fold only, not
        # one trained on the full data — confirm this is intended.
        single_instance_log = sampling.sample(data_raw, n=1)

        # Visualization as direct follower graph (DFG) with evaluation data
        filenames = grm_model.visualize_dfg(save_file=True,
                                            log=single_instance_log,
                                            file_name="single")
        for file in filenames:
            log_artifact(file)

        """ Relevance visualisation for 1000 instances """
        # Extract 1000 instances from the event log
        multi_instance_log = sampling.sample(data_raw, n=1000)

        # Visualization as DFG (with evaluation data)
        for file in grm_model.visualize_dfg(save_file=True,
                                            log=multi_instance_log,
                                            file_name="multi"):
            log_artifact(file)