def test_applyAlphaMinerToCSV(self):
    # unittest requires these to be instance methods; touching self
    # avoids static-method warnings from linters
    self.dummy_variable = "dummy_value"
    # mine the same CSV twice and compare the resulting Petri nets to
    # verify that independent miner instances behave consistently
    csv_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    log1, net1, marking1, fmarking1 = self.obtainPetriNetThroughAlphaMiner(csv_path)
    log2, net2, marking2, fmarking2 = self.obtainPetriNetThroughAlphaMiner(csv_path)
    # normalize both logs the same way: sort, sample, index traces
    log1 = index_attribute.insert_trace_index_as_event_attribute(
        sampling.sample(sorting.sort_timestamp(log1)))
    log2 = index_attribute.insert_trace_index_as_event_attribute(
        sampling.sample(sorting.sort_timestamp(log2)))
    # round-trip the first net through the PNML exporter
    pnml_path = os.path.join(OUTPUT_DATA_DIR, "running-example.pnml")
    petri_exporter.export_net(net1, marking1, pnml_path)
    os.remove(pnml_path)
    # both mining runs must produce structurally equal nets
    self.assertEqual(len(net1.places), len(net2.places))
    self.assertEqual(len(net1.transitions), len(net2.transitions))
    self.assertEqual(len(net1.arcs), len(net2.arcs))
    # build a final marking from the sink places (no outgoing arcs)
    final_marking = petri.petrinet.Marking()
    for place in net1.places:
        if not place.out_arcs:
            final_marking[place] = 1
    # smoke-check token replay on the mined net
    aligned_traces = token_replay.apply_log(log1, net1, marking1, final_marking)
    self.assertEqual(aligned_traces, aligned_traces)
def test_importExportCSVtoCSV(self):
    # unittest requires instance methods; touching self avoids
    # static-method warnings
    self.dummy_variable = "dummy_value"
    # import the CSV as an event stream and normalize it
    stream = csv_importer.import_event_stream(
        os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    stream = sorting.sort_timestamp(stream)
    stream = sampling.sample(stream)
    stream = index_attribute.insert_event_index_as_event_attribute(stream)
    # convert to a trace log and normalize again at trace level
    log = log_conv_fact.apply(stream)
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    # export back to CSV, re-import, and check the round trip preserves size
    exported_path = os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv")
    exported_stream = log_conv_fact.apply(log, variant=log_conv_fact.TO_EVENT_STREAM)
    csv_exporter.export(exported_stream, exported_path)
    reimported_stream = csv_importer.import_event_stream(exported_path)
    reimported_log = log_conv_fact.apply(reimported_stream)
    self.assertEqual(len(log), len(reimported_log))
    os.remove(exported_path)
def test_importExportCSVtoCSV(self):
    # unittest requires instance methods; touching self avoids
    # static-method warnings
    self.dummy_variable = "dummy_value"
    export_path = os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv")
    # read the CSV with pandas and convert it to an event stream
    source_df = pd.read_csv(os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    source_df = dataframe_utils.convert_timestamp_columns_in_df(source_df)
    stream = log_conversion.apply(source_df, variant=log_conversion.TO_EVENT_STREAM)
    stream = sorting.sort_timestamp(stream)
    stream = sampling.sample(stream)
    stream = index_attribute.insert_event_index_as_event_attribute(stream)
    # convert to a trace log and normalize at trace level
    log = log_conversion.apply(stream)
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    # export the log back to CSV via a dataframe
    round_trip_stream = log_conversion.apply(log, variant=log_conversion.TO_EVENT_STREAM)
    export_df = log_conversion.apply(round_trip_stream, variant=log_conversion.TO_DATA_FRAME)
    export_df.to_csv(export_path)
    # re-import and check the round trip preserves the number of traces
    reimported_df = pd.read_csv(export_path)
    reimported_df = dataframe_utils.convert_timestamp_columns_in_df(reimported_df)
    reimported_stream = log_conversion.apply(reimported_df, variant=log_conversion.TO_EVENT_STREAM)
    reimported_log = log_conversion.apply(reimported_stream)
    self.assertEqual(len(log), len(reimported_log))
    os.remove(export_path)
def test_importExportCSVtoXES(self):
    # unittest requires instance methods; touching self avoids
    # static-method warnings
    self.dummy_variable = "dummy_value"
    # import the CSV as an event stream and normalize it
    stream = csv_importer.import_event_stream(
        os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    stream = sorting.sort_timestamp(stream)
    stream = sampling.sample(stream)
    stream = index_attribute.insert_event_index_as_event_attribute(stream)
    # lift the stream to a trace log and normalize at trace level
    log = log_transform.transform_event_stream_to_event_log(stream)
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    # export to XES, re-import, and check the round trip preserves size
    xes_path = os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes")
    xes_exporter.export_log(log, xes_path)
    reimported_log = xes_importer.import_log(xes_path)
    self.assertEqual(len(log), len(reimported_log))
    os.remove(xes_path)
def select_attributes_from_log_for_tree(log, max_cases_for_attr_selection=DEFAULT_MAX_CASES_FOR_ATTR_SELECTION,
                                        max_diff_occ=DEFAULT_MAX_CASES_FOR_ATTR_SELECTION / 4):
    """
    Select trace and event attributes of the log that are suitable as
    decision-tree features: numeric attributes are always kept, string
    attributes only when they have fewer than max_diff_occ distinct values.

    Parameters
    ------------
    log
        Log
    max_cases_for_attr_selection
        Maximum number of cases to consider for attribute selection
    max_diff_occ
        Maximum number of different occurrences admitted for a string attribute

    Returns
    ------------
    string_trace_attributes_to_consider
        String trace attributes to consider
    string_event_attributes_to_consider
        String event attributes to consider
    numeric_trace_attributes_to_consider
        Numeric trace attributes to consider
    numeric_event_attributes_to_consider
        Numeric event attributes to consider
    """
    # attribute *names* are discovered on a sample (cheap); attribute *values*
    # are still collected on the full log below
    if len(log) > max_cases_for_attr_selection:
        filtered_log = sampling.sample(log, max_cases_for_attr_selection)
    else:
        filtered_log = log
    event_attributes = get_all_event_attributes_from_log(filtered_log)
    trace_attributes = get_all_trace_attributes_from_log(filtered_log)

    event_attributes_values = {attr: set(get_attribute_values(log, attr).keys())
                               for attr in event_attributes}
    trace_attributes_values = {attr: set(get_trace_attribute_values(log, attr).keys())
                               for attr in trace_attributes}

    numeric_event_attributes_to_consider = list()
    string_event_attributes_to_consider = list()
    numeric_trace_attributes_to_consider = list()
    string_trace_attributes_to_consider = list()

    for attr, values in event_attributes_values.items():
        if not values:
            # BUG FIX: the original did list(values)[0] unconditionally and
            # raised IndexError for an attribute with no observed values
            continue
        # hoist the representative value instead of rebuilding list(values)
        # on every type check (exact-type checks kept on purpose: bool must
        # not be treated as numeric, which isinstance would allow)
        sample_value = next(iter(values))
        if type(sample_value) is int or type(sample_value) is float:
            numeric_event_attributes_to_consider.append(attr)
        elif type(sample_value) is str and len(values) < max_diff_occ:
            string_event_attributes_to_consider.append(attr)

    for attr, values in trace_attributes_values.items():
        if not values:
            continue
        sample_value = next(iter(values))
        if type(sample_value) is int or type(sample_value) is float:
            numeric_trace_attributes_to_consider.append(attr)
        elif type(sample_value) is str and len(values) < max_diff_occ:
            string_trace_attributes_to_consider.append(attr)

    # keep only attributes that are actually present on every trace/event
    numeric_event_attributes_to_consider = check_event_attributes_presence(log, numeric_event_attributes_to_consider)
    string_event_attributes_to_consider = check_event_attributes_presence(log, string_event_attributes_to_consider)
    numeric_trace_attributes_to_consider = check_trace_attributes_presence(log, numeric_trace_attributes_to_consider)
    string_trace_attributes_to_consider = check_trace_attributes_presence(log, string_trace_attributes_to_consider)

    return string_trace_attributes_to_consider, string_event_attributes_to_consider, numeric_trace_attributes_to_consider, numeric_event_attributes_to_consider
def test_alphaMinerVisualizationFromXES(self):
    # unittest requires instance methods; touching self avoids
    # static-method warnings
    self.dummy_variable = "dummy_value"
    # mine a Petri net from the XES log with the alpha miner
    log, net, marking, fmarking = self.obtainPetriNetThroughAlphaMiner(
        os.path.join(INPUT_DATA_DIR, "running-example.xes"))
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    # round-trip the net through the PNML exporter
    pnml_path = os.path.join(OUTPUT_DATA_DIR, "running-example.pnml")
    petri_exporter.apply(net, marking, pnml_path)
    os.remove(pnml_path)
    # smoke-check the graphviz visualization
    gviz = pn_viz.graphviz_visualization(net)
    self.assertEqual(gviz, gviz)
    # BUG FIX: the original also built a final marking from the sink places
    # but never used it (the replay below uses the miner-provided fmarking);
    # that dead computation has been removed
    aligned_traces = token_replay.apply(log, net, marking, fmarking)
    self.assertEqual(aligned_traces, aligned_traces)
def sample_cases(log: Union[EventLog, pd.DataFrame], num_cases: int) -> Union[EventLog, pd.DataFrame]:
    """
    (Random) Sample a given number of cases from the event log.

    Parameters
    ---------------
    log
        Event log / Pandas dataframe
    num_cases
        Number of cases to sample

    Returns
    ---------------
    sampled_log
        Sampled event log (containing the specified amount of cases)

    Raises
    ---------------
    TypeError
        If ``log`` is neither an EventLog nor a pandas DataFrame.
    """
    if isinstance(log, EventLog):
        from pm4py.objects.log.util import sampling
        return sampling.sample(log, num_cases)
    elif isinstance(log, pd.DataFrame):
        from pm4py.objects.log.util import dataframe_utils
        return dataframe_utils.sample_dataframe(
            log, parameters={"max_no_cases": num_cases})
    # BUG FIX: the original fell off the end and silently returned None for
    # unsupported types; fail loudly instead
    raise TypeError(
        "log must be an EventLog or a pandas DataFrame, got %s" % type(log).__name__)
from grm import GRM, preprocessing
from pm4py.objects.log.util import sampling

# load the raw event log from CSV
raw_log = preprocessing.import_data("data", "BPI2020_PermitLog.csv", separator=";", quote='',
                                    case_id="Case ID", activity="Activity",
                                    time_stamp="Complete Timestamp", target="(case) Overspent")

# build a fresh GRM model on the raw log
training_params = {'num_epochs': 1}
model = GRM.GRM(raw_log, params=training_params)

# train the GGNN model
model.train()

# evaluate the trained model and report the metrics
evaluation_metrics = model.testing_log()
print(evaluation_metrics)

# visualize a DFG on a 100-case sample of the evaluation data
sampled_log = sampling.sample(raw_log, n=100)
model.visualize_dfg(save_file=False, log=sampled_log, file_name="multi")
from grm import preprocessing, GRM
from grm.util import get_activities
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.util import constants
from pm4py.objects.log.util import sampling

# configuration: trained-model location and event-log column names
model_path = '../best_models/sp2020/2020-05-06-05-40_best_model.pickle'
logfile = "sp2020.csv"
case_id_column = "CASE_ID"
activity_column = "ACTIVITY"
timestamp_column = "TIMESTAMP"
label_column = "REPAIR_IN_TIME_5D"

# import the event log
event_log = preprocessing.import_data("data", logfile, separator=";", quote='"',
                                      case_id=case_id_column, activity=activity_column,
                                      time_stamp=timestamp_column, target=label_column)

# restore the trained GRM model for this log's activities
grm_model = GRM.GRM(event_log, get_activities(event_log), restore_file=model_path)

# keep only positively labelled traces, then sample 5000 of them
event_log = attributes_filter.apply(
    event_log, [0],
    parameters={constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "label", "positive": True})
event_log = sampling.sample(event_log, n=5000)

# render the DFG visualization to file
grm_model.visualize_dfg(save_file=True, log=event_log, file_name="sp2020_", variant="all")
import os
from pm4py.objects.log.adapters.pandas import csv_import_adapter
from pm4py.objects.conversion.log import factory as conversion_factory

# import the CSV file as a pandas dataframe
dataframe = csv_import_adapter.import_dataframe_from_path(os.path.join(
    "pmdata/", "running-example.csv"), sep=',')
dataframe.head()
# BUG FIX: pandas DataFrames have no summary() method (that is R / pyspark);
# describe() is the pandas equivalent
dataframe.describe()

# convert the dataframe to an event log
log = conversion_factory.apply(dataframe)

from pm4py.objects.log.exporter.csv import factory as csv_exporter
# BUG FIX: the original exported the undefined name `event_stream`
# (NameError); export the converted log instead
csv_exporter.export(log, "data/outputFile1.csv")

# sorting the log by timestamp
from pm4py.objects.log.util import sorting
log = sorting.sort_timestamp(log)

# sorting by an arbitrary key (here: the trace name, ascending)
sorted_log = sorting.sort_lambda(log, lambda x: x.attributes["concept:name"], reverse=False)

# sampling 50 traces from the log
from pm4py.objects.log.util import sampling
sampled_log = sampling.sample(log, n=50)

# links
# http://www.processmining.org/event_logs_and_models_used_in_book
def run_experiment(data_raw, hyper_params=None, k=10, ml_flow_uri="databricks", ml_flow_exp="/Shared/grm-review",
                   ml_flow_run_name_prefix="Experiment", save_artifact=True):
    """
    Run a k-fold cross-validation experiment over the event log and track it with MLflow.

    :param data_raw: raw data from event log file.
    :param hyper_params: dict of hyper-parameters forwarded to the GRM model (optional).
    :param k: number of folds for cross-validation.
    :param ml_flow_uri: MLflow tracking URI (e.g. "databricks" or an HTTP endpoint).
    :param ml_flow_exp: MLflow experiment path under which the runs are logged.
    :param ml_flow_run_name_prefix: prefix for the generated MLflow run name.
    :param save_artifact: set False if the model artifact is too large to upload (>1GB).
    :return: none.
    """
    # init ml flow
    mlflow.set_tracking_uri(ml_flow_uri)
    mlflow.set_experiment(ml_flow_exp)

    # load event log
    activities = get_activities(data_raw)
    num_activities = len(activities)

    with mlflow.start_run(run_name=ml_flow_run_name_prefix + "_" + str(uuid.uuid1())) as run:
        if hyper_params:
            for key, value in hyper_params.items():
                log_param(key, value)
        log_param("k", k)
        log_metric("number of activities", num_activities)

        results_measures = dict()
        i = 0

        # Perform k-fold cross-validation
        kf = KFold(n_splits=k, shuffle=True)
        for train_idx, test_idx in kf.split(data_raw):
            i += 1
            data_training = [data_raw[j] for j in train_idx]
            data_testing = [data_raw[j] for j in test_idx]
            with mlflow.start_run(nested=True, run_name="run_%d" % i) as run_cv:
                print("Starting Run " + str(i))
                # Create new GGNN model object
                grm_model = GRM.GRM(data_training, activities, restore_file=None, params=hyper_params)
                # Train GGNN model
                grm_model.train()
                # Perform evaluation
                measures = grm_model.testing_log(data_testing)
                for key in measures.keys():
                    log_metric(key, measures[key], i)
                    # setdefault replaces the original if/else-pass accumulator init
                    results_measures.setdefault(key, []).append(measures[key])
                    print(key + " of run " + str(i) + ": " + str(round(measures[key], 3)))
                if save_artifact is True:
                    log_artifact(grm_model.best_model_file)
                log_artifact('../results/cm.pdf')

        # log the per-measure averages across folds
        for key in results_measures.keys():
            overall_measure = mean(results_measures[key])
            log_metric(key, overall_measure)
            print("Overall " + key + ": " + str(overall_measure))
        overall_st_dev = stdev(results_measures["accuracy"])
        log_metric("st_dev", overall_st_dev)
        print("Standard deviation: " + str(overall_st_dev))

        """ Relevance visualisation for one instance """
        # Extract one random instance from the log
        single_instance_log = sampling.sample(data_raw, n=1)
        # Visualization as direct follower graph (DFG) with evaluation data
        filenames = grm_model.visualize_dfg(save_file=True, log=single_instance_log, file_name="single")
        for file in filenames:
            log_artifact(file)

        """ Relevance visualisation for 1000 instances """
        # Extract 1000 instances from the event log
        multi_instance_log = sampling.sample(data_raw, n=1000)
        # Visualization as DFG (with evaluation data)
        for file in grm_model.visualize_dfg(save_file=True, log=multi_instance_log, file_name="multi"):
            log_artifact(file)