def execute_script():
    """Demo: interval-log statistics (sojourn time, concurrency, EFG), frequency and
    performance DFG visualizations, and a Petri net converted from the frequency DFG."""
    xes_path = os.path.join("..", "tests", "input_data", "interval_event_log.xes")
    event_log = xes_importer.apply(xes_path)
    params = {
        constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY: "start_timestamp",
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: "time:timestamp",
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: "concept:name",
        "strict": False,
        "format": "svg",
    }
    # start/end activities are also fed back into the parameters for later calls
    params["start_activities"] = sa_get.get_start_activities(event_log, parameters=params)
    params["end_activities"] = ea_get.get_end_activities(event_log, parameters=params)
    soj_time = soj_time_get.apply(event_log, parameters=params)
    print("soj_time")
    print(soj_time)
    conc_act = conc_act_get.apply(event_log, parameters=params)
    print("conc_act")
    print(conc_act)
    efg = efg_get.apply(event_log, parameters=params)
    print("efg")
    print(efg)
    dfg_freq = dfg_algorithm.apply(event_log, parameters=params, variant=dfg_algorithm.Variants.FREQUENCY)
    dfg_perf = dfg_algorithm.apply(event_log, parameters=params, variant=dfg_algorithm.Variants.PERFORMANCE)
    gviz_freq = dfg_vis_fact.apply(dfg_freq, log=event_log, variant=dfg_vis_fact.Variants.FREQUENCY,
                                   parameters=params)
    dfg_vis_fact.view(gviz_freq)
    gviz_perf = dfg_vis_fact.apply(dfg_perf, log=event_log, variant=dfg_vis_fact.Variants.PERFORMANCE,
                                   parameters=params)
    dfg_vis_fact.view(gviz_perf)
    # convert the frequency DFG to an accepting Petri net and display it
    net, im, fm = dfg_conv.apply(dfg_freq)
    pn_gviz = pn_vis.apply(net, im, fm, parameters=params)
    pn_vis.view(pn_gviz)
def detect(log: EventLog, alphabet: Dict[str, int], act_key: str, use_msd: bool) -> Optional[str]:
    """
    Detect an activity that occurs in every trace and is concurrent to the rest
    of the log: the activity is projected away and the remaining log must still
    admit one of the inductive-miner cuts (sequence / xor / concurrent / loop).

    Parameters
    ----------
    log
        Event log
    alphabet
        Activities of the log (with counts)
    act_key
        Attribute to be used as activity key
    use_msd
        Whether minimum-self-distance witnesses are used for the concurrent cut

    Returns
    -------
    The concurrent activity, or None if no candidate works.
    """
    # a candidate must occur in every trace of the log
    candidates = set(alphabet.keys())
    for t in log:
        candidates = candidates.intersection(set(map(lambda e: e[act_key], t)))
        if len(candidates) == 0:
            return None
    for a in candidates:
        # project the log on everything except the candidate activity
        proj = EventLog()
        for t in log:
            proj.append(pm4py.filter_trace(lambda e: e[act_key] != a, t))
        # the projection is only usable if it did not create empty traces
        if len(list(filter(lambda t: len(t) == 0, proj))) == 0:
            dfg_proj = discover_dfg.apply(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            alphabet_proj = pm4py.get_attribute_values(proj, act_key)
            start_act_proj = get_starters.get_start_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            # BUG FIX: end activities were previously computed on the ORIGINAL log;
            # they must be computed on the projection, mirroring start_act_proj
            end_act_proj = get_ends.get_end_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            pre_proj, post_proj = dfg_utils.get_transitive_relations(dfg_proj, alphabet_proj)
            cut = sequence_cut.detect(alphabet_proj, pre_proj, post_proj)
            if cut is not None:
                return a
            cut = xor_cut.detect(dfg_proj, alphabet_proj)
            if cut is not None:
                return a
            # BUG FIX: the minimum self-distance for the concurrent cut is now
            # computed on the projected log as well (was the original log)
            cut = concurrent_cut.detect(
                dfg_proj, alphabet_proj, start_act_proj, end_act_proj,
                msd=msdw_algo.derive_msd_witnesses(
                    proj,
                    msd_algo.apply(proj, parameters={
                        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key}),
                    parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
                if use_msd else None)
            if cut is not None:
                return a
            cut = loop_cut.detect(dfg_proj, alphabet_proj, start_act_proj, end_act_proj)
            if cut is not None:
                return a
    return None
def discover_dfg(log):
    """
    Compute the directly-follows graph of an event log, together with its
    start and end activities.

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    dfg
        Directly-follows graph
    start_activities
        Start activities of the log
    end_activities
        End activities of the log
    """
    from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
    from pm4py.statistics.start_activities.log import get as start_activities_module
    from pm4py.statistics.end_activities.log import get as end_activities_module

    graph = dfg_discovery.apply(log)
    starters = start_activities_module.get_start_activities(log)
    enders = end_activities_module.get_end_activities(log)
    return graph, starters, enders
def apply(log, parameters=None):
    """
    Discovers a footprint object from an event log
    (the footprints of the event log are returned)

    Parameters
    --------------
    log
        Log
    parameters
        Parameters of the algorithm:
            - Parameters.ACTIVITY_KEY

    Returns
    --------------
    footprints_obj
        Footprints object (dict keyed by the Outputs enum values)
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    log = converter.apply(log, variant=converter.TO_EVENT_LOG, parameters=parameters)
    dfg = dfg_discovery.apply(log, parameters=parameters)
    # a pair of activities is parallel when the DFG contains both directions
    parallel = {pair for pair in dfg if (pair[1], pair[0]) in dfg}
    # sequence relation from the alpha causal relation over the DFG
    sequence = set(causal_discovery.apply(dfg, causal_discovery.Variants.CAUSAL_ALPHA))
    starters = set(get_start_activities.get_start_activities(log, parameters=parameters))
    enders = set(get_end_activities.get_end_activities(log, parameters=parameters))
    activities = {event[activity_key] for trace in log for event in trace}

    footprints = {}
    footprints[Outputs.DFG.value] = dfg
    footprints[Outputs.SEQUENCE.value] = sequence
    footprints[Outputs.PARALLEL.value] = parallel
    footprints[Outputs.START_ACTIVITIES.value] = starters
    footprints[Outputs.END_ACTIVITIES.value] = enders
    footprints[Outputs.ACTIVITIES.value] = activities
    # default=0 covers the empty-log case
    footprints[Outputs.MIN_TRACE_LENGTH.value] = min((len(t) for t in log), default=0)
    return footprints
def test_exporting_dfg_with_sa_ea(self):
    """Round-trip a DFG, with its start/end activities, through exporter and importer."""
    log = xes_importer.apply(os.path.join("input_data", "running-example.xes"))
    dfg = dfg_discovery.apply(log)
    sa = start_activities.get_start_activities(log)
    ea = end_activities.get_end_activities(log)
    target = os.path.join("test_output_data", "running-example.dfg")
    export_params = {
        dfg_exporter.Variants.CLASSIC.value.Parameters.START_ACTIVITIES: sa,
        dfg_exporter.Variants.CLASSIC.value.Parameters.END_ACTIVITIES: ea,
    }
    dfg_exporter.apply(dfg, target, parameters=export_params)
    dfg, sa, ea = dfg_importer.apply(target)
    os.remove(target)
def discover_dfg_miner(log):
    """Discover an accepting Petri net (net, initial marking, final marking)
    from a log using the DFG miner (DFG-to-Petri-net conversion)."""
    dfg = dfg_discovery.apply(log)
    sa = sa_get.get_start_activities(log)
    ea = ea_get.get_end_activities(log)
    conversion_params = {"start_activities": sa, "end_activities": ea}
    return dfg_converter.apply(dfg, parameters=conversion_params)
def apply_heu(log, parameters=None):
    """
    Discovers an Heuristics Net using Heuristics Miner

    Parameters
    ------------
    log
        Event log
    parameters
        Possible parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY
            - Parameters.CASE_ID_KEY
            - Parameters.DEPENDENCY_THRESH
            - Parameters.AND_MEASURE_THRESH
            - Parameters.MIN_ACT_COUNT
            - Parameters.MIN_DFG_OCCURRENCES
            - Parameters.DFG_PRE_CLEANING_NOISE_THRESH
            - Parameters.LOOP_LENGTH_TWO_THRESH

    Returns
    ------------
    heu
        Heuristics Net
    """
    if parameters is None:
        parameters = {}

    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    sa = log_sa_filter.get_start_activities(log, parameters=parameters)
    ea = log_ea_filter.get_end_activities(log, parameters=parameters)
    occurrences = log_attributes.get_attribute_values(log, act_key, parameters=parameters)
    dfg = dfg_alg.apply(log, parameters=parameters)
    # a second DFG with window 2 is needed for length-two-loop detection
    window_two_params = deepcopy(parameters)
    window_two_params["window"] = 2
    dfg_w2 = dfg_alg.apply(log, parameters=window_two_params)
    triples = dfg_alg.apply(log, parameters=parameters, variant=dfg_alg.Variants.FREQ_TRIPLES)
    return apply_heu_dfg(dfg, activities=list(occurrences.keys()),
                         activities_occurrences=occurrences,
                         start_activities=sa, end_activities=ea,
                         dfg_window_2=dfg_w2, freq_triples=triples,
                         parameters=parameters)
def apply_tree(log, parameters=None):
    """
    Apply the IMDF algorithm to a log obtaining a process tree

    Parameters
    ----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY -> attribute of the log to
            use as activity name (default concept:name)

    Returns
    ----------
    tree
        Process tree
    """
    if parameters is None:
        parameters = {}
    # work on a shallow copy: the previous implementation injected the default
    # activity key directly into the caller's dict, mutating a shared argument
    parameters = dict(parameters)
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    activity_key = parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY]

    # get the DFG, keeping only arcs observed at least once
    dfg = [(k, v) for k, v in dfg_inst.apply(
        log, parameters={
            pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key
        }).items() if v > 0]

    # gets the start activities from the log
    start_activities = log_start_act_stats.get_start_activities(log, parameters=parameters)
    # gets the end activities from the log
    end_activities = log_end_act_stats.get_end_activities(log, parameters=parameters)
    # get the activities in the log
    activities = log_attributes_stats.get_attribute_values(log, activity_key)

    # check if the log contains empty traces (reuse the already-computed lengths
    # instead of rebuilding the list as the original did)
    traces_length = [len(trace) for trace in log]
    contains_empty_traces = min(traces_length) == 0 if traces_length else False

    return apply_tree_dfg(dfg, parameters=parameters, activities=activities,
                          contains_empty_traces=contains_empty_traces,
                          start_activities=start_activities,
                          end_activities=end_activities)
def discover_abstraction_log(
        log: EventLog, parameters: Optional[Dict[Any, Any]] = None
) -> Tuple[Any, Any, Any, Any, Any, Any, Any]:
    """
    Discovers an abstraction from a log that is useful for the Heuristics Miner ++
    algorithm

    Parameters
    --------------
    log
        Event log
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.START_TIMESTAMP_KEY
            - Parameters.TIMESTAMP_KEY
            - Parameters.CASE_ID_KEY

    Returns
    --------------
    start_activities
        Start activities
    end_activities
        End activities
    activities_occurrences
        Activities along with their number of occurrences
    dfg
        Directly-follows graph
    performance_dfg
        (Performance) directly-follows graph
    sojourn_time
        Sojourn time for each activity
    concurrent_activities
        Concurrent activities
    """
    if parameters is None:
        parameters = {}

    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    sa = log_sa.get_start_activities(log, parameters=parameters)
    ea = log_ea.get_end_activities(log, parameters=parameters)
    occurrences = log_attributes.get_attribute_values(log, act_key, parameters=parameters)
    # the DFG is derived from the eventually-follows graph, keeping only the
    # first following occurrence of each activity
    efg_parameters = copy(parameters)
    efg_parameters[efg_get.Parameters.KEEP_FIRST_FOLLOWING] = True
    dfg = efg_get.apply(log, parameters=efg_parameters)
    performance_dfg = dfg_alg.apply(log, variant=dfg_alg.Variants.PERFORMANCE,
                                    parameters=parameters)
    sojourn_time = soj_get.apply(log, parameters=parameters)
    concurrent_activities = conc_act_get.apply(log, parameters=parameters)
    return (sa, ea, occurrences, dfg, performance_dfg, sojourn_time,
            concurrent_activities)
def apply_tree(log, parameters=None):
    """
    Apply the IMDF algorithm to a log_skeleton obtaining a process tree

    Parameters
    ----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log_skeleton to use as
            activity name (default concept:name)

    Returns
    ----------
    tree
        Process tree
    """
    if parameters is None:
        parameters = {}

    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                         pmutil.xes_constants.DEFAULT_NAME_KEY)
    # keep only directly-follows pairs that were observed at least once
    dfg = [(pair, count) for pair, count in dfg_inst.apply(log, parameters=parameters).items()
           if count > 0]
    # start/end activities of the log_skeleton
    sa = log_start_act_stats.get_start_activities(log, parameters=parameters)
    ea = log_end_act_stats.get_end_activities(log, parameters=parameters)
    # activities occurring in the log_skeleton
    acts = log_attributes_stats.get_attribute_values(log, act_key)
    # an empty trace is present iff the minimum trace length is zero
    trace_lengths = [len(trace) for trace in log]
    contains_empty_traces = bool(trace_lengths) and min(trace_lengths) == 0

    return apply_tree_dfg(dfg, parameters=parameters, activities=acts,
                          contains_empty_traces=contains_empty_traces,
                          start_activities=sa, end_activities=ea)
def gerar_previsoes_modelo_from_log_eventos(eventLog):
    """
    Build process-termination forecasts from an event log.

    A performance DFG is discovered from the log and converted into a CTMC
    (tangible reachability graph + Q matrix); a transient analysis is then run
    for a set of growing time horizons, and for each horizon the probability
    mass on the "sink1" state (process completed) is collected.

    Parameters
    ----------
    eventLog
        Event log

    Returns
    -------
    List of dicts with keys "intervaloEmDias" (horizon in days) and
    "probabilidadeDeTermino" (completion probability at that horizon).
    """
    dfg_perf = dfg_discovery.apply(eventLog, variant=dfg_discovery.Variants.PERFORMANCE)
    sa = start_activities.get_start_activities(eventLog)
    ea = end_activities.get_end_activities(eventLog)
    reach_graph, tang_reach_graph, stochastic_map, q_matrix = \
        ctmc.get_tangible_reachability_and_q_matrix_from_dfg_performance(
            dfg_perf, parameters={
                "start_activities": sa,
                "end_activities": ea
            })

    one_day_in_seconds = 60 * 60 * 24
    # horizons: 30/60/90/180 days, then 1..10 years
    intervalos = [one_day_in_seconds * d for d in (30, 60, 90, 180)]
    intervalos += [one_day_in_seconds * 365 * y for y in range(1, 11)]

    previsoes_por_intervalo = []
    # pick the source state of the tangible reachability graph
    # (the original also looked up the "sink1" state here but never used it;
    # the sink is matched by name inside the loop below)
    initial_state = next(x for x in tang_reach_graph.states if x.name == "source1")

    for intervalo in intervalos:
        # distribution over the states of the system, starting from the source,
        # after `intervalo` seconds
        transient_result = ctmc.transient_analysis_from_tangible_q_matrix_and_single_state(
            tang_reach_graph, q_matrix, initial_state, intervalo)
        for key, value in filter(lambda elem: elem[0].name == "sink1",
                                 transient_result.items()):
            previsoes_por_intervalo.append({
                "intervaloEmDias": intervalo / one_day_in_seconds,
                "probabilidadeDeTermino": float(value)
            })
    return previsoes_por_intervalo
def apply_auto_filter(log, variants=None, parameters=None):
    """
    Apply a START activities filter, detecting the admissible start activities
    automatically from a percentage/decreasing factor.

    NOTE(review): the original docstring called this an "end attributes filter",
    but the code below clearly operates on start activities
    (get_start_activities / sathreshold / filter_log_by_start_activities).

    Parameters
    ----------
    log
        Log
    variants
        (If specified) Dictionary with variant as the key and the list of traces
        as the value
    parameters
        Parameters of the algorithm, including:
            Parameters.DECREASING_FACTOR -> Decreasing factor (stops the algorithm
                when the next activity by occurrence is below this factor in
                comparison to the previous one)
            Parameters.ACTIVITY_KEY -> Attribute key (must be specified if
                different from concept:name); note the code reads
                Parameters.ACTIVITY_KEY, not a separate ATTRIBUTE_KEY

    Returns
    ---------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    attribute_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                               DEFAULT_NAME_KEY)
    decreasing_factor = exec_utils.get_param_value(
        Parameters.DECREASING_FACTOR, parameters, DECREASING_FACTOR)

    parameters_variants = {
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: attribute_key
    }
    # the variants dictionary is computed only if not supplied by the caller
    if variants is None:
        variants = variants_filter.get_variants(log,
                                                parameters=parameters_variants)
    vc = variants_filter.get_variants_sorted_by_count(variants)
    # rank start activities by frequency and derive the cut-off threshold
    start_activities = get_start_activities(log,
                                            parameters=parameters_variants)
    salist = start_activities_common.get_sorted_start_activities_list(
        start_activities)
    sathreshold = start_activities_common.get_start_activities_threshold(
        salist, decreasing_factor)
    filtered_log = filter_log_by_start_activities(start_activities, variants,
                                                  vc, sathreshold,
                                                  attribute_key)
    return filtered_log
def apply_heu(log, parameters=None):
    """
    Discovers an Heuristics Net using Heuristics Miner

    Parameters
    ------------
    log
        Event log
    parameters
        Possible parameters of the algorithm, including: activity_key,
        case_id_glue, timestamp_key, dependency_thresh, and_measure_thresh,
        min_act_count, min_dfg_occurrences, dfg_pre_cleaning_noise_thresh,
        loops_length_two_thresh

    Returns
    ------------
    heu
        Heuristics Net
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters.get(constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)
    sa = log_sa_filter.get_start_activities(log, parameters=parameters)
    ea = log_ea_filter.get_end_activities(log, parameters=parameters)
    occurrences = log_attributes.get_attribute_values(log, activity_key,
                                                      parameters=parameters)
    dfg = dfg_factory.apply(log, parameters=parameters)
    # a window-2 DFG is needed for length-two-loop detection
    window_two_params = deepcopy(parameters)
    window_two_params["window"] = 2
    dfg_window_2 = dfg_factory.apply(log, parameters=window_two_params)
    freq_triples = dfg_factory.apply(log, parameters=parameters,
                                     variant="freq_triples")
    return apply_heu_dfg(dfg, activities=list(occurrences.keys()),
                         activities_occurrences=occurrences,
                         start_activities=sa, end_activities=ea,
                         dfg_window_2=dfg_window_2, freq_triples=freq_triples,
                         parameters=parameters)
def apply_cut_im_plain(self, type_of_cut, cut, activity_key):
    """
    Apply the detected cut: split the log accordingly and create one
    SubtreeInfrequent child (with its own DFG / activities / start / end
    activities) per resulting sub-log.

    Parameters
    ----------
    type_of_cut
        One of 'concurrent', 'sequential', 'parallel', 'loopCut'
    cut
        Cut result; cut[1] holds the activity partition used to split the log
    activity_key
        Attribute to be used as activity key
    """
    def _append_children(sublogs):
        # Factored out of the four identical per-branch loops of the original:
        # build a SubtreeInfrequent child for every sub-log of the split.
        for sublog in sublogs:
            # DFG restricted to arcs observed at least once in the sub-log
            child_dfg = [(k, v) for k, v in
                         dfg_inst.apply(sublog, parameters=self.parameters).items() if v > 0]
            acts = attributes_get.get_attribute_values(sublog, activity_key)
            sa = list(start_activities_get.get_start_activities(
                sublog, parameters=self.parameters).keys())
            ea = list(end_activities_get.get_end_activities(
                sublog, parameters=self.parameters).keys())
            self.children.append(
                SubtreeInfrequent(sublog, child_dfg, self.master_dfg, self.initial_dfg,
                                  acts, self.counts, self.rec_depth + 1, self.f,
                                  noise_threshold=self.noise_threshold,
                                  start_activities=sa,
                                  end_activities=ea,
                                  initial_start_activities=self.initial_start_activities,
                                  initial_end_activities=self.initial_end_activities,
                                  parameters=self.parameters))

    # per-branch ordering of `detected_cut` assignment vs. splitting is kept
    # exactly as in the original implementation
    if type_of_cut == 'concurrent':
        self.detected_cut = 'concurrent'
        _append_children(split.split_xor(cut[1], self.log, activity_key))
    elif type_of_cut == 'sequential':
        new_logs = split.split_sequence(cut[1], self.log, activity_key)
        self.detected_cut = "sequential"
        _append_children(new_logs)
    elif type_of_cut == 'parallel':
        new_logs = split.split_parallel(cut[1], self.log, activity_key)
        self.detected_cut = "parallel"
        _append_children(new_logs)
    elif type_of_cut == 'loopCut':
        new_logs = split.split_loop(cut[1], self.log, activity_key)
        self.detected_cut = "loopCut"
        _append_children(new_logs)
def apply(log, net, marking, final_marking, parameters=None):
    """
    Get Align-ET Conformance precision

    Precision is computed as 1 - (escaping edges / activated transitions),
    where activated transitions are collected from the markings reached by
    aligning each log prefix against the model.

    Parameters
    ----------
    log
        Trace log
    net
        Petri net
    marking
        Initial marking
    final_marking
        Final marking
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Activity key
            "debug_level" -> verbosity of the diagnostic prints (0 = silent)

    Raises
    ------
    Exception
        If the net is not relaxed sound (alignment would not be well-defined).
    """
    if parameters is None:
        parameters = {}

    debug_level = parameters["debug_level"] if "debug_level" in parameters else 0

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              log_lib.util.xes.DEFAULT_NAME_KEY)

    # default value for precision, when no activated transitions (not even by looking at the initial marking) are found
    precision = 1.0
    sum_ee = 0   # total escaping edges (weighted by prefix occurrence count)
    sum_at = 0   # total activated transitions (weighted by prefix occurrence count)
    unfit = 0    # number of prefix occurrences that could not be aligned

    if not petri.check_soundness.check_relaxed_soundness_net_in_fin_marking(
            net, marking, final_marking):
        raise Exception(
            "trying to apply Align-ETConformance on a Petri net that is not a relaxed sound net!!"
        )

    # one fake one-trace log per distinct prefix of the original log
    prefixes, prefix_count = precision_utils.get_log_prefixes(log, activity_key=activity_key)
    prefixes_keys = list(prefixes.keys())
    fake_log = precision_utils.form_fake_log(prefixes_keys, activity_key=activity_key)

    # align each prefix and map the reached markings back to the original net
    align_stop_marking = align_fake_log_stop_marking(fake_log, net, marking, final_marking,
                                                     parameters=parameters)
    all_markings = transform_markings_from_sync_to_original_net(align_stop_marking, net,
                                                                parameters=parameters)

    for i in range(len(prefixes)):
        markings = all_markings[i]

        if markings is not None:
            log_transitions = set(prefixes[prefixes_keys[i]])
            activated_transitions_labels = set()
            for m in markings:
                # add to the set of activated transitions in the model the activated transitions
                # for each prefix
                activated_transitions_labels = activated_transitions_labels.union(
                    x.label for x in
                    utils.get_visible_transitions_eventually_enabled_by_marking(net, m)
                    if x.label is not None)
            escaping_edges = activated_transitions_labels.difference(log_transitions)

            sum_at += len(activated_transitions_labels) * prefix_count[prefixes_keys[i]]
            sum_ee += len(escaping_edges) * prefix_count[prefixes_keys[i]]

            if debug_level > 1:
                print("")
                print("prefix=", prefixes_keys[i])
                print("log_transitions=", log_transitions)
                print("activated_transitions=", activated_transitions_labels)
                print("escaping_edges=", escaping_edges)
        else:
            # the alignment failed for this prefix: count it as unfit
            unfit += prefix_count[prefixes_keys[i]]

    if debug_level > 0:
        print("\n")
        print("overall unfit", unfit)
        print("overall activated transitions", sum_at)
        print("overall escaping edges", sum_ee)

    # fix: also the empty prefix should be counted!
    start_activities = set(get_start_activities(log, parameters=parameters))
    trans_en_ini_marking = set([x.label for x in
                                get_visible_transitions_eventually_enabled_by_marking(
                                    net, marking)])
    diff = trans_en_ini_marking.difference(start_activities)
    sum_at += len(log) * len(trans_en_ini_marking)
    sum_ee += len(log) * len(diff)
    # end fix

    if sum_at > 0:
        precision = 1 - float(sum_ee) / float(sum_at)

    return precision
def apply(log, net, marking, final_marking, parameters=None):
    """
    Get ET Conformance precision

    Precision is computed as 1 - (escaping edges / activated transitions),
    using token-based replay of the log prefixes on the model.

    Parameters
    ----------
    log
        Trace log
    net
        Petri net
    marking
        Initial marking
    final_marking
        Final marking
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Activity key
            Parameters.TOKEN_REPLAY_VARIANT -> token replay variant to use
            Parameters.CLEANING_TOKEN_FLOOD -> whether token flood cleaning is applied
    """
    if parameters is None:
        parameters = {}

    cleaning_token_flood = exec_utils.get_param_value(Parameters.CLEANING_TOKEN_FLOOD,
                                                      parameters, False)
    token_replay_variant = exec_utils.get_param_value(Parameters.TOKEN_REPLAY_VARIANT,
                                                      parameters,
                                                      executor.Variants.TOKEN_REPLAY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              log_lib.util.xes.DEFAULT_NAME_KEY)

    # default value for precision, when no activated transitions (not even by looking at the initial marking) are found
    precision = 1.0
    sum_ee = 0   # total escaping edges (weighted by prefix occurrence count)
    sum_at = 0   # total activated transitions (weighted by prefix occurrence count)

    # replay settings: stop at the first deviation, do not try to reach the final marking
    parameters_tr = {
        token_replay.Parameters.CONSIDER_REMAINING_IN_FITNESS: False,
        token_replay.Parameters.TRY_TO_REACH_FINAL_MARKING_THROUGH_HIDDEN: False,
        token_replay.Parameters.STOP_IMMEDIATELY_UNFIT: True,
        token_replay.Parameters.WALK_THROUGH_HIDDEN_TRANS: True,
        token_replay.Parameters.CLEANING_TOKEN_FLOOD: cleaning_token_flood,
        token_replay.Parameters.ACTIVITY_KEY: activity_key
    }

    # one fake one-trace log per distinct prefix of the original log
    prefixes, prefix_count = precision_utils.get_log_prefixes(log, activity_key=activity_key)
    prefixes_keys = list(prefixes.keys())
    fake_log = precision_utils.form_fake_log(prefixes_keys, activity_key=activity_key)

    aligned_traces = executor.apply(fake_log, net, marking, final_marking,
                                    variant=token_replay_variant,
                                    parameters=parameters_tr)

    # fix: also the empty prefix should be counted!
    start_activities = set(get_start_activities(log, parameters=parameters))
    trans_en_ini_marking = set([x.label for x in
                                get_visible_transitions_eventually_enabled_by_marking(
                                    net, marking)])
    diff = trans_en_ini_marking.difference(start_activities)
    sum_at += len(log) * len(trans_en_ini_marking)
    sum_ee += len(log) * len(diff)
    # end fix

    for i in range(len(aligned_traces)):
        if aligned_traces[i]["trace_is_fit"]:
            # only fitting prefixes contribute: compare the transitions enabled in
            # the reached marking against the activities that follow in the log
            log_transitions = set(prefixes[prefixes_keys[i]])
            activated_transitions_labels = set([
                x.label for x in aligned_traces[i]["enabled_transitions_in_marking"]
                if x.label is not None
            ])
            sum_at += len(activated_transitions_labels) * prefix_count[prefixes_keys[i]]
            escaping_edges = activated_transitions_labels.difference(log_transitions)
            sum_ee += len(escaping_edges) * prefix_count[prefixes_keys[i]]

    if sum_at > 0:
        precision = 1 - float(sum_ee) / float(sum_at)

    return precision
def apply(log, net, marking, final_marking, parameters=None):
    """
    Get ET Conformance precision (legacy parameter-style variant)

    Precision is computed as 1 - (escaping edges / activated transitions),
    using token-based replay of the log prefixes on the model.

    Parameters
    ----------
    log
        Trace log
    net
        Petri net
    marking
        Initial marking
    final_marking
        Final marking
    parameters
        Parameters of the algorithm, including:
            pm4py.util.constants.PARAMETER_CONSTANT_ACTIVITY_KEY -> Activity key
            PARAMETER_TOKEN_REPLAY_VARIANT -> token replay variant to use
            "cleaning_token_flood" -> whether token flood cleaning is applied
    """
    if parameters is None:
        parameters = {}

    cleaning_token_flood = parameters[
        "cleaning_token_flood"] if "cleaning_token_flood" in parameters else False
    token_replay_variant = parameters[
        PARAMETER_TOKEN_REPLAY_VARIANT] if PARAMETER_TOKEN_REPLAY_VARIANT in parameters else DEFAULT_TOKEN_REPLAY_VARIANT
    activity_key = parameters[
        PARAM_ACTIVITY_KEY] if PARAM_ACTIVITY_KEY in parameters else log_lib.util.xes.DEFAULT_NAME_KEY

    # default value for precision, when no activated transitions (not even by looking at the initial marking) are found
    precision = 1.0
    sum_ee = 0   # total escaping edges (weighted by prefix occurrence count)
    sum_at = 0   # total activated transitions (weighted by prefix occurrence count)

    # replay settings: stop at the first deviation, do not try to reach the final marking
    parameters_tr = {
        "consider_remaining_in_fitness": False,
        "try_to_reach_final_marking_through_hidden": False,
        "stop_immediately_unfit": True,
        "walk_through_hidden_trans": True,
        "cleaning_token_flood": cleaning_token_flood,
        PARAM_ACTIVITY_KEY: activity_key
    }

    # one fake one-trace log per distinct prefix of the original log
    prefixes, prefix_count = precision_utils.get_log_prefixes(log, activity_key=activity_key)
    prefixes_keys = list(prefixes.keys())
    fake_log = precision_utils.form_fake_log(prefixes_keys, activity_key=activity_key)

    aligned_traces = token_replay.apply(fake_log, net, marking, final_marking,
                                        variant=token_replay_variant,
                                        parameters=parameters_tr)

    # fix: also the empty prefix should be counted!
    start_activities = set(get_start_activities(log, parameters=parameters))
    trans_en_ini_marking = set([x.label for x in
                                get_visible_transitions_eventually_enabled_by_marking(
                                    net, marking)])
    diff = trans_en_ini_marking.difference(start_activities)
    sum_at += len(log) * len(trans_en_ini_marking)
    sum_ee += len(log) * len(diff)
    # end fix

    for i in range(len(aligned_traces)):
        if aligned_traces[i]["trace_is_fit"]:
            # only fitting prefixes contribute: compare the transitions enabled in
            # the reached marking against the activities that follow in the log
            log_transitions = set(prefixes[prefixes_keys[i]])
            activated_transitions_labels = set([
                x.label for x in aligned_traces[i]["enabled_transitions_in_marking"]
                if x.label is not None
            ])
            sum_at += len(activated_transitions_labels) * prefix_count[prefixes_keys[i]]
            escaping_edges = activated_transitions_labels.difference(log_transitions)
            sum_ee += len(escaping_edges) * prefix_count[prefixes_keys[i]]

    if sum_at > 0:
        precision = 1 - float(sum_ee) / float(sum_at)

    return precision
def inductive_miner(log, dfg, threshold, root, act_key, use_msd):
    """
    Core recursion of the inductive miner: try base cases, then the four
    standard cuts (sequence, xor, concurrent, loop), then the fall-through
    constructs, and finally the flower model.

    Parameters
    ----------
    log
        Event log (current recursion sub-log)
    dfg
        Directly-follows graph of the sub-log
    threshold
        Noise threshold forwarded to the recursive calls
    root
        Parent process tree node
    act_key
        Attribute to be used as activity key
    use_msd
        Whether minimum-self-distance witnesses are used for the concurrent cut

    Returns
    -------
    A process tree node for this sub-log.
    """
    alphabet = pm4py.get_attribute_values(log, act_key)
    start_activities = get_starters.get_start_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
    end_activities = get_ends.get_end_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
    empty_traces = pm4py.filter_log(lambda trace: len(trace) == 0, log)
    if len(empty_traces) == 0:
        # base cases: single activity or only empty behavior
        if _is_base_case_act(log, act_key) or _is_base_case_silent(log):
            return _apply_base_case(log, root, act_key)
        # sequence cut needs the transitive (eventually-follows) relations
        pre, post = dfg_utils.get_transitive_relations(dfg, alphabet)
        cut = sequence_cut.detect(alphabet, pre, post)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.SEQUENCE, root), threshold, act_key,
                sequence_cut.project(log, cut, act_key), use_msd)
        cut = xor_cut.detect(dfg, alphabet)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.XOR, root), threshold, act_key,
                xor_cut.project(log, cut, act_key), use_msd)
        # concurrent cut, optionally strengthened with minimum-self-distance witnesses
        cut = concurrent_cut.detect(
            dfg, alphabet, start_activities, end_activities,
            msd=msdw_algo.derive_msd_witnesses(
                log,
                msd_algo.apply(log, parameters={
                    constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key
                }),
                parameters={
                    constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key
                }) if use_msd else None)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.PARALLEL, root), threshold, act_key,
                concurrent_cut.project(log, cut, act_key), use_msd)
        cut = loop_cut.detect(dfg, alphabet, start_activities, end_activities)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.LOOP, root), threshold, act_key,
                loop_cut.project(log, cut, act_key), use_msd)
    if len(empty_traces) > 0:
        # empty traces present: XOR between tau and the mined non-empty behavior
        nempty = pm4py.filter_log(lambda t: len(t) > 0, log)
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.XOR, root),
                                       threshold, act_key, [EventLog(), nempty],
                                       use_msd)
    # fall-throughs (applied when no cut was found):
    # 1) an activity occurring exactly once per trace, parallel to the rest
    aopt = activity_once_per_trace.detect(log, alphabet, act_key)
    if aopt is not None:
        operator = pt.ProcessTree(operator=pt.Operator.PARALLEL, parent=root)
        operator.children.append(
            pt.ProcessTree(operator=None, parent=operator, label=aopt))
        return _add_operator_recursive(
            operator, threshold, act_key,
            activity_once_per_trace.project(log, aopt, act_key), use_msd)
    # 2) an activity concurrent to the rest of the log
    act_conc = activity_concurrent.detect(log, alphabet, act_key, use_msd)
    if act_conc is not None:
        return _add_operator_recursive(
            pt.ProcessTree(pt.Operator.PARALLEL, root), threshold, act_key,
            activity_concurrent.project(log, act_conc, act_key), use_msd)
    # 3) strict tau loop / tau loop on the start (and end) activities
    stl = strict_tau_loop.detect(log, start_activities, end_activities, act_key)
    if stl is not None:
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.LOOP, root),
                                       threshold, act_key, [stl, EventLog()],
                                       use_msd)
    tl = tau_loop.detect(log, start_activities, act_key)
    if tl is not None:
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.LOOP, root),
                                       threshold, act_key, [tl, EventLog()],
                                       use_msd)
    # last resort: flower model over the alphabet
    return _flower(alphabet, root)
def detect_loop(self):
    """
    Detect a loop cut on the current DFG.

    Builds a partition ``p0`` whose first element ``p0[0]`` contains all start
    and end activities (the "do" part of the loop); the remaining elements are
    connected components of the DFG after removing edges touching ``p0[0]``
    (candidate "redo" parts). Components that violate the loop-cut conditions
    are merged back into ``p0[0]``.

    Returns
    -------
    list
        ``[True, p0]`` if a loop cut with at least two parts was found,
        otherwise ``[False, []]``.
    """
    # p0 is part of return value, it contains the partition of activities
    # write all start and end activities in p1
    if self.contains_empty_trace():
        return [False, []]
    start_activities = list(
        start_activities_get.get_start_activities(self.log, parameters=self.parameters).keys())
    end_activities = list(end_activities_get.get_end_activities(self.log, parameters=self.parameters).keys())
    p1 = []
    for act in start_activities:
        if act not in p1:
            p1.append(act)
    for act in end_activities:
        if act not in p1:
            p1.append(act)
    # create new dfg without the transitions to start and end activities
    new_dfg = copy(self.dfg)
    copy_dfg = copy(new_dfg)
    # iterate over a copy because new_dfg is mutated inside the loop
    for ele in copy_dfg:
        if ele[0][0] in p1 or ele[0][1] in p1:
            new_dfg.remove(ele)
    # get connected components of this new dfg
    new_ingoing = get_ingoing_edges(new_dfg)
    new_outgoing = get_outgoing_edges(new_dfg)
    # build a dict of the remaining activities (the dfg itself is a list of
    # ((source, target), count) pairs and cannot be used directly here)
    current_activities = {}
    for element in self.activities:
        if element not in p1:
            current_activities.update({element: 1})
    p0 = detection_utils.get_connected_components(new_ingoing, new_outgoing, current_activities)
    p0.insert(0, p1)
    # flat list of (source, target) edges for membership tests below
    iterable_dfg = []
    for i in range(0, len(self.dfg)):
        iterable_dfg.append(self.dfg[i][0])
    # p0 is like P1,P2,...,Pn in line 3 on page 190 of the IM Thesis
    # check for subsets in p0 that have connections to an end or from a start activity
    p0_copy = []
    for int_el in p0:
        p0_copy.append(int_el)
    for element in p0_copy:  # for every set in p0
        removed = False
        if element in p0 and element != p0[0]:
            for act in element:  # for every activity in this set
                for e in end_activities:  # for every end activity
                    if e not in start_activities:
                        if (act, e) in iterable_dfg:  # check if connected
                            # there is an edge from an activity of this subset to an
                            # end activity: merge the whole subset into p0[0]
                            for activ in element:
                                if activ not in p0[0]:
                                    p0[0].append(activ)
                            if element in p0:
                                # remove subsets that are connected to an end activity
                                p0.remove(element)
                            removed = True
                            break
                if removed:
                    break
                for s in start_activities:
                    if s not in end_activities:
                        if not removed:
                            if (s, act) in iterable_dfg:
                                # same merge for edges coming from a start activity
                                for acti in element:
                                    if acti not in p0[0]:
                                        p0[0].append(acti)
                                if element in p0:
                                    # remove subsets that are connected to an end activity
                                    p0.remove(element)
                                removed = True
                                break
                        else:
                            break
                if removed:
                    break
    iterable_dfg = []
    for i in range(0, len(self.dfg)):
        iterable_dfg.append(self.dfg[i][0])
    p0_copy = []
    for int_el in p0:
        p0_copy.append(int_el)
    for element in p0_copy:
        if element in p0 and element != p0[0]:
            for act in element:
                for e in self.end_activities:
                    if (e, act) in iterable_dfg:  # get those act, that are connected from an end activity
                        for e2 in self.end_activities:  # check, if the act is connected from all end activities
                            if (e2, act) not in iterable_dfg:
                                for acti in element:
                                    if acti not in p0[0]:
                                        p0[0].append(acti)
                                if element in p0:
                                    # remove subsets that are not reachable from all end activities
                                    p0.remove(element)
                                break
                for s in self.start_activities:
                    if (act, s) in iterable_dfg:
                        # same as above (in this case for activities connected to a start activity)
                        for s2 in self.start_activities:
                            if (act, s2) not in iterable_dfg:
                                for acti in element:
                                    if acti not in p0[0]:
                                        p0[0].append(acti)
                                if element in p0:
                                    # remove subsets that do not reach all start activities
                                    p0.remove(element)
                                break
    if len(p0) > 1:
        return [True, p0]
    else:
        return [False, []]
def detect_cut_if(self, second_iteration=False, parameters=None):
    """
    Detect a cut for the infrequent variant of the inductive miner (IMf).

    First checks the base cases (empty log / single activity). Otherwise tries
    the plain-IM cut detection; if that fails, the DFG is filtered on the noise
    threshold and the detection is retried, this time splitting the log with
    the infrequent-aware splitting functions. If still no cut is found, the
    IMf fall-throughs are applied.

    Parameters
    ----------
    second_iteration
        Unused here; kept for interface compatibility with the plain variant.
    parameters
        Algorithm parameters (activity key, etc.).
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              pmutil.xes_constants.DEFAULT_NAME_KEY)
    # check base cases:
    empty_log = base_case.empty_log(self.log)
    single_activity = base_case.single_activity(self.log, activity_key)
    if empty_log:
        self.detected_cut = 'empty_log'
    elif single_activity:
        self.detected_cut = 'single_activity'
    # if no base cases are found, search for a cut:
    # use the cutting and splitting functions of im_plain:
    else:
        found_plain_cut, type_of_cut, cut = self.check_cut_im_plain()
        if found_plain_cut:
            self.apply_cut_im_plain(type_of_cut, cut, activity_key)
        # if im_plain does not find a cut, we filter on our threshold and then again apply the im_cut detection
        # but this time, we have to use different splitting functions:
        else:
            self.filter_dfg_on_threshold()
            found_plain_cut, type_of_cut, cut = self.check_cut_im_plain()
            if found_plain_cut:
                if type_of_cut == 'concurrent':
                    logging.debug("concurrent_cut_if")
                    self.detected_cut = 'concurrent'
                    new_logs = splitting_infrequent.split_xor_infrequent(cut[1], self.log, activity_key)
                    # one child subtree per sub-log produced by the split
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items() if
                                   v > 0]
                        activities = attributes_get.get_attribute_values(l, activity_key)
                        start_activities = list(
                            start_activities_get.get_start_activities(l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_get.get_end_activities(l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(l, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                              self.counts, self.rec_depth + 1, self.f,
                                              noise_threshold=self.noise_threshold,
                                              start_activities=start_activities,
                                              end_activities=end_activities,
                                              initial_start_activities=self.initial_start_activities,
                                              initial_end_activities=self.initial_end_activities,
                                              parameters=parameters))
                elif type_of_cut == 'sequential':
                    logging.debug("sequential_if")
                    new_logs = splitting_infrequent.split_sequence_infrequent(cut[1], self.log, activity_key)
                    self.detected_cut = "sequential"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items() if
                                   v > 0]
                        activities = attributes_get.get_attribute_values(l, activity_key)
                        start_activities = list(
                            start_activities_get.get_start_activities(l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_get.get_end_activities(l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(l, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                              self.counts, self.rec_depth + 1, self.f,
                                              noise_threshold=self.noise_threshold,
                                              start_activities=start_activities,
                                              end_activities=end_activities,
                                              initial_start_activities=self.initial_start_activities,
                                              initial_end_activities=self.initial_end_activities,
                                              parameters=parameters))
                elif type_of_cut == 'parallel':
                    logging.debug("parallel_if")
                    # NOTE: the parallel split of the plain variant is reused here
                    new_logs = split.split_parallel(cut[1], self.log, activity_key)
                    self.detected_cut = "parallel"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items() if
                                   v > 0]
                        activities = attributes_get.get_attribute_values(l, activity_key)
                        start_activities = list(
                            start_activities_get.get_start_activities(l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_get.get_end_activities(l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(l, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                              self.counts, self.rec_depth + 1, self.f,
                                              noise_threshold=self.noise_threshold,
                                              start_activities=start_activities,
                                              end_activities=end_activities,
                                              initial_start_activities=self.initial_start_activities,
                                              initial_end_activities=self.initial_end_activities,
                                              parameters=parameters))
                elif type_of_cut == 'loopCut':
                    logging.debug("loopCut_if")
                    new_logs = splitting_infrequent.split_loop_infrequent(cut[1], self.log, activity_key)
                    self.detected_cut = "loopCut"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items() if
                                   v > 0]
                        activities = attributes_get.get_attribute_values(l, activity_key)
                        start_activities = list(
                            start_activities_get.get_start_activities(l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_get.get_end_activities(l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(l, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                              self.counts, self.rec_depth + 1, self.f,
                                              noise_threshold=self.noise_threshold,
                                              start_activities=start_activities,
                                              end_activities=end_activities,
                                              initial_start_activities=self.initial_start_activities,
                                              initial_end_activities=self.initial_end_activities,
                                              parameters=parameters))
            else:
                self.apply_fall_through_infrequent(parameters)
def apply_tree(log, parameters):
    """
    Apply the IM_FF algorithm to a log obtaining a process tree

    Parameters
    ----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    if parameters is None:
        parameters = {}

    if pkgutil.find_loader("pandas"):
        import pandas as pd
        from pm4py.statistics.variants.pandas import get as variants_get

        if type(log) is pd.DataFrame:
            # dataframes are mined through their variants representation
            # (renamed from `vars`, which shadowed the builtin)
            variants = variants_get.get_variants_count(log, parameters=parameters)
            return apply_tree_variants(variants, parameters=parameters)

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              pmutil.xes_constants.DEFAULT_NAME_KEY)

    log = converter.apply(log, parameters=parameters)
    # keep only the activity attribute (since the others are not used)
    log = filtering_utils.keep_only_one_attribute_per_event(log, activity_key)

    noise_threshold = exec_utils.get_param_value(Parameters.NOISE_THRESHOLD, parameters,
                                                 shared_constants.NOISE_THRESHOLD_IMF)

    # keep only directly-follows pairs that actually occur
    dfg = [(k, v) for k, v in dfg_inst.apply(log, parameters=parameters).items() if v > 0]
    c = Counts()
    activities = attributes_get.get_attribute_values(log, activity_key)
    start_activities = list(start_activities_get.get_start_activities(log, parameters=parameters).keys())
    end_activities = list(end_activities_get.get_end_activities(log, parameters=parameters).keys())

    # True iff the log contains at least one empty trace
    # (trace lengths are computed once and reused, instead of rebuilding the list)
    traces_length = [len(trace) for trace in log]
    contains_empty_traces = bool(traces_length) and min(traces_length) == 0

    # set the threshold parameter based on f and the max value in the dfg:
    max_value = max((value for _, value in dfg), default=0)
    threshold = noise_threshold * max_value

    recursion_depth = 0
    sub = subtree.make_tree(log, dfg, dfg, dfg, activities, c, recursion_depth, noise_threshold,
                            threshold, start_activities, end_activities, start_activities,
                            end_activities, parameters=parameters)

    process_tree = get_tree_repr_implain.get_repr(sub, 0, contains_empty_traces=contains_empty_traces)
    # Ensures consistency to the parent pointers in the process tree
    tree_consistency.fix_parent_pointers(process_tree)
    # Fixes a 1 child XOR that is added when single-activities flowers are found
    tree_consistency.fix_one_child_xor_flower(process_tree)
    # folds the process tree (to simplify it in case fallthroughs/filtering is applied)
    process_tree = util.fold(process_tree)

    return process_tree
def apply_fall_through(self, parameters=None):
    """
    Apply the fall-throughs of the plain inductive miner, in order:
    empty trace, activity once per trace, concurrent activity,
    strict tau loop, tau loop; if all are disabled or fail, a flower
    model is used.

    Each fall-through that fires appends one or two ``SubtreePlain``
    children and sets ``self.detected_cut`` accordingly.

    Parameters
    ----------
    parameters
        Algorithm parameters; the ``*_KEY`` flags can disable individual
        fall-throughs (each defaults to enabled when absent).
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters,
        pmutil.xes_constants.DEFAULT_NAME_KEY)

    # set flags for fall_throughs, base case is True (enabled)
    use_empty_trace = (Parameters.EMPTY_TRACE_KEY not in parameters
                       ) or parameters[Parameters.EMPTY_TRACE_KEY]
    use_act_once_per_trace = (
        Parameters.ONCE_PER_TRACE_KEY not in parameters) or parameters[Parameters.ONCE_PER_TRACE_KEY]
    use_act_concurrent = (Parameters.CONCURRENT_KEY not in parameters
                          ) or parameters[Parameters.CONCURRENT_KEY]
    use_strict_tau_loop = (Parameters.STRICT_TAU_LOOP_KEY not in parameters
                           ) or parameters[Parameters.STRICT_TAU_LOOP_KEY]
    use_tau_loop = (Parameters.TAU_LOOP_KEY not in parameters
                    ) or parameters[Parameters.TAU_LOOP_KEY]

    if use_empty_trace:
        empty_trace, new_log = fall_through.empty_trace(self.log)
        # if an empty trace is found, the empty trace fallthrough applies
    else:
        empty_trace = False
    if empty_trace:
        logging.debug("empty_trace")
        # collect the activities remaining after removing the empty traces
        activites_left = []
        for trace in new_log:
            for act in trace:
                if act[activity_key] not in activites_left:
                    activites_left.append(act[activity_key])
        self.detected_cut = 'empty_trace'
        new_dfg = [(k, v) for k, v in dfg_inst.apply(
            new_log, parameters=parameters).items() if v > 0]
        activities = attributes_get.get_attribute_values(
            new_log, activity_key)
        # NOTE(review): start/end activities here use self.parameters while the
        # dfg uses the local `parameters` — confirm this asymmetry is intended
        start_activities = list(
            start_activities_get.get_start_activities(
                new_log, parameters=self.parameters).keys())
        end_activities = list(
            end_activities_get.get_end_activities(
                new_log, parameters=self.parameters).keys())
        self.children.append(
            SubtreePlain(
                new_log, new_dfg, self.master_dfg, self.initial_dfg,
                activities, self.counts, self.rec_depth + 1,
                noise_threshold=self.noise_threshold,
                start_activities=start_activities,
                end_activities=end_activities,
                initial_start_activities=self.initial_start_activities,
                initial_end_activities=self.initial_end_activities,
                parameters=parameters))
    else:
        if use_act_once_per_trace:
            activity_once, new_log, small_log = fall_through.act_once_per_trace(
                self.log, self.activities, activity_key)
            small_log = filtering_utils.keep_one_trace_per_variant(
                small_log, parameters=parameters)
        else:
            activity_once = False
        if use_act_once_per_trace and activity_once:
            self.detected_cut = 'parallel'
            # create two new dfgs as we need them to append to self.children later
            new_dfg = [(k, v) for k, v in dfg_inst.apply(
                new_log, parameters=parameters).items() if v > 0]
            activities = attributes_get.get_attribute_values(
                new_log, activity_key)
            small_dfg = [(k, v) for k, v in dfg_inst.apply(
                small_log, parameters=parameters).items() if v > 0]
            small_activities = attributes_get.get_attribute_values(
                small_log, activity_key)
            # the once-per-trace activity becomes a leaf child
            self.children.append(
                SubtreePlain(
                    small_log, small_dfg, self.master_dfg, self.initial_dfg,
                    small_activities, self.counts, self.rec_depth + 1,
                    noise_threshold=self.noise_threshold,
                    initial_start_activities=self.initial_start_activities,
                    initial_end_activities=self.initial_end_activities,
                    parameters=parameters))
            # continue with the recursion on the new log
            start_activities = list(
                start_activities_get.get_start_activities(
                    new_log, parameters=self.parameters).keys())
            end_activities = list(
                end_activities_get.get_end_activities(
                    new_log, parameters=self.parameters).keys())
            self.children.append(
                SubtreePlain(
                    new_log, new_dfg, self.master_dfg, self.initial_dfg,
                    activities, self.counts, self.rec_depth + 1,
                    noise_threshold=self.noise_threshold,
                    start_activities=start_activities,
                    end_activities=end_activities,
                    initial_start_activities=self.initial_start_activities,
                    initial_end_activities=self.initial_end_activities,
                    parameters=parameters))
        else:
            if use_act_concurrent:
                activity_concurrent, new_log, small_log, activity_left_out = fall_through.activity_concurrent(
                    self, self.log, self.activities, activity_key,
                    parameters=parameters)
                small_log = filtering_utils.keep_one_trace_per_variant(
                    small_log, parameters=parameters)
            else:
                activity_concurrent = False
            if use_act_concurrent and activity_concurrent:
                self.detected_cut = 'parallel'
                # create two new dfgs on to append later
                new_dfg = [(k, v) for k, v in dfg_inst.apply(
                    new_log, parameters=parameters).items() if v > 0]
                activities = attributes_get.get_attribute_values(
                    new_log, activity_key)
                small_dfg = [(k, v) for k, v in dfg_inst.apply(
                    small_log, parameters=parameters).items() if v > 0]
                small_activities = attributes_get.get_attribute_values(
                    small_log, activity_key)
                # append the concurrent activity as leaf:
                self.children.append(
                    SubtreePlain(
                        small_log, small_dfg, self.master_dfg, self.initial_dfg,
                        small_activities, self.counts, self.rec_depth + 1,
                        noise_threshold=self.noise_threshold,
                        initial_start_activities=self.
                        initial_start_activities,
                        initial_end_activities=self.initial_end_activities,
                        parameters=parameters))
                # continue with the recursion on the new log:
                start_activities = list(
                    start_activities_get.get_start_activities(
                        new_log, parameters=self.parameters).keys())
                end_activities = list(
                    end_activities_get.get_end_activities(
                        new_log, parameters=self.parameters).keys())
                self.children.append(
                    SubtreePlain(
                        new_log, new_dfg, self.master_dfg, self.initial_dfg,
                        activities, self.counts, self.rec_depth + 1,
                        noise_threshold=self.noise_threshold,
                        start_activities=start_activities,
                        end_activities=end_activities,
                        initial_start_activities=self.
                        initial_start_activities,
                        initial_end_activities=self.initial_end_activities,
                        parameters=parameters))
            else:
                if use_strict_tau_loop:
                    strict_tau_loop, new_log = fall_through.strict_tau_loop(
                        self.log, self.start_activities, self.end_activities,
                        activity_key)
                    new_log = filtering_utils.keep_one_trace_per_variant(
                        new_log, parameters=parameters)
                else:
                    strict_tau_loop = False
                if use_strict_tau_loop and strict_tau_loop:
                    activites_left = []
                    for trace in new_log:
                        for act in trace:
                            if act[activity_key] not in activites_left:
                                activites_left.append(act[activity_key])
                    self.detected_cut = 'strict_tau_loop'
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(
                        new_log, parameters=parameters).items() if v > 0]
                    activities = attributes_get.get_attribute_values(
                        new_log, activity_key)
                    start_activities = list(
                        start_activities_get.get_start_activities(
                            new_log, parameters=self.parameters).keys())
                    end_activities = list(
                        end_activities_get.get_end_activities(
                            new_log, parameters=self.parameters).keys())
                    self.children.append(
                        SubtreePlain(new_log, new_dfg, self.master_dfg,
                                     self.initial_dfg, activities, self.counts,
                                     self.rec_depth + 1,
                                     noise_threshold=self.noise_threshold,
                                     start_activities=start_activities,
                                     end_activities=end_activities,
                                     initial_start_activities=self.
                                     initial_start_activities,
                                     initial_end_activities=self.
                                     initial_end_activities,
                                     parameters=parameters))
                else:
                    if use_tau_loop:
                        tau_loop, new_log = fall_through.tau_loop(
                            self.log, self.start_activities, activity_key)
                        new_log = filtering_utils.keep_one_trace_per_variant(
                            new_log, parameters=parameters)
                    else:
                        tau_loop = False
                    if use_tau_loop and tau_loop:
                        activites_left = []
                        for trace in new_log:
                            for act in trace:
                                if act[activity_key] not in activites_left:
                                    activites_left.append(
                                        act[activity_key])
                        self.detected_cut = 'tau_loop'
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            new_log, parameters=parameters).items() if v > 0]
                        activities = attributes_get.get_attribute_values(
                            new_log, activity_key)
                        start_activities = list(
                            start_activities_get.get_start_activities(
                                new_log, parameters=self.parameters).keys())
                        end_activities = list(
                            end_activities_get.get_end_activities(
                                new_log, parameters=self.parameters).keys())
                        self.children.append(
                            SubtreePlain(
                                new_log, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.
                                initial_start_activities,
                                initial_end_activities=self.
                                initial_end_activities,
                                parameters=parameters))
                    else:
                        # no fall-through applied: give up with a flower model
                        logging.debug("flower model")
                        activites_left = []
                        for trace in self.log:
                            for act in trace:
                                if act[activity_key] not in activites_left:
                                    activites_left.append(
                                        act[activity_key])
                        self.detected_cut = 'flower'
def __inductive_miner_internal(log, dfg, threshold, root, act_key, use_msd, remove_noise=False):
    """
    One recursion step of the inductive miner with optional noise filtering.

    On the first pass (``remove_noise=False``) the cuts and fall-throughs are
    tried on the unfiltered DFG; if nothing applies and ``threshold > 0``, the
    step is retried once with ``remove_noise=True``, which filters the DFG on
    the noise threshold before cut detection. Only if that also fails is a
    flower model returned.

    Parameters
    ----------
    log
        Event log (sub-log of the current recursion step)
    dfg
        Directly-follows graph of ``log``
    threshold
        Noise threshold (0 disables filtering)
    root
        Parent process-tree node
    act_key
        Attribute key used as activity name
    use_msd
        Whether minimum-self-distance witnesses are used for the concurrent cut
    remove_noise
        Whether the DFG is filtered on the threshold before detection

    Returns
    -------
    Process tree node representing the mined subtree.
    """
    alphabet = pm4py.get_event_attribute_values(log, act_key)
    if threshold > 0 and remove_noise:
        end_activities = get_ends.get_end_activities(
            log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
        dfg = __filter_dfg_on_threshold(dfg, end_activities, threshold)

    original_length = len(log)
    log = pm4py.filter_log(lambda t: len(t) > 0, log)

    # revised EMPTYSTRACES: only add XOR(tau, ...) when the share of empty
    # traces exceeds the noise threshold
    if original_length - len(log) > original_length * threshold:
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.XOR, root), threshold, act_key,
            [EventLog(), log], use_msd)

    start_activities = get_starters.get_start_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
    end_activities = get_ends.get_end_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})

    # base cases: single activity / only silent behavior
    if __is_base_case_act(log, act_key) or __is_base_case_silent(log):
        return __apply_base_case(log, root, act_key)
    pre, post = dfg_utils.get_transitive_relations(dfg, alphabet)
    # cut detection, in the canonical IM order: sequence, xor, concurrent, loop
    cut = sequence_cut.detect(alphabet, pre, post)
    if cut is not None:
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.SEQUENCE, root), threshold, act_key,
            sequence_cut.project(log, cut, act_key), use_msd)
    cut = xor_cut.detect(dfg, alphabet)
    if cut is not None:
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.XOR, root), threshold, act_key,
            xor_cut.project(log, cut, act_key), use_msd)
    cut = concurrent_cut.detect(
        dfg, alphabet, start_activities, end_activities,
        # msd witnesses are only computed when requested (they are expensive)
        msd=msdw_algo.derive_msd_witnesses(
            log,
            msd_algo.apply(log, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key
            }),
            parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
        if use_msd else None)
    if cut is not None:
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.PARALLEL, root), threshold, act_key,
            concurrent_cut.project(log, cut, act_key), use_msd)
    cut = loop_cut.detect(dfg, alphabet, start_activities, end_activities)
    if cut is not None:
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.LOOP, root), threshold, act_key,
            loop_cut.project(log, cut, act_key), use_msd)

    # fall-throughs, tried in order once no cut was found
    aopt = activity_once_per_trace.detect(log, alphabet, act_key)
    if aopt is not None:
        # the activity occurring exactly once per trace becomes a PARALLEL leaf
        operator = pt.ProcessTree(operator=pt.Operator.PARALLEL, parent=root)
        operator.children.append(
            pt.ProcessTree(operator=None, parent=operator, label=aopt))
        return __add_operator_recursive_logs(
            operator, threshold, act_key,
            activity_once_per_trace.project(log, aopt, act_key), use_msd)
    act_conc = activity_concurrent.detect(log, alphabet, act_key, use_msd)
    if act_conc is not None:
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.PARALLEL, root), threshold, act_key,
            activity_concurrent.project(log, act_conc, act_key), use_msd)
    stl = strict_tau_loop.detect(log, start_activities, end_activities, act_key)
    if stl is not None:
        # LOOP(projected log, tau)
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.LOOP, root), threshold, act_key,
            [stl, EventLog()], use_msd)
    tl = tau_loop.detect(log, start_activities, act_key)
    if tl is not None:
        return __add_operator_recursive_logs(
            pt.ProcessTree(pt.Operator.LOOP, root), threshold, act_key,
            [tl, EventLog()], use_msd)

    # nothing found on the unfiltered DFG: retry once with noise filtering
    if threshold > 0 and not remove_noise:
        return __inductive_miner(log, dfg, threshold, root, act_key, use_msd,
                                 remove_noise=True)

    # last resort: flower model over the whole alphabet
    return __flower(alphabet, root)
def apply_fall_through_infrequent(self, parameters=None):
    """
    Apply the fall-throughs of the infrequent inductive miner (IMf), in
    order: empty trace (with frequency filtering), activity once per trace,
    concurrent activity, strict tau loop, tau loop; if all are disabled or
    fail, a flower model is used.

    Each fall-through that fires appends one or two ``SubtreeInfrequent``
    children and sets ``self.detected_cut`` accordingly. Note that the empty
    trace fall-through mutates ``self.log`` (empty traces are filtered out).

    Parameters
    ----------
    parameters
        Algorithm parameters; the ``*_KEY`` flags can disable individual
        fall-throughs (each defaults to enabled when absent).
    """
    if parameters is None:
        parameters = {}
    # NOTE(review): activity_key is read from self.parameters, not the local
    # `parameters` used by the sibling methods — confirm this asymmetry is intended
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, self.parameters,
                                              pmutil.xes_constants.DEFAULT_NAME_KEY)

    # set flags for fall_throughs, base case is True (enabled)
    use_empty_trace = (Parameters.EMPTY_TRACE_KEY not in parameters) or parameters[
        Parameters.EMPTY_TRACE_KEY]
    use_act_once_per_trace = (Parameters.ONCE_PER_TRACE_KEY not in parameters) or parameters[
        Parameters.ONCE_PER_TRACE_KEY]
    use_act_concurrent = (Parameters.CONCURRENT_KEY not in parameters) or parameters[
        Parameters.CONCURRENT_KEY]
    use_strict_tau_loop = (Parameters.STRICT_TAU_LOOP_KEY not in parameters) or parameters[
        Parameters.STRICT_TAU_LOOP_KEY]
    use_tau_loop = (Parameters.TAU_LOOP_KEY not in parameters) or parameters[Parameters.TAU_LOOP_KEY]

    if use_empty_trace:
        empty_traces_present, enough_traces, new_log = fall_through_infrequent.empty_trace_filtering(
            self.log, self.f)
        # the filtered log (without empty traces) replaces the current one
        self.log = new_log
    else:
        empty_traces_present = False
        enough_traces = False

    # if an empty trace is found, the empty trace fallthrough applies
    if empty_traces_present and enough_traces:
        logging.debug("empty_trace_if")
        self.detected_cut = 'empty_trace'
        # NOTE(review): the dfg uses self.parameters while start/end activities
        # use the local `parameters` — confirm this asymmetry is intended
        new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=self.parameters).items() if
                   v > 0]
        activities = attributes_get.get_attribute_values(new_log, activity_key)
        start_activities = list(
            start_activities_get.get_start_activities(new_log, parameters=parameters).keys())
        end_activities = list(
            end_activities_get.get_end_activities(new_log, parameters=parameters).keys())
        self.children.append(
            SubtreeInfrequent(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities,
                              self.counts, self.rec_depth + 1, self.f,
                              noise_threshold=self.noise_threshold,
                              start_activities=start_activities,
                              end_activities=end_activities,
                              initial_start_activities=self.initial_start_activities,
                              initial_end_activities=self.initial_end_activities,
                              parameters=parameters))
    elif empty_traces_present and not enough_traces:
        # no node is added to the PT, instead we just use recursion on the log without the empty traces
        self.detect_cut_if(parameters=parameters)
    else:
        if use_act_once_per_trace:
            activity_once, new_log, small_log = fall_through.act_once_per_trace(self.log,
                                                                                self.activities,
                                                                                activity_key)
        else:
            activity_once = False
        if activity_once:
            self.detected_cut = 'parallel'
            # create two new dfgs as we need them to append to self.children later
            new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items() if
                       v > 0]
            activities = attributes_get.get_attribute_values(new_log, activity_key)
            small_dfg = [(k, v) for k, v in dfg_inst.apply(small_log, parameters=parameters).items() if
                         v > 0]
            small_activities = attributes_get.get_attribute_values(small_log, activity_key)
            start_activities = list(
                start_activities_get.get_start_activities(new_log, parameters=parameters).keys())
            end_activities = list(
                end_activities_get.get_end_activities(new_log, parameters=parameters).keys())
            # append the chosen activity as leaf:
            self.children.append(
                SubtreeInfrequent(small_log, small_dfg, self.master_dfg, self.initial_dfg,
                                  small_activities, self.counts, self.rec_depth + 1, self.f,
                                  noise_threshold=self.noise_threshold,
                                  initial_start_activities=self.initial_start_activities,
                                  initial_end_activities=self.initial_end_activities,
                                  parameters=parameters))
            # continue with the recursion on the new log
            self.children.append(
                SubtreeInfrequent(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                  self.counts, self.rec_depth + 1, self.f,
                                  noise_threshold=self.noise_threshold,
                                  start_activities=start_activities,
                                  end_activities=end_activities,
                                  initial_start_activities=self.initial_start_activities,
                                  initial_end_activities=self.initial_end_activities,
                                  parameters=parameters))
        else:
            if use_act_concurrent:
                activity_concurrent, new_log, small_log, key = fall_through.activity_concurrent(
                    self, self.log, self.activities, activity_key, parameters=parameters)
            else:
                activity_concurrent = False
            if activity_concurrent:
                self.detected_cut = 'parallel'
                # create two new dfgs on to append later
                new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items() if
                           v > 0]
                activities = attributes_get.get_attribute_values(new_log, activity_key)
                small_dfg = [(k, v) for k, v in dfg_inst.apply(small_log, parameters=parameters).items()
                             if v > 0]
                small_activities = attributes_get.get_attribute_values(small_log, activity_key)
                start_activities = list(
                    start_activities_get.get_start_activities(new_log, parameters=parameters).keys())
                end_activities = list(
                    end_activities_get.get_end_activities(new_log, parameters=parameters).keys())
                # append the concurrent activity as leaf:
                self.children.append(
                    SubtreeInfrequent(small_log, small_dfg, self.master_dfg, self.initial_dfg,
                                      small_activities, self.counts, self.rec_depth + 1, self.f,
                                      noise_threshold=self.noise_threshold,
                                      initial_start_activities=self.initial_start_activities,
                                      initial_end_activities=self.initial_end_activities,
                                      parameters=parameters))
                # continue with the recursion on the new log:
                self.children.append(
                    SubtreeInfrequent(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                      self.counts, self.rec_depth + 1, self.f,
                                      noise_threshold=self.noise_threshold,
                                      start_activities=start_activities,
                                      end_activities=end_activities,
                                      initial_start_activities=self.initial_start_activities,
                                      initial_end_activities=self.initial_end_activities,
                                      parameters=parameters))
            else:
                if use_strict_tau_loop:
                    strict_tau_loop, new_log = fall_through.strict_tau_loop(self.log,
                                                                            self.start_activities,
                                                                            self.end_activities,
                                                                            activity_key)
                else:
                    strict_tau_loop = False
                if strict_tau_loop:
                    self.detected_cut = 'strict_tau_loop'
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items()
                               if v > 0]
                    activities = attributes_get.get_attribute_values(new_log, activity_key)
                    start_activities = list(
                        start_activities_get.get_start_activities(new_log, parameters=parameters).keys())
                    end_activities = list(
                        end_activities_get.get_end_activities(new_log, parameters=parameters).keys())
                    self.children.append(
                        SubtreeInfrequent(new_log, new_dfg, self.master_dfg, self.initial_dfg,
                                          activities, self.counts, self.rec_depth + 1, self.f,
                                          noise_threshold=self.noise_threshold,
                                          start_activities=start_activities,
                                          end_activities=end_activities,
                                          initial_start_activities=self.initial_start_activities,
                                          initial_end_activities=self.initial_end_activities,
                                          parameters=parameters))
                else:
                    if use_tau_loop:
                        tau_loop, new_log = fall_through.tau_loop(self.log, self.start_activities,
                                                                  activity_key)
                    else:
                        tau_loop = False
                    if tau_loop:
                        self.detected_cut = 'tau_loop'
                        new_dfg = [(k, v) for k, v in
                                   dfg_inst.apply(new_log, parameters=parameters).items() if v > 0]
                        activities = attributes_get.get_attribute_values(new_log, activity_key)
                        start_activities = list(
                            start_activities_get.get_start_activities(new_log,
                                                                      parameters=parameters).keys())
                        end_activities = list(
                            end_activities_get.get_end_activities(new_log,
                                                                  parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(new_log, new_dfg, self.master_dfg, self.initial_dfg,
                                              activities, self.counts, self.rec_depth + 1, self.f,
                                              noise_threshold=self.noise_threshold,
                                              start_activities=start_activities,
                                              end_activities=end_activities,
                                              initial_start_activities=self.initial_start_activities,
                                              initial_end_activities=self.initial_end_activities,
                                              parameters=parameters))
                    else:
                        # no fall-through applied: give up with a flower model
                        logging.debug("flower_if")
                        self.detected_cut = 'flower'
def detect_cut(self, second_iteration=False, parameters=None):
    """
    Run one recursion step of the Inductive Miner on this subtree's log.

    First checks the base cases (empty log / single activity). Otherwise the
    cut types are tried in the canonical order
    xor -> sequence -> parallel -> loop; the first cut that applies splits
    the log, and one child SubtreePlain is created per sub-log (the
    recursion continues inside the child's constructor). If no cut applies,
    the fall-through handling is invoked. Requires networkx.

    :param second_iteration: flag not read in this method's visible body --
        presumably consumed by callees/overrides, TODO confirm
    :param parameters: optional dict of algorithm parameters (e.g. the
        activity key); defaults to an empty dict
    :raises Exception: if networkx is not installed
    """
    if pkgutil.find_loader("networkx"):
        import networkx as nx

        if parameters is None:
            parameters = {}
        # resolve the event attribute used as activity name (default: concept:name key)
        activity_key = exec_utils.get_param_value(
            Parameters.ACTIVITY_KEY, parameters,
            pmutil.xes_constants.DEFAULT_NAME_KEY)

        # check base cases:
        empty_log = base_case.empty_log(self.log)
        single_activity = base_case.single_activity(self.log, activity_key)
        if empty_log:
            self.detected_cut = 'empty_log'
        elif single_activity:
            self.detected_cut = 'single_activity'
        # if no base cases are found, search for a cut:
        else:
            # connected components of the dfg feed the xor detection;
            # strongly connected components of the directed graph feed the
            # sequential-cut detection
            conn_components = detection_utils.get_connected_components(
                self.ingoing, self.outgoing, self.activities)
            this_nx_graph = transform_dfg_to_directed_nx_graph(
                self.dfg, activities=self.activities)
            strongly_connected_components = [
                list(x)
                for x in nx.strongly_connected_components(this_nx_graph)
            ]
            xor_cut = self.detect_xor(conn_components)
            # the following part searches for a cut in the current log
            # if a cut is found, the log is split according to the cut, the resulting logs are saved in new_logs
            # recursion is used on all the logs in new_logs
            if xor_cut[0]:
                logging.debug("xor_cut")
                # NOTE(review): the xor cut is recorded under the label
                # 'concurrent' -- kept as-is since downstream tree building
                # appears to rely on these exact labels; confirm before renaming
                self.detected_cut = 'concurrent'
                new_logs = split.split_xor(xor_cut[1], self.log, activity_key)
                # deduplicate traces per variant before recursing (keeps sub-logs small)
                for i in range(len(new_logs)):
                    new_logs[i] = filtering_utils.keep_one_trace_per_variant(
                        new_logs[i], parameters=parameters)
                for l in new_logs:
                    # rebuild dfg / activities / start / end statistics for the sub-log
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(
                        l, parameters=parameters).items() if v > 0]
                    activities = attributes_get.get_attribute_values(
                        l, activity_key)
                    start_activities = list(
                        start_activities_get.get_start_activities(
                            l, parameters=parameters).keys())
                    end_activities = list(
                        end_activities_get.get_end_activities(
                            l, parameters=parameters).keys())
                    self.children.append(
                        SubtreePlain(l, new_dfg, self.master_dfg,
                                     self.initial_dfg, activities,
                                     self.counts, self.rec_depth + 1,
                                     noise_threshold=self.noise_threshold,
                                     start_activities=start_activities,
                                     end_activities=end_activities,
                                     initial_start_activities=self.initial_start_activities,
                                     initial_end_activities=self.initial_end_activities,
                                     parameters=parameters))
            else:
                sequence_cut = cut_detection.detect_sequential_cut(
                    self, self.dfg, strongly_connected_components)
                if sequence_cut[0]:
                    logging.debug("sequence_cut")
                    new_logs = split.split_sequence(
                        sequence_cut[1], self.log, activity_key)
                    for i in range(len(new_logs)):
                        new_logs[i] = filtering_utils.keep_one_trace_per_variant(
                            new_logs[i], parameters=parameters)
                    self.detected_cut = "sequential"
                    for l in new_logs:
                        # per-sub-log statistics, same scheme as the xor branch
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            l, parameters=parameters).items() if v > 0]
                        activities = attributes_get.get_attribute_values(
                            l, activity_key)
                        start_activities = list(
                            start_activities_get.get_start_activities(
                                l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_get.get_end_activities(
                                l, parameters=parameters).keys())
                        self.children.append(
                            SubtreePlain(l, new_dfg, self.master_dfg,
                                         self.initial_dfg, activities,
                                         self.counts, self.rec_depth + 1,
                                         noise_threshold=self.noise_threshold,
                                         start_activities=start_activities,
                                         end_activities=end_activities,
                                         initial_start_activities=self.initial_start_activities,
                                         initial_end_activities=self.initial_end_activities,
                                         parameters=parameters))
                else:
                    parallel_cut = self.detect_concurrent()
                    if parallel_cut[0]:
                        logging.debug("parallel_cut")
                        new_logs = split.split_parallel(
                            parallel_cut[1], self.log, activity_key)
                        for i in range(len(new_logs)):
                            new_logs[i] = filtering_utils.keep_one_trace_per_variant(
                                new_logs[i], parameters=parameters)
                        self.detected_cut = "parallel"
                        for l in new_logs:
                            new_dfg = [(k, v) for k, v in dfg_inst.apply(
                                l, parameters=parameters).items() if v > 0]
                            activities = attributes_get.get_attribute_values(
                                l, activity_key)
                            start_activities = list(
                                start_activities_get.get_start_activities(
                                    l, parameters=parameters).keys())
                            end_activities = list(
                                end_activities_get.get_end_activities(
                                    l, parameters=parameters).keys())
                            self.children.append(
                                SubtreePlain(l, new_dfg, self.master_dfg,
                                             self.initial_dfg, activities,
                                             self.counts, self.rec_depth + 1,
                                             noise_threshold=self.noise_threshold,
                                             start_activities=start_activities,
                                             end_activities=end_activities,
                                             initial_start_activities=self.initial_start_activities,
                                             initial_end_activities=self.initial_end_activities,
                                             parameters=parameters))
                    else:
                        loop_cut = self.detect_loop()
                        if loop_cut[0]:
                            logging.debug("loop_cut")
                            new_logs = split.split_loop(
                                loop_cut[1], self.log, activity_key)
                            for i in range(len(new_logs)):
                                new_logs[i] = filtering_utils.keep_one_trace_per_variant(
                                    new_logs[i], parameters=parameters)
                            self.detected_cut = "loopCut"
                            for l in new_logs:
                                new_dfg = [
                                    (k, v) for k, v in dfg_inst.apply(
                                        l, parameters=parameters).items()
                                    if v > 0
                                ]
                                activities = attributes_get.get_attribute_values(
                                    l, activity_key)
                                start_activities = list(
                                    start_activities_get.get_start_activities(
                                        l, parameters=parameters).keys())
                                end_activities = list(
                                    end_activities_get.get_end_activities(
                                        l, parameters=parameters).keys())
                                self.children.append(
                                    SubtreePlain(l, new_dfg, self.master_dfg,
                                                 self.initial_dfg, activities,
                                                 self.counts,
                                                 self.rec_depth + 1,
                                                 noise_threshold=self.noise_threshold,
                                                 start_activities=start_activities,
                                                 end_activities=end_activities,
                                                 initial_start_activities=self.initial_start_activities,
                                                 initial_end_activities=self.initial_end_activities,
                                                 parameters=parameters))
                        # if the code gets to this point, there is no base_case and no cut found in the log
                        # therefore, we now apply fall through:
                        else:
                            self.apply_fall_through(parameters)
    else:
        msg = "networkx is not available. inductive miner cannot be used!"
        logging.error(msg)
        raise Exception(msg)
def create_process_models(output_case_traces_cluster, path_data_sources, dir_runtime_files, dir_dfg_cluster_files,
                          filename_dfg_cluster, rel_proportion_dfg_threshold, logging_level):
    """
    Creates directly follows graphs out of an event log.
    :param output_case_traces_cluster: traces that are visualised
    :param path_data_sources: path of sources and outputs
    :param dir_runtime_files: folder containing files read and written during runtime
    :param dir_dfg_cluster_files: folder containing dfg png files
    :param filename_dfg_cluster: filename of dfg file (per cluster)
    :param rel_proportion_dfg_threshold: threshold for filtering out sensors in dfg relative to max occurrences of
    a sensor
    :param logging_level: level of logging
    :return: None
    """
    # keep only needed columns
    # BUGFIX: reindex needs an ordered list-like -- pandas rejects a set
    # ("Set type is unordered") and a set would not guarantee column order
    output_case_traces_cluster = output_case_traces_cluster.reindex(
        columns=['Case', 'LC_Activity', 'Timestamp', 'Cluster'])
    output_case_traces_cluster = output_case_traces_cluster.rename(
        columns={
            'Case': 'case:concept:name',
            'LC_Activity': 'concept:name',
            'Timestamp': 'time:timestamp'
        })

    # create directory for dfg pngs
    os.mkdir(path_data_sources + dir_runtime_files + dir_dfg_cluster_files)

    # create dfg for each cluster
    clusters = output_case_traces_cluster.Cluster.unique()
    for cluster in clusters:
        log = output_case_traces_cluster.loc[output_case_traces_cluster.Cluster == cluster]
        log = log.astype(str)
        # convert pandas data frame to pm4py event log for further processing
        log = log_converter.apply(log)

        # keep only activities with more than certain number of occurrences
        activities = attributes_get.get_attribute_values(log, 'concept:name')
        # determine that number relative to the max number of occurrences of a sensor in a cluster
        # (the result is the threshold at which an activity/activity strand is kept)
        min_number_of_occurrences = round((max(activities.values()) * rel_proportion_dfg_threshold), 0)
        activities = {x: y for x, y in activities.items() if y >= min_number_of_occurrences}
        log = attributes_filter.apply(log, activities)

        # create dfg out of event log
        dfg = dfg_discovery.apply(log)

        # define start and end activities
        start_activities = sa_get.get_start_activities(log)
        end_activities = ea_get.get_end_activities(log)

        # create png of dfg (if the graph does not show a graph, it is possible that the sensors did not trigger often)
        gviz = dfg_visualization.apply(
            dfg=dfg,
            log=log,
            variant=dfg_visualization.Variants.FREQUENCY,
            parameters={
                'start_activities': start_activities,
                'end_activities': end_activities
            })
        dfg_visualization.save(
            gviz, path_data_sources + dir_runtime_files + dir_dfg_cluster_files +
            (filename_dfg_cluster.format(cluster=str(cluster))))

    # logger
    logger = logging.getLogger(inspect.stack()[0][3])
    logger.setLevel(logging_level)
    logger.info("Saved directly follows graphs into '../%s'.",
                path_data_sources + dir_runtime_files + dir_dfg_cluster_files)