def apply(bytes, parameters=None):
    """
    Apply the deserialization to the bytes produced by Pyarrow serialization

    Parameters
    --------------
    bytes
        Bytes
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Deserialized event log
    """
    if parameters is None:
        parameters = {}

    buffer = pyarrow.py_buffer(bytes)
    list_objs = pyarrow.deserialize(buffer)
    log = EventLog(attributes=list_objs[0], extensions=list_objs[1], omni_present=list_objs[2],
                   classifiers=list_objs[3])
    for i in range(len(list_objs[4])):
        trace = Trace(attributes=list_objs[4][i])
        for j in range(len(list_objs[5][i])):
            trace.append(Event(list_objs[5][i][j]))
        log.append(trace)
    return log

def apply(log, admitted_variants, parameters=None):
    """
    Filter log keeping/removing only provided variants

    Parameters
    -----------
    log
        Log object
    admitted_variants
        Admitted variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Attribute identifying the activity in the log
            Parameters.POSITIVE -> Indicate if traces should be kept/removed

    Returns
    -----------
    log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    variants = get_variants(log, parameters=parameters)
    log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                   omni_present=log.omni_present)
    for variant in variants:
        if (positive and variant in admitted_variants) or (not positive and variant not in admitted_variants):
            for trace in variants[variant]:
                log.append(trace)
    return log

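# A minimal usage sketch for the variants filter above. Assumptions: the EventLog/Trace/Event
# import path is version-dependent in pm4py (older releases use pm4py.objects.log.log, newer
# ones pm4py.objects.log.obj), and variant keys are comma-separated activity strings, which
# matches older pm4py releases (newer ones use tuples of activities).
def _demo_variants_filter():
    from pm4py.objects.log.log import EventLog, Trace, Event  # hypothetical path, adjust to your version

    log = EventLog()
    for variant in (["a", "b", "c"], ["a", "c"]):
        trace = Trace()
        for act in variant:
            trace.append(Event({"concept:name": act}))
        log.append(trace)

    # keep only the traces following the admitted variant
    filtered = apply(log, admitted_variants={"a,b,c"})
    assert len(filtered) == 1
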
def sample_log(log, no_traces=100):
    """
    Randomly sample a fixed number of traces from the original log

    Parameters
    -----------
    log
        Log
    no_traces
        Number of traces that the sample should have

    Returns
    -----------
    new_log
        Sampled log
    """
    new_log = EventLog(attributes=log.attributes, extensions=log.extensions, globals=log._omni,
                       classifiers=log.classifiers)
    # random.sample yields exactly min(no_traces, len(log)) distinct indexes; collecting
    # randrange draws into a set could return fewer traces than requested because of
    # duplicate draws
    sampled_indexes = random.sample(range(len(log._list)), min(no_traces, len(log._list)))
    for index in sampled_indexes:
        new_log.append(copy(log._list[index]))
    return new_log

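# Quick usage sketch for sample_log (the EventLog import path is an assumption; it varies
# across pm4py versions):
def _demo_sample_log():
    from pm4py.objects.log.log import EventLog, Trace, Event  # hypothetical path

    log = EventLog()
    for _ in range(10):
        trace = Trace()
        trace.append(Event({"concept:name": "a"}))
        log.append(trace)

    sampled = sample_log(log, no_traces=3)
    assert len(sampled) == 3  # random.sample guarantees exactly 3 distinct traces
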
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing 1 case per variant (only control-flow
    perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.pandas import case_statistics

    if parameters is None:
        parameters = {}

    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    activity_key = parameters[pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] \
        if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    log = EventLog()
    for vd in variant_stats:
        variant = vd['variant'].split(",")
        trace = Trace()
        for activity in variant:
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        log.append(trace)
    return log

def filter_log_by_variants_percentage(log, variants, variants_percentage=0.0):
    """
    Filter the log by variants percentage

    Parameters
    ----------
    log
        Log
    variants
        Dictionary with variant as the key and the list of traces as the value
    variants_percentage
        Percentage of variants that should be kept (the most common variant is always kept)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = EventLog()
    no_of_traces = len(log)
    variant_count = get_variants_sorted_by_count(variants)
    already_added_sum = 0

    for i in range(len(variant_count)):
        variant = variant_count[i][0]
        varcount = variant_count[i][1]
        percentage_already_added = already_added_sum / no_of_traces
        if already_added_sum == 0 or percentage_already_added < variants_percentage:
            for trace in variants[variant]:
                filtered_log.append(trace)
            already_added_sum = already_added_sum + varcount

    return filtered_log

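# Worked sketch of the threshold logic above, using a hypothetical variants dictionary
# with trace counts 6/3/1 over 10 traces: at variants_percentage=0.6 only the most common
# variant survives, since after adding it the covered share (0.6) is no longer strictly
# below the threshold (import path is version-dependent, as above).
def _demo_variants_percentage():
    from pm4py.objects.log.log import EventLog, Trace, Event  # hypothetical path

    def make_trace(acts):
        t = Trace()
        for a in acts:
            t.append(Event({"concept:name": a}))
        return t

    log = EventLog()
    variants = {"a,b": [], "a,c": [], "a": []}
    for variant, count in (("a,b", 6), ("a,c", 3), ("a", 1)):
        for _ in range(count):
            t = make_trace(variant.split(","))
            variants[variant].append(t)
            log.append(t)

    filtered = filter_log_by_variants_percentage(log, variants, variants_percentage=0.6)
    assert len(filtered) == 6  # only the most common variant is kept
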
def apply(log, admitted_variants, parameters=None):
    """
    Filter log keeping/removing only provided variants

    Parameters
    -----------
    log
        Log object
    admitted_variants
        Admitted variants
    parameters
        Parameters of the algorithm, including:
            activity_key -> Attribute identifying the activity in the log
            positive -> Indicate if traces should be kept/removed

    Returns
    -----------
    log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    positive = parameters["positive"] if "positive" in parameters else True
    variants = get_variants(log, parameters=parameters)
    log = EventLog()
    for variant in variants:
        if (positive and variant in admitted_variants) or (not positive and variant not in admitted_variants):
            for trace in variants[variant]:
                log.append(trace)
    return log

def update_merge(loglist):
    """
    Merge a list of event logs into a single event log by concatenating their traces
    """
    mergedlog = EventLog()
    for log in loglist:
        for trace in log:
            mergedlog.append(trace)
    return mergedlog

def sample_eventlog(event_log, no_events=100):
    """
    Randomly sample a fixed number of events from the original event log

    Parameters
    -----------
    event_log
        Event log
    no_events
        Number of events that the sample should have

    Returns
    -----------
    new_log
        Sampled log
    """
    new_log = EventLog(attributes=event_log.attributes, extensions=event_log.extensions, globals=event_log._omni,
                       classifiers=event_log.classifiers)
    # random.sample avoids the duplicate draws that collecting randrange results in a set
    # can produce, so the sample has exactly min(no_events, len(event_log)) events
    sampled_indexes = random.sample(range(len(event_log._list)), min(no_events, len(event_log._list)))
    for index in sampled_indexes:
        new_log.append(copy(event_log._list[index]))
    return new_log

def __align(obj: Union[Trace, EventLog], pt: ProcessTree, max_trace_length: int = 1,
            max_process_tree_height: int = 1, parameters=None):
    """
    this function approximates alignments for a given event log or trace and a process tree

    :param obj: event log or single trace
    :param pt: process tree
    :param max_trace_length: specifies when the recursive splitting stops based on the trace's length
    :param max_process_tree_height: specifies when the recursive splitting stops based on the tree's height
    :return:
    """
    assert isinstance(pt, ProcessTree)
    if isinstance(obj, Trace):
        e = EventLog()
        e.append(obj)
        obj = e
    assert isinstance(obj, EventLog)

    pt = process_tree_to_binary_process_tree(pt)
    pt = EfficientTree(pt)

    return __approximate_alignments_for_log(obj, pt, max_trace_length, max_process_tree_height,
                                            parameters=parameters)

def write_sample_logs_to_fs(clus_dict, filepath):
    """
    Build separate logs with traces corresponding to each cluster and write them to the filesystem.

    Parameters
    -----------
    clus_dict : dict
        Dictionary using the cluster labels as keys and the corresponding list of case ids as values.
    filepath
        Path to the XES log file
    """
    log = xes_importer.import_log(filepath)
    for key, value in clus_dict.items():
        args = {'attributes': log.attributes, 'extensions': log.extensions,
                'omni_present': log.omni_present, 'classifiers': log.classifiers}
        samplelog = EventLog(**args)
        goalpath = filepath[:-4] + "_" + key + ".xes"
        for trace in log:
            if trace.attributes['concept:name'] in value:
                samplelog.append(deepcopy(trace))
        xes_exporter.export_log(samplelog, goalpath)

def apply_variants_filter(log, admitted_variants, parameters=None):
    """
    Filter log keeping/removing only provided variants

    Parameters
    -----------
    log
        Log object
    admitted_variants
        Admitted variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Attribute identifying the activity in the log
            Parameters.POSITIVE -> Indicate if traces should be kept/removed

    Returns
    -----------
    log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    variants = variants_statistics.get_variants(log, parameters=parameters)
    log = EventLog()
    for variant in variants:
        if (positive and variant in admitted_variants) or (not positive and variant not in admitted_variants):
            for trace in variants[variant]:
                log.append(trace)
    return log

def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing 1 case per variant (only control-flow
    perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    log = EventLog()
    for vd in variant_stats:
        variant = vd['variant'].split(",")
        trace = Trace()
        for activity in variant:
            event = Event()
            event[xes.DEFAULT_NAME_KEY] = activity
            trace.append(event)
        log.append(trace)
    return log

def get_log_with_log_prefixes(log, parameters=None):
    """
    Gets an extended log that contains, in order, all the prefixes for a case of the original log

    Parameters
    --------------
    log
        Original log
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    all_prefixes_log
        Log with all the prefixes
    """
    all_prefixes_log = EventLog()

    for trace in log:
        cumulative_trace = Trace()
        for event in trace:
            all_prefixes_log.append(deepcopy(cumulative_trace))
            cumulative_trace.append(event)
        all_prefixes_log.append(deepcopy(cumulative_trace))

    return all_prefixes_log

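# Behavioural note as a sketch: a trace with n events yields n + 1 entries in the prefix
# log, because the empty prefix is included (import path is an assumption, as above).
def _demo_log_prefixes():
    from pm4py.objects.log.log import EventLog, Trace, Event  # hypothetical path

    trace = Trace()
    for act in ("a", "b", "c"):
        trace.append(Event({"concept:name": act}))
    log = EventLog()
    log.append(trace)

    prefixes = get_log_with_log_prefixes(log)
    assert [len(t) for t in prefixes] == [0, 1, 2, 3]
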
def project(log, groups, activity_key):
    '''
    This method projects the log based on a presumed sequence cut and a list of activity groups

    Parameters
    ----------
    log
        original log
    groups
        list of activity sets to be used in projection (activities can only appear in one group)
    activity_key
        key to use in the event to derive the activity name

    Returns
    -------
        list of corresponding logs according to the sequence cut.
    '''
    # currently, not 'noise' proof
    logs = list()
    for group in groups:
        proj = EventLog()
        for t in log:
            proj.append(pm4py.filter_trace(lambda e: e[activity_key] in group, t))
        logs.append(proj)
    return logs

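# Sketch of the sequence-cut projection above: with groups [{"a"}, {"b", "c"}] a trace
# <a, b, c> is split into the sub-traces <a> and <b, c>, one projected log per group
# (import path is an assumption, as above).
def _demo_sequence_project():
    from pm4py.objects.log.log import EventLog, Trace, Event  # hypothetical path

    trace = Trace()
    for act in ("a", "b", "c"):
        trace.append(Event({"concept:name": act}))
    log = EventLog()
    log.append(trace)

    logs = project(log, [{"a"}, {"b", "c"}], "concept:name")
    assert [len(logs[0][0]), len(logs[1][0])] == [1, 2]
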
def apply_from_variants_list(var_list, parameters=None):
    """
    Discovers the log skeleton from the variants list

    Parameters
    ---------------
    var_list
        Variants list
    parameters
        Parameters

    Returns
    ---------------
    model
        Log skeleton model
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    variant_delimiter = exec_utils.get_param_value(Parameters.PARAMETER_VARIANT_DELIMITER, parameters,
                                                   constants.DEFAULT_VARIANT_SEP)
    log = EventLog()
    for cv in var_list:
        v = cv[0]
        tr = v.split(variant_delimiter)
        trace = Trace()
        for act in tr:
            trace.append(Event({activity_key: act}))
        log.append(trace)

    return apply(log, parameters=parameters)

def import_tel_from_yawl(input_file_path):
    '''
    Imports a translucent event log from YAWL logging

    Parameters
    ----------
    :param input_file_path: input file path of the YAWL log

    Returns
    --------
    :return: translucent event log (only complete)
    '''
    log = import_tel(input_file_path)
    new_log = EventLog()
    s = set()

    for trace in log:
        new_trace = Trace()
        ci = trace.attributes['concept:name']
        for event in trace:
            if event['lifecycle:instance'] == ci:
                if event['lifecycle:transition'] == 'schedule':
                    s.add(event['concept:name'])
                elif event['lifecycle:transition'] == 'complete':
                    event.set_enabled(frozenset(s))
                    new_trace.append(event)
                    s.remove(event['concept:name'])
        new_log.append(new_trace)

    return new_log

def project(log: EventLog, cut: Cut, activity_key: str) -> List[EventLog]:
    do = cut[0]
    redo = cut[1:]
    do_log = EventLog()
    # build one independent log per redo group; multiplying a one-element list
    # ([EventLog()] * len(redo)) would alias the same EventLog object in every slot
    redo_logs = [EventLog() for _ in range(len(redo))]
    for t in log:
        do_trace = Trace()
        redo_trace = Trace()
        for e in t:
            if e[activity_key] in do:
                do_trace.append(e)
                if len(redo_trace) > 0:
                    redo_logs = _append_trace_to_redo_log(redo_trace, redo_logs, redo, activity_key)
                    redo_trace = Trace()
            else:
                redo_trace.append(e)
                if len(do_trace) > 0:
                    do_log.append(do_trace)
                    do_trace = Trace()
        if len(redo_trace) > 0:
            redo_logs = _append_trace_to_redo_log(redo_trace, redo_logs, redo, activity_key)
        do_log.append(do_trace)
    logs = [do_log]
    logs.extend(redo_logs)
    return logs

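# _append_trace_to_redo_log is referenced above but not shown here. A minimal sketch of a
# compatible helper (an assumption about its behaviour, not the library's actual code):
# route the finished redo sub-trace to the first redo group that covers all of its activities.
def _append_trace_to_redo_log(redo_trace, redo_logs, redo, activity_key):
    activities = set(e[activity_key] for e in redo_trace)
    for i in range(len(redo)):
        if activities.issubset(redo[i]):
            redo_logs[i].append(redo_trace)
            break
    return redo_logs
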
def create_log(G, conn_comp, timestamps, max_comp_len=50, include_loops=False):
    """
    Build an event log where each connected component (up to max_comp_len nodes) becomes
    one trace; nodes are 'attribute=value' strings, ordered topologically
    """
    log = EventLog()
    for i in range(len(conn_comp)):
        if len(conn_comp[i]) <= max_comp_len:
            trace = Trace()
            trace.attributes["concept:name"] = str(i)
            SG = G.subgraph(conn_comp[i])
            SGG = networkx.DiGraph(SG)
            # remove self-loops so that a topological sort is possible
            edges = list(SGG.edges)
            for e in edges:
                if e[0] == e[1]:
                    SGG.remove_edge(e[0], e[1])
            sorted_nodes = list(networkx.topological_sort(SGG))
            for n in sorted_nodes:
                selfloop = 1 if (n, n) in SG.edges else 0
                trace.append(Event(
                    {'time:timestamp': timestamps[n.split("=")[1]], 'concept:name': n.split("=")[0],
                     'value': n.split("=")[1], 'typevalue': n, 'selfloop': selfloop}))
                if include_loops and selfloop:
                    trace.append(Event(
                        {'time:timestamp': timestamps[n.split("=")[1]], 'concept:name': n.split("=")[0],
                         'value': n.split("=")[1], 'typevalue': n, 'selfloop': selfloop}))
            log.append(trace)
    log = sorting.sort_timestamp_log(log, "time:timestamp")
    return log

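# Usage sketch for create_log: nodes are "attribute=value" strings and timestamps are keyed
# by the value part. networkx is required; the datetime values are illustrative.
def _demo_create_log():
    import datetime
    import networkx

    G = networkx.DiGraph()
    G.add_edge("name=x", "name=y")
    conn_comp = [["name=x", "name=y"]]
    timestamps = {"x": datetime.datetime(2021, 1, 1), "y": datetime.datetime(2021, 1, 2)}

    log = create_log(G, conn_comp, timestamps)
    assert [e["concept:name"] for e in log[0]] == ["name", "name"]
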
def apply_from_variants_list(var_list, tree, parameters=None):
    """
    Apply the alignments from the specification of a list of variants in the log

    Parameters
    -------------
    var_list
        List of variants (for each item, the first entry is the variant itself,
        the second entry may be the number of cases)
    tree
        Process tree
    parameters
        Parameters of the algorithm

    Returns
    --------------
    dictio_alignments
        Dictionary that assigns to each variant its alignment
    """
    if parameters is None:
        parameters = {}

    dictio_alignments = {}
    log = EventLog()
    for varitem in var_list:
        trace = variants_util.variant_to_trace(varitem[0], parameters=parameters)
        log.append(trace)

    alignments = apply(log, tree, parameters=parameters)
    for index, varitem in enumerate(var_list):
        dictio_alignments[varitem[0]] = alignments[index]
    return dictio_alignments

def filter_log_by_end_activities(end_activities, variants, vc, threshold, activity_key="concept:name"):
    """
    Keep only variants of the log whose end activity has a number of occurrences above the threshold

    Parameters
    ----------
    end_activities
        Dictionary of end activities associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (variants whose end activity has a number of occurrences below the
        threshold are removed; the end activity of the most common variant is always kept)
    activity_key
        (If specified) Specify the activity key in the log (default concept:name)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = EventLog()
    # end activity of the most common variant (always admitted)
    fvea = variants[vc[0][0]][0][-1][activity_key]
    for variant in variants:
        vea = variants[variant][0][-1][activity_key]
        if vea in end_activities:
            if vea == fvea or end_activities[vea] >= threshold:
                for trace in variants[variant]:
                    filtered_log.append(trace)
    return filtered_log

def apply_from_variants_list(var_list, parameters=None):
    """
    Discovers the log skeleton from the variants list

    Parameters
    ---------------
    var_list
        Variants list
    parameters
        Parameters

    Returns
    ---------------
    model
        Log skeleton model
    """
    if parameters is None:
        parameters = {}

    log = EventLog()
    for cv in var_list:
        v = cv[0]
        trace = variants_util.variant_to_trace(v, parameters=parameters)
        log.append(trace)

    return apply(log, parameters=parameters)

def project(log: EventLog, activity: str, activity_key: str) -> List[EventLog]:
    proj = EventLog()
    proj_act = EventLog()
    for t in log:
        proj.append(pm4py.filter_trace(lambda e: e[activity_key] != activity, t))
        proj_act.append(pm4py.filter_trace(lambda e: e[activity_key] == activity, t))
    return [proj_act, proj]

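# Sketch of the activity projection above: for activity "a" the first returned log contains
# only the "a" events of each trace, the second everything else (import path is an assumption):
def _demo_activity_project():
    from pm4py.objects.log.log import EventLog, Trace, Event  # hypothetical path

    trace = Trace()
    for act in ("a", "b", "a"):
        trace.append(Event({"concept:name": act}))
    log = EventLog()
    log.append(trace)

    proj_act, proj = project(log, "a", "concept:name")
    assert len(proj_act[0]) == 2 and len(proj[0]) == 1
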
def apply_tree_variants(variants, parameters=None):
    """
    Apply the IM_F algorithm to a dictionary of variants obtaining a process tree

    Parameters
    ----------
    variants
        Variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    log = EventLog()
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)

    var_keys = list(variants.keys())
    for var in var_keys:
        val = variants[var]
        if type(val) is list:
            # the variants dictionary may map each variant either to the list of its
            # traces or directly to the number of occurrences
            val = len(val)
        for i in range(val):
            trace = variants_util.variant_to_trace(var, parameters=parameters)
            log.append(trace)

    return apply_tree(log, parameters=parameters)

def keep_one_trace_per_variant(log, parameters=None):
    """
    Keeps only one trace per variant (does not matter for basic inductive miner)

    Parameters
    --------------
    log
        Log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    new_log
        Log (with one trace per variant)
    """
    if parameters is None:
        parameters = {}

    variants = variants_module.get_variants(log, parameters=parameters)
    new_log = EventLog()
    for var in variants:
        new_log.append(variants[var][0])
    return new_log

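# Usage sketch for keep_one_trace_per_variant (import path is an assumption):
def _demo_one_trace_per_variant():
    from pm4py.objects.log.log import EventLog, Trace, Event  # hypothetical path

    log = EventLog()
    for acts in (["a", "b"], ["a", "b"], ["a", "c"]):
        trace = Trace()
        for act in acts:
            trace.append(Event({"concept:name": act}))
        log.append(trace)

    reduced = keep_one_trace_per_variant(log)
    assert len(reduced) == 2  # one representative per variant
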
def detect(log: EventLog, alphabet: Dict[str, int], act_key: str, use_msd: bool) -> Optional[str]:
    # only activities occurring in every trace are candidates for being split off
    candidates = set(alphabet.keys())
    for t in log:
        candidates = candidates.intersection(set(map(lambda e: e[act_key], t)))
    if len(candidates) == 0:
        return None
    for a in candidates:
        proj = EventLog()
        for t in log:
            proj.append(pm4py.filter_trace(lambda e: e[act_key] != a, t))
        if len(list(filter(lambda t: len(t) == 0, proj))) == 0:
            dfg_proj = discover_dfg.apply(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            alphabet_proj = pm4py.get_attribute_values(proj, act_key)
            start_act_proj = get_starters.get_start_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            # the end activities must be computed on the projected log as well; computing
            # them on the original log would refer to the removed activity
            end_act_proj = get_ends.get_end_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            pre_proj, post_proj = dfg_utils.get_transitive_relations(dfg_proj, alphabet_proj)
            cut = sequence_cut.detect(alphabet_proj, pre_proj, post_proj)
            if cut is not None:
                return a
            cut = xor_cut.detect(dfg_proj, alphabet_proj)
            if cut is not None:
                return a
            cut = concurrent_cut.detect(dfg_proj, alphabet_proj, start_act_proj, end_act_proj,
                                        msd=msdw_algo.derive_msd_witnesses(proj, msd_algo.apply(log, parameters={
                                            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key}), parameters={
                                            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key}) if use_msd else None)
            if cut is not None:
                return a
            cut = loop_cut.detect(dfg_proj, alphabet_proj, start_act_proj, end_act_proj)
            if cut is not None:
                return a
    return None

def apply_tree_variants(variants, parameters=None):
    """
    Apply the IM algorithm to a dictionary of variants obtaining a process tree

    Parameters
    ----------
    variants
        Variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    log = EventLog()
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)

    var_keys = list(variants.keys())
    for var in var_keys:
        trace = Trace()
        activities = var.split(constants.DEFAULT_VARIANT_SEP)
        for act in activities:
            trace.append(Event({activity_key: act}))
        log.append(trace)

    return apply_tree(log, parameters=parameters)

def generate_log(pt, no_traces=100):
    """
    Generate a log out of a process tree

    Parameters
    ------------
    pt
        Process tree
    no_traces
        Number of traces that should be generated

    Returns
    ------------
    log
        Trace log object
    """
    log = EventLog()

    for i in range(no_traces):
        ex_seq = execute(pt)
        ex_seq_labels = pt_util.project_execution_sequence_to_labels(ex_seq)
        trace = Trace()
        trace.attributes[xes.DEFAULT_NAME_KEY] = str(i)
        for label in ex_seq_labels:
            event = Event()
            event[xes.DEFAULT_NAME_KEY] = label
            trace.append(event)
        log.append(trace)

    return log

def get_log_with_log_prefixes(log, parameters=None):
    """
    Gets an extended log that contains, in order, all the prefixes for a case of the original log

    Parameters
    --------------
    log
        Original log
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    all_prefixes_log
        Log with all the prefixes
    change_indexes
        Indexes of the extended log where there was a change between cases
    """
    all_prefixes_log = EventLog()
    change_indexes = []

    for trace in log:
        cumulative_trace = Trace()
        for event in trace:
            all_prefixes_log.append(deepcopy(cumulative_trace))
            cumulative_trace.append(event)
        all_prefixes_log.append(deepcopy(cumulative_trace))
        change_indexes.append([len(all_prefixes_log) - 1] * len(trace))

    return all_prefixes_log, change_indexes

def split_log_on_cluster_attribute(log):
    """
    Splits a given log into two sublogs based on the cluster trace attribute.
    Separates clustered traces from not yet clustered ones, indicated by the
    cluster attribute having value 0.

    Parameters
    -----------
    log
        EventLog object

    Returns
    -----------
    log1
        EventLog object of traces which are assigned to a cluster.
    log2
        EventLog object of traces not assigned to a cluster yet.
    """
    # insert traces where the cluster attribute is nonzero into log1, the rest into log2
    log1 = EventLog()
    log2 = EventLog()
    for trace in log:
        if trace.attributes['cluster'] != '0':
            log1.append(trace)
        else:
            log2.append(trace)
    return log1, log2

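# Usage sketch for split_log_on_cluster_attribute; note that the 'cluster' trace attribute
# is compared as a string (import path is an assumption):
def _demo_split_on_cluster():
    from pm4py.objects.log.log import EventLog, Trace  # hypothetical path

    clustered, unclustered = Trace(), Trace()
    clustered.attributes["cluster"] = "1"
    unclustered.attributes["cluster"] = "0"
    log = EventLog([clustered, unclustered])

    log1, log2 = split_log_on_cluster_attribute(log)
    assert len(log1) == 1 and len(log2) == 1
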
def filter_log_by_paths(log, paths, variants, vc, threshold, attribute_key="concept:name"):
    """
    Keep only paths whose number of occurrences is above the threshold (or which belong
    to the first variant)

    Parameters
    ----------
    log
        Log
    paths
        Dictionary of paths associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (paths whose number of occurrences is below the threshold are removed)
    attribute_key
        (If specified) Specify the attribute key to use (default concept:name)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = EventLog()
    # paths of the most common variant (always admitted)
    fvft = variants[vc[0][0]][0]
    fvp = set()
    for i in range(0, len(fvft) - 1):
        path = fvft[i][attribute_key] + "," + fvft[i + 1][attribute_key]
        fvp.add(path)

    for trace in log:
        new_trace = Trace()
        jj = 0
        if len(trace) > 0:
            new_trace.append(trace[0])
            for j in range(1, len(trace) - 1):
                jj = j
                if j >= len(trace):
                    break
                if attribute_key in trace[j] and attribute_key in trace[j + 1]:
                    path = trace[j][attribute_key] + "," + trace[j + 1][attribute_key]
                    if path in paths:
                        if path in fvp or paths[path] >= threshold:
                            new_trace.append(trace[j])
                            new_trace.append(trace[j + 1])
        if len(trace) > 1 and not jj == len(trace):
            new_trace.append(trace[-1])
        if len(new_trace) > 0:
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            filtered_log.append(new_trace)
    return filtered_log
