def filter_on_case_size(log, min_case_size=2, max_case_size=None):
    """
    Get only traces in the log with a given size

    Parameters
    -----------
    log
        Log
    min_case_size
        Minimum desired size of traces
    max_case_size
        Maximum desired size of traces

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if max_case_size is not None:
        filtered_log = EventLog([trace for trace in log if min_case_size <= len(trace) <= max_case_size])
    else:
        filtered_log = EventLog([trace for trace in log if len(trace) >= min_case_size])
    return filtered_log

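# Minimal usage sketch for filter_on_case_size (not part of the original module).
# Assumes pm4py is installed; the import path matches recent pm4py releases
# (older releases expose the same classes under pm4py.objects.log.log).
from pm4py.objects.log.obj import EventLog, Trace, Event

toy_log = EventLog()
for size in (1, 2, 5):
    toy_log.append(Trace([Event({"concept:name": "act_%d" % j}) for j in range(size)]))

# Keep only traces with 2 to 4 events: the 1-event and 5-event traces drop out
filtered = filter_on_case_size(toy_log, min_case_size=2, max_case_size=4)
assert [len(t) for t in filtered] == [2]
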
def project(log: EventLog, activity: str, activity_key: str) -> List[EventLog]:
    # First sub-log: every trace restricted to 'activity';
    # second sub-log: every trace with 'activity' filtered out
    proj = EventLog()
    proj_act = EventLog()
    for t in log:
        proj.append(pm4py.filter_trace(lambda e: e[activity_key] != activity, t))
        proj_act.append(pm4py.filter_trace(lambda e: e[activity_key] == activity, t))
    return [proj_act, proj]

def split_log_on_cluster_attribute(log):
    """
    Splits a given log into two sublogs based on the cluster trace attribute.
    Separates clustered traces from not yet clustered ones, indicated by the
    cluster attribute having value 0.

    Parameters
    -----------
    log
        EventLog object

    Returns
    -----------
    log1
        EventLog object of traces which are assigned to a cluster.
    log2
        EventLog object of traces not assigned to a cluster yet.
    """
    # Insert traces where cluster attribute is nonzero into log1, rest into log2
    log1 = EventLog()
    log2 = EventLog()
    for trace in log:
        if trace.attributes['cluster'] != '0':
            log1.append(trace)
        else:
            log2.append(trace)
    return log1, log2

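# Illustrative sketch for split_log_on_cluster_attribute (not from the source):
# each trace carries a string-valued 'cluster' attribute, '0' meaning
# "not assigned to a cluster yet".
from pm4py.objects.log.obj import EventLog, Trace

cluster_log = EventLog()
for cluster_id in ('0', '1', '0', '2'):
    t = Trace()
    t.attributes['cluster'] = cluster_id
    cluster_log.append(t)

clustered, unclustered = split_log_on_cluster_attribute(cluster_log)
assert len(clustered) == 2 and len(unclustered) == 2
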
def project(log: EventLog, cut: Cut, activity_key: str) -> List[EventLog]:
    do = cut[0]
    redo = cut[1:]
    do_log = EventLog()
    # one distinct sub-log per redo child (list multiplication such as
    # [EventLog()] * len(redo) would alias a single log object)
    redo_logs = [EventLog() for _ in range(len(redo))]
    for t in log:
        do_trace = Trace()
        redo_trace = Trace()
        for e in t:
            if e[activity_key] in do:
                do_trace.append(e)
                if len(redo_trace) > 0:
                    redo_logs = _append_trace_to_redo_log(redo_trace, redo_logs, redo, activity_key)
                    redo_trace = Trace()
            else:
                redo_trace.append(e)
                if len(do_trace) > 0:
                    do_log.append(do_trace)
                    do_trace = Trace()
        if len(redo_trace) > 0:
            redo_logs = _append_trace_to_redo_log(redo_trace, redo_logs, redo, activity_key)
        do_log.append(do_trace)
    logs = [do_log]
    logs.extend(redo_logs)
    return logs

def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """Returns the encoded logs

    Returns the training and test DataFrames encoded using the given job configuration,
    loading from cache if possible

    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame
    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)
        else:
            if job.split.train_log is not None and \
                    job.split.test_log is not None and \
                    LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                             test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(job.split)
            else:
                training_log, test_log, additional_columns = prepare_logs(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(training_log, test_log, job,
                                                     additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job,
                                                 additional_columns=additional_columns)
    return training_df, test_df

def apply(tree1, tree2, log, alignments, parameters=None):
    """
    Alignment repair on tree2 based on the alignment of log on tree1

    Parameters
    -----------
    tree1
        Process Tree
    tree2
        Process Tree
    log
        EventLog
    alignments
        Related alignment of log on tree1
    parameters
        Parameters of the algorithm

    Returns
    ------------
    alignments
        Repaired alignments
    """
    parameters = {} if parameters is None else parameters
    parameters['COMPARE_OPTION'] = 1 if parameters.get('COMPARE_OPTION') is None else parameters['COMPARE_OPTION']
    ret_tuple_as_trans_desc = False if parameters.get(PARAM_ALIGNMENT_RESULT_IS_SYNC_PROD_AWARE) is None else \
        parameters[PARAM_ALIGNMENT_RESULT_IS_SYNC_PROD_AWARE]
    # TODO: if the given alignment is not True, try-catch
    alignments = copy.deepcopy(alignments)
    com_res = pt_compare.apply(tree1, tree2, parameters['COMPARE_OPTION'])
    if com_res.value:
        return alignments
    else:
        tree1_total_number = pt_mani_utils.nodes_number(tree1)
        pt_number.apply(com_res.subtree2, 'D', tree1_total_number + 1)
        best_worst_cost = apply_pt_alignments(EventLog([Trace()]), tree2, parameters)[0]['cost']
        for i in range(len(alignments)):
            align = alignments[i]
            if align.get("repair") is None:
                scope = detect_change_scope(align['alignment'], com_res.subtree1, log[i], ret_tuple_as_trans_desc)
                if not len(scope.traces) == 0:
                    sub_aligns_before = apply_pt_alignments(EventLog(scope.traces), com_res.subtree1, parameters)
                    sub_aligns_after = apply_pt_alignments(EventLog(scope.traces), com_res.subtree2, parameters)
                    alignment_reassemble(align['alignment'], sub_aligns_after, scope.anchor_index, com_res.subtree1,
                                         ret_tuple_as_trans_desc)
                    recompute_cost(align, sub_aligns_before, sub_aligns_after)
                    recompute_fitness(align, log[i], best_worst_cost)
                align["repair"] = True
        for a in alignments:
            if a.get("repair") is not None:
                a.pop("repair")
        return alignments

def get_train_test_log(split: Split):
    """Returns training_log and test_log"""
    if split.type == SplitTypes.SPLIT_SINGLE.value and Split.objects.filter(
            type=SplitTypes.SPLIT_DOUBLE.value,
            original_log=split.original_log,
            test_size=split.test_size,
            splitting_method=split.splitting_method
    ).exists() and split.splitting_method != SplitOrderingMethods.SPLIT_RANDOM.value:
        return get_train_test_log(Split.objects.filter(
            type=SplitTypes.SPLIT_DOUBLE.value,
            original_log=split.original_log,
            test_size=split.test_size,
            splitting_method=split.splitting_method
        )[0])
    elif split.original_log is not None and (not Split.objects.filter(
            type=SplitTypes.SPLIT_DOUBLE.value,
            original_log=split.original_log,
            test_size=split.test_size,
            splitting_method=split.splitting_method
    ).exists() or split.splitting_method == SplitOrderingMethods.SPLIT_RANDOM.value):
        training_log, test_log = _split_single_log(split)
        additional_columns = get_additional_columns(get_log(split.original_log))
        if split.splitting_method != SplitOrderingMethods.SPLIT_RANDOM.value:
            _ = Split.objects.get_or_create(
                type=SplitTypes.SPLIT_DOUBLE.value,
                original_log=split.original_log,
                test_size=split.test_size,
                splitting_method=split.splitting_method,
                train_log=create_log(EventLog(training_log), '0-' + str(100 - int(split.test_size * 100)) + '.xes'),
                test_log=create_log(EventLog(test_log), str(100 - int(split.test_size * 100)) + '-100.xes'),
                additional_columns=split.additional_columns
            )[0]
        logger.info("\t\tLoaded single log from {}".format(split.original_log.path))
    else:
        # Have to use sklearn to convert some internal data types
        training_log = get_log(split.train_log)
        additional_columns = get_additional_columns(training_log)
        if split.additional_columns is None:
            split.additional_columns = split.train_log.name + split.test_log.name + '_ac.xes'
            split.save()
        training_log, train_log_to_append = train_test_split(training_log, test_size=0, shuffle=False)
        test_log, test_log_to_append = train_test_split(get_log(split.test_log), test_size=0, shuffle=False)
        logger.info("\t\tLoaded double logs from {} and {}.".format(split.train_log.path, split.test_log.path))
    if len(training_log) == 0:
        raise TypeError("Training log is empty. Create a new Split with better parameters")
    return training_log, test_log, additional_columns

def get_log_with_log_prefixes(log, parameters=None):
    """
    Gets an extended log that contains, in order, all the prefixes for a case of the original log

    Parameters
    --------------
    log
        Original log
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    all_prefixes_log
        Log with all the prefixes
    change_indexes
        Indexes of the extended log where there was a change between cases
    """
    all_prefixes_log = EventLog()
    change_indexes = []
    for trace in log:
        cumulative_trace = Trace()
        for event in trace:
            all_prefixes_log.append(deepcopy(cumulative_trace))
            cumulative_trace.append(event)
        all_prefixes_log.append(deepcopy(cumulative_trace))
        change_indexes.append([len(all_prefixes_log) - 1] * len(trace))
    return all_prefixes_log, change_indexes

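# Sketch for get_log_with_log_prefixes (illustrative): a single trace <a, b, c>
# is expanded into the prefixes <>, <a>, <a, b> and the full trace <a, b, c>.
from pm4py.objects.log.obj import EventLog, Trace, Event

single_trace_log = EventLog([Trace([Event({"concept:name": act}) for act in ("a", "b", "c")])])
prefix_log, change_indexes = get_log_with_log_prefixes(single_trace_log)
assert [len(t) for t in prefix_log] == [0, 1, 2, 3]
assert change_indexes == [[3, 3, 3]]
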
def filter_on_case_performance(log, inf_perf, sup_perf, parameters=None):
    """
    Gets a filtered log keeping only traces that satisfy the given performance requirements

    Parameters
    ------------
    log
        Log
    inf_perf
        Lower bound on the performance
    sup_perf
        Upper bound on the performance
    parameters
        Parameters

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    filtered_log = EventLog([trace for trace in log if satisfy_perf(trace, inf_perf, sup_perf, timestamp_key)])
    return filtered_log

def replay_prediction_calculate(job: Job, log) -> (dict, dict):
    """Calculate the prediction for the log coming from replayers

    :param job: job configuration
    :param log: log model
    :return: runtime results
    """
    additional_columns = get_additional_columns(log)
    data_df, _ = train_test_split(log, test_size=0, shuffle=False)
    data_df, _ = encode_label_logs(data_df, EventLog(), job, additional_columns)
    results = MODEL[job.predictive_model.predictive_model][ModelActions.PREDICT.value](job, data_df)
    logger.info("End {} job {}, {} . Results {}".format('runtime', job.predictive_model.predictive_model,
                                                        get_run(job), results))
    results_dict = dict(zip(data_df['trace_id'], list(map(int, results))))
    events_for_trace = dict()
    data_encoder_decoder(job, data_df, EventLog())
    return results_dict, events_for_trace

def filter_log_by_variants_percentage(log, variants, variants_percentage=0.0):
    """
    Filter the log by variants percentage

    Parameters
    ----------
    log
        Log
    variants
        Dictionary with variant as the key and the list of traces as the value
    variants_percentage
        Percentage of variants that should be kept (the most common variant is always kept)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = EventLog()
    no_of_traces = len(log)
    variant_count = get_variants_sorted_by_count(variants)
    already_added_sum = 0
    for i in range(len(variant_count)):
        variant = variant_count[i][0]
        varcount = variant_count[i][1]
        percentage_already_added = already_added_sum / no_of_traces
        if already_added_sum == 0 or percentage_already_added < variants_percentage:
            for trace in variants[variant]:
                filtered_log.append(trace)
            already_added_sum = already_added_sum + varcount
    return filtered_log

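# Sketch for filter_log_by_variants_percentage, using pm4py's variants filter to
# build the variants dictionary (helper path and threshold are illustrative).
from pm4py.algo.filtering.log.variants import variants_filter
from pm4py.objects.log.obj import EventLog, Trace, Event

variant_log = EventLog([Trace([Event({"concept:name": act}) for act in v])
                        for v in ("ab", "ab", "ab", "ac")])
variants = variants_filter.get_variants(variant_log)
# The dominant variant <a, b> covers 75% of traces, which already exceeds the
# 0.5 threshold, so the single <a, c> trace is dropped
filtered = filter_log_by_variants_percentage(variant_log, variants, variants_percentage=0.5)
assert len(filtered) == 3
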
def update_merge(loglist):
    # Merge a list of event logs into a single log, preserving trace order
    mergedlog = EventLog()
    for log in loglist:
        for trace in log:
            mergedlog.append(trace)
    return mergedlog

def keep_one_trace_per_variant(log, parameters=None):
    """
    Keeps only one trace per variant (does not matter for basic inductive miner)

    Parameters
    --------------
    log
        Log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    new_log
        Log (with one trace per variant)
    """
    if parameters is None:
        parameters = {}
    variants = variants_module.get_variants(log, parameters=parameters)
    new_log = EventLog()
    for var in variants:
        new_log.append(variants[var][0])
    return new_log

def create_log(G, conn_comp, timestamps, max_comp_len=50, include_loops=False):
    # Build one trace per connected component; nodes are "attribute=value"
    # strings, topologically sorted after removing self-loops
    log = EventLog()
    for i in range(len(conn_comp)):
        if len(conn_comp[i]) <= max_comp_len:
            trace = Trace()
            trace.attributes["concept:name"] = str(i)
            SG = G.subgraph(conn_comp[i])
            SGG = networkx.DiGraph(SG)
            edges = list(SGG.edges)
            for e in edges:
                if e[0] == e[1]:
                    SGG.remove_edge(e[0], e[1])
            sorted_nodes = list(networkx.topological_sort(SGG))
            for n in sorted_nodes:
                selfloop = 1 if (n, n) in SG.edges else 0
                trace.append(Event({'time:timestamp': timestamps[n.split("=")[1]],
                                    'concept:name': n.split("=")[0],
                                    'value': n.split("=")[1],
                                    'typevalue': n,
                                    'selfloop': selfloop}))
                if include_loops and selfloop:
                    trace.append(Event({'time:timestamp': timestamps[n.split("=")[1]],
                                        'concept:name': n.split("=")[0],
                                        'value': n.split("=")[1],
                                        'typevalue': n,
                                        'selfloop': selfloop}))
            log.append(trace)
    log = sorting.sort_timestamp_log(log, "time:timestamp")
    return log

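# Sketch for create_log with illustrative data: nodes are "attribute=value"
# strings and `timestamps` maps each value to a datetime; a two-node component
# becomes a single two-event trace.
import datetime
import networkx

G = networkx.DiGraph()
G.add_edge("status=open", "status=closed")
timestamps = {"open": datetime.datetime(2021, 1, 1),
              "closed": datetime.datetime(2021, 1, 2)}
conn_comp = [list(c) for c in networkx.weakly_connected_components(G)]
graph_log = create_log(G, conn_comp, timestamps)
assert len(graph_log) == 1 and len(graph_log[0]) == 2
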
def apply_from_variants_list(var_list, parameters=None):
    """
    Discovers the log skeleton from the variants list

    Parameters
    ---------------
    var_list
        Variants list
    parameters
        Parameters

    Returns
    ---------------
    model
        Log skeleton model
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    variant_delimiter = exec_utils.get_param_value(Parameters.PARAMETER_VARIANT_DELIMITER, parameters,
                                                   constants.DEFAULT_VARIANT_SEP)
    log = EventLog()
    for cv in var_list:
        v = cv[0]
        tr = v.split(variant_delimiter)
        trace = Trace()
        for act in tr:
            trace.append(Event({activity_key: act}))
        log.append(trace)
    return apply(log, parameters=parameters)

def filter_traces_contained(log, dt1, dt2, parameters=None):
    """
    Get traces that are contained in the given interval

    Parameters
    -----------
    log
        Trace log
    dt1
        Lower bound to the interval
    dt2
        Upper bound to the interval
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp

    Returns
    ------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    filtered_log = EventLog([trace for trace in log if is_contained(trace, dt1, dt2, timestamp_key)])
    return filtered_log

def sample_log(log, no_traces=100):
    """
    Randomly sample a fixed number of traces from the original log

    Parameters
    -----------
    log
        Log
    no_traces
        Number of traces that the sample should have

    Returns
    -----------
    new_log
        Sampled log
    """
    new_log = EventLog(attributes=log.attributes, extensions=log.extensions, omni_present=log.omni_present,
                       classifiers=log.classifiers)
    # random.sample draws distinct indexes, so the sample contains exactly
    # min(no_traces, len(log)) traces
    sampled_indexes = random.sample(range(len(log)), min(no_traces, len(log)))
    for index in sampled_indexes:
        new_log.append(copy(log[index]))
    return new_log

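# Sketch for sample_log (illustrative): drawing distinct indexes yields exactly
# min(no_traces, len(log)) traces; seeding keeps the sketch reproducible.
import random
from pm4py.objects.log.obj import EventLog, Trace, Event

random.seed(42)
full_log = EventLog([Trace([Event({"concept:name": str(i)})]) for i in range(5)])
sampled = sample_log(full_log, no_traces=2)
assert len(sampled) == 2
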
def apply(log, admitted_variants, parameters=None):
    """
    Filter log keeping/removing only provided variants

    Parameters
    -----------
    log
        Log object
    admitted_variants
        Admitted variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Attribute identifying the activity in the log
            Parameters.POSITIVE -> Indicate if the provided variants should be kept (True) or removed (False)
    """
    if parameters is None:
        parameters = {}
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    variants = get_variants(log, parameters=parameters)
    log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                   omni_present=log.omni_present)
    for variant in variants:
        if (positive and variant in admitted_variants) or (not positive and variant not in admitted_variants):
            for trace in variants[variant]:
                log.append(trace)
    return log

def generate_log(pt, no_traces=100):
    """
    Generate a log out of a process tree

    Parameters
    ------------
    pt
        Process tree
    no_traces
        Number of traces to be generated

    Returns
    ------------
    log
        Trace log object
    """
    log = EventLog()
    for i in range(no_traces):
        ex_seq = execute(pt)
        ex_seq_labels = pt_util.project_execution_sequence_to_labels(ex_seq)
        trace = Trace()
        trace.attributes[xes.DEFAULT_NAME_KEY] = str(i)
        for label in ex_seq_labels:
            event = Event()
            event[xes.DEFAULT_NAME_KEY] = label
            trace.append(event)
        log.append(trace)
    return log

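# Sketch for generate_log: discover a process tree from a toy log with pm4py's
# simplified interface (discover_process_tree_inductive is available in pm4py 2.x;
# the call name is an assumption about the installed version), then simulate
# five traces from it.
import pm4py
from pm4py.objects.log.obj import EventLog, Trace, Event

seed_log = EventLog([Trace([Event({"concept:name": act}) for act in v])
                     for v in ("abc", "acb")])
tree = pm4py.discover_process_tree_inductive(seed_log)
simulated = generate_log(tree, no_traces=5)
assert len(simulated) == 5
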
def apply_tree_variants(variants, parameters=None):
    """
    Apply the IM_F algorithm to a dictionary of variants obtaining a process tree

    Parameters
    ----------
    variants
        Variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    log = EventLog()
    var_keys = list(variants.keys())
    for var in var_keys:
        # the variants dictionary may map each variant either to the list of its
        # traces or directly to its count
        val = variants[var]
        if type(val) is list:
            val = len(val)
        for i in range(val):
            trace = variants_util.variant_to_trace(var, parameters=parameters)
            log.append(trace)
    return apply_tree(log, parameters=parameters)

def apply_tree_variants(variants, parameters=None):
    """
    Apply the IM algorithm to a dictionary of variants obtaining a process tree

    Parameters
    ----------
    variants
        Variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    log = EventLog()
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    var_keys = list(variants.keys())
    for var in var_keys:
        trace = Trace()
        activities = var.split(constants.DEFAULT_VARIANT_SEP)
        for act in activities:
            trace.append(Event({activity_key: act}))
        log.append(trace)
    return apply_tree(log, parameters=parameters)

def filter_log_by_end_activities(end_activities, variants, vc, threshold, activity_key="concept:name"):
    """
    Keep only variants of the log with an end activity whose number of occurrences is above the threshold

    Parameters
    ----------
    end_activities
        Dictionary of end activities associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (remove variants having end activities whose number of occurrences is below the threshold)
    activity_key
        (If specified) Specify the activity key in the log (default concept:name)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = EventLog()
    # end activity of the most common variant, which is always kept
    fvea = variants[vc[0][0]][0][-1][activity_key]
    for variant in variants:
        vea = variants[variant][0][-1][activity_key]
        if vea in end_activities:
            if vea == fvea or end_activities[vea] >= threshold:
                for trace in variants[variant]:
                    filtered_log.append(trace)
    return filtered_log

def detect(log: EventLog, alphabet: Dict[str, int], act_key: str, use_msd: bool) -> Optional[str]:
    # Look for an activity that occurs in every trace and whose removal makes
    # the projected log splittable by one of the basic cuts
    candidates = set(alphabet.keys())
    for t in log:
        candidates = candidates.intersection(set(map(lambda e: e[act_key], t)))
    if len(candidates) == 0:
        return None
    for a in candidates:
        proj = EventLog()
        for t in log:
            proj.append(pm4py.filter_trace(lambda e: e[act_key] != a, t))
        if len(list(filter(lambda t: len(t) == 0, proj))) == 0:
            dfg_proj = discover_dfg.apply(proj, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            alphabet_proj = pm4py.get_attribute_values(proj, act_key)
            start_act_proj = get_starters.get_start_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            end_act_proj = get_ends.get_end_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            pre_proj, post_proj = dfg_utils.get_transitive_relations(dfg_proj, alphabet_proj)
            cut = sequence_cut.detect(alphabet_proj, pre_proj, post_proj)
            if cut is not None:
                return a
            cut = xor_cut.detect(dfg_proj, alphabet_proj)
            if cut is not None:
                return a
            if use_msd:
                msd_witnesses = msdw_algo.derive_msd_witnesses(
                    proj,
                    msd_algo.apply(log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key}),
                    parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            else:
                msd_witnesses = None
            cut = concurrent_cut.detect(dfg_proj, alphabet_proj, start_act_proj, end_act_proj, msd=msd_witnesses)
            if cut is not None:
                return a
            cut = loop_cut.detect(dfg_proj, alphabet_proj, start_act_proj, end_act_proj)
            if cut is not None:
                return a
    return None

def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing 1 case per variant (only
    control-flow perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.pandas import case_statistics
    if parameters is None:
        parameters = {}
    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    activity_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters \
        else xes.DEFAULT_NAME_KEY
    log = EventLog()
    for vd in variant_stats:
        variant = vd['variant'].split(",")
        trace = Trace()
        for activity in variant:
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        log.append(trace)
    return log

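# Sketch for the dataframe variant above (named `apply`, like several other
# functions in this collection): two cases sharing the variant <a, b> collapse
# into a single trace. Column names follow pm4py's dataframe defaults.
import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2", "c2"],
    "concept:name": ["a", "b", "a", "b"],
    "time:timestamp": pd.to_datetime(["2021-01-01", "2021-01-02",
                                      "2021-01-03", "2021-01-04"]),
})
variant_log = apply(df)
assert len(variant_log) == 1
assert [e["concept:name"] for e in variant_log[0]] == ["a", "b"]
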
def filter_cases_exceeding_specified_acti_performance(log, transition_performance, activity, lower_bound):
    """
    Filter cases exceeding the specified activity performance threshold

    Parameters
    ------------
    log
        Event log
    transition_performance
        Dictionary where each transition label is associated to performance measures
    activity
        Target activity (of the filter)
    lower_bound
        Lower bound (keep cases in which the duration of the activity exceeds this bound)

    Returns
    ------------
    filtered_log
        Filtered log
    """
    satisfying_indexes = get_idx_exceeding_specified_acti_performance(log, transition_performance, activity,
                                                                      lower_bound)
    new_log = EventLog(list(log[i] for i in satisfying_indexes))
    return new_log

def apply(log, admitted_start_activities, parameters=None):
    """
    Filter the log on the specified start activities

    Parameters
    -----------
    log
        Log
    admitted_start_activities
        Admitted start activities
    parameters
        Algorithm parameters

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    attribute_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
    filtered_log = EventLog([trace for trace in log
                             if trace and trace[0][attribute_key] in admitted_start_activities])
    return filtered_log

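# Sketch for the start-activities filter defined above (also named `apply`):
# keep only traces that begin with activity "a".
from pm4py.objects.log.obj import EventLog, Trace, Event

start_log = EventLog([Trace([Event({"concept:name": act}) for act in v])
                      for v in ("abc", "bca")])
filtered = apply(start_log, admitted_start_activities={"a"})
assert len(filtered) == 1
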
def apply(log, admitted_end_activities, parameters=None):
    """
    Filter the log on the specified end activities

    Parameters
    -----------
    log
        Log
    admitted_end_activities
        Admitted end activities
    parameters
        Algorithm parameters

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    attribute_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY)
    filtered_log = [trace for trace in log if trace and trace[-1][attribute_key] in admitted_end_activities]
    return EventLog(filtered_log)

def filter_traces_intersecting(log, dt1, dt2, parameters=None):
    """
    Filter traces intersecting the given interval

    Parameters
    -----------
    log
        Trace log
    dt1
        Lower bound to the interval
    dt2
        Upper bound to the interval
    parameters
        Possible parameters of the algorithm, including:
            timestamp_key -> Attribute to use as timestamp

    Returns
    ------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    filtered_log = EventLog([trace for trace in log if is_intersecting(trace, dt1, dt2, timestamp_key)])
    return filtered_log

def apply_from_variants_list(var_list, tree, parameters=None):
    """
    Apply the alignments from the specification of a list of variants in the log

    Parameters
    -------------
    var_list
        List of variants (for each item, the first entry is the variant itself,
        the second entry may be the number of cases)
    tree
        Process tree
    parameters
        Parameters of the algorithm

    Returns
    --------------
    dictio_alignments
        Dictionary that assigns to each variant its alignment
    """
    if parameters is None:
        parameters = {}
    dictio_alignments = {}
    log = EventLog()
    for varitem in var_list:
        trace = variants_util.variant_to_trace(varitem[0], parameters=parameters)
        log.append(trace)
    alignments = apply(log, tree, parameters=parameters)
    for index, varitem in enumerate(var_list):
        dictio_alignments[varitem[0]] = alignments[index]
    return dictio_alignments

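# Sketch of the var_list format expected by apply_from_variants_list: each item
# pairs a variant (comma-separated activities by default) with an optional case
# count. `tree` stands for a process tree discovered elsewhere (hypothetical
# here), so the call is left commented out.
var_list = [("a,b,c", 10), ("a,c", 3)]
# dictio_alignments = apply_from_variants_list(var_list, tree)
# dictio_alignments["a,b,c"] would then hold the alignment of that variant on the tree
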
def apply(log, admitted_variants, parameters=None):
    """
    Filter log keeping/removing only provided variants

    Parameters
    -----------
    log
        Log object
    admitted_variants
        Admitted variants
    parameters
        Parameters of the algorithm, including:
            activity_key -> Attribute identifying the activity in the log
            positive -> Indicate if the provided variants should be kept (True) or removed (False)
    """
    if parameters is None:
        parameters = {}
    positive = parameters["positive"] if "positive" in parameters else True
    variants = get_variants(log, parameters=parameters)
    log = EventLog()
    for variant in variants:
        if (positive and variant in admitted_variants) or (not positive and variant not in admitted_variants):
            for trace in variants[variant]:
                log.append(trace)
    return log