def apply(log: Union[EventLog, EventStream], activity: str, parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
    """
    Keeps, for each trace containing the given activity, only the suffix
    following its first (or last) occurrence.

    Parameters
    ----------------
    log
        Event log
    activity
        Target activity
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY => the activity.
        - Parameters.STRICT => applies the filter strictly (cuts the occurrences of the selected activity).
        - Parameters.FIRST_OR_LAST => decides if the first or last occurrence of an activity should be selected.

    Returns
    ----------------
    filtered_log
        Filtered event log
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST, parameters, "first")
    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)

    filtered_log = EventLog(attributes=log.attributes, extensions=log.extensions, globals=log.omni_present,
                            classifiers=log.classifiers, properties=log.properties)

    for trace in log:
        # positions where the target activity occurs (events lacking the
        # activity key contribute None, which matches only a None target)
        occurrence_idxs = [pos for pos, ev in enumerate(trace)
                           if (ev[activity_key] if activity_key in ev else None) == activity]
        if not occurrence_idxs:
            # traces without the activity are dropped entirely
            continue
        cut = occurrence_idxs[0] if first_or_last == "first" else occurrence_idxs[-1]
        if strict:
            # strict mode also removes the selected occurrence itself
            cut = cut + 1
        suffix_trace = Trace(attributes=trace.attributes, properties=trace.properties)
        for ev in list(trace)[cut:]:
            suffix_trace.append(ev)
        filtered_log.append(suffix_trace)

    return filtered_log
def import_from_context(context, num_traces, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context (stream of (event, element) pairs; presumably an
        lxml iterparse object, since getparent()/getprevious() are used —
        TODO confirm)
    num_traces
        Number of traces of the XES log (used only to size the progress bar)
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT, parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR, parameters, True)

    date_parser = dt_parser.get()

    # progress bar is optional: only shown when tqdm is installed and enabled
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces, desc="parsing log, completed traces :: ")

    # parser state: the log/trace/event currently being filled, plus a map
    # from open XML elements to the container their child attributes go into
    log = None
    trace = None
    event = None
    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            # container of the enclosing element, if any (attributes found
            # outside a known container are ignored)
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                # NOTE(review): unlike the other attribute branches, this one
                # does not guard on parent being non-None before parsing
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                # stop reading as soon as the trace budget is exhausted
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        # anything other than a case-insensitive "true" is False
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST) or elem.tag.endswith(xes_constants.TAG_CONTAINER):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                # an extension is recorded only when name, prefix and uri are all present
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    # the global's children are collected into a per-scope dict
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        # quoted keys may contain spaces: split on the quotes instead
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [x for x in classifier_value.split("'")
                                                                             if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]

            # free the parsed element (and its already-consumed siblings)
            # to keep memory bounded while streaming large files
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                event = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)
                if progress is not None:
                    progress.update()
                trace = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
def get_log_traces_until_activity(log, activity, parameters=None):
    """
    Gets a reduced version of the log containing, for each trace,
    only the events before a specified activity

    Parameters
    -------------
    log
        Trace log
    activity
        Activity to reach
    parameters
        Possible parameters of the algorithm, including:
            PARAMETER_CONSTANT_ACTIVITY_KEY -> activity
            PARAMETER_CONSTANT_TIMESTAMP_KEY -> timestamp
            "duration" -> event attribute carrying a precomputed duration (optional)
            "use_future_attributes" -> also append the events following the
            activity, stripped of their activity attribute (default: False)

    Returns
    -------------
    new_log
        New log
    traces_interlapsed_time_to_act
        For each kept trace, the elapsed time between the event preceding
        the first occurrence of the activity and the occurrence itself
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    duration_attribute = parameters["duration"] if "duration" in parameters else None
    use_future_attributes = parameters["use_future_attributes"] if "use_future_attributes" in parameters else False

    new_log = EventLog()
    traces_interlapsed_time_to_act = []

    for trace in log:
        # indexes of the events carrying the target activity
        # (already in ascending order; no explicit sort needed)
        ev_in_tr_w_act = [j for j in range(len(trace)) if trace[j][activity_key] == activity]
        # keep the trace only if the activity occurs and is not the very first event
        if ev_in_tr_w_act and ev_in_tr_w_act[0] > 0:
            first_occ = ev_in_tr_w_act[0]
            new_trace = Trace(trace[0:first_occ])
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            if duration_attribute is None:
                try:
                    curr_trace_interlapsed_time_to_act = trace[first_occ][timestamp_key].timestamp() - \
                                                         trace[first_occ - 1][timestamp_key].timestamp()
                except Exception:
                    # the attribute is not a datetime: fall back to a plain difference
                    # (was a bare except; narrowed so KeyboardInterrupt/SystemExit propagate)
                    curr_trace_interlapsed_time_to_act = trace[first_occ][timestamp_key] - \
                                                         trace[first_occ - 1][timestamp_key]
                    logging.error("timestamp_key not timestamp")
            else:
                curr_trace_interlapsed_time_to_act = trace[first_occ][duration_attribute]
            traces_interlapsed_time_to_act.append(curr_trace_interlapsed_time_to_act)
            if use_future_attributes:
                # append the events after the activity, with the activity
                # attribute removed so it cannot leak into later analyses
                for j in range(first_occ + 1, len(trace)):
                    new_ev = deepcopy(trace[j])
                    if activity_key in new_ev:
                        del new_ev[activity_key]
                    new_trace.append(new_ev)
            new_log.append(new_trace)

    return new_log, traces_interlapsed_time_to_act
def read_event(self):
    """
    Gets the next event from the iterator

    Consumes (tree_event, elem) pairs from self.context until an <event>
    element that satisfies self.acceptance_condition has been fully read,
    then returns it enriched with the enclosing trace's attributes.
    Returns None implicitly when the closing <log> tag is reached.

    Returns
    ------------
    event
        Event
    """
    # local alias of the element -> attribute-container map kept on self
    tree = self.tree
    while True:
        tree_event, elem = next(self.context)
        if tree_event == _EVENT_START:
            # container of the enclosing element, if any
            parent = tree[
                elem.getparent()] if elem.getparent() in tree else None
            if elem.tag.endswith(xes_constants.TAG_TRACE):
                self.trace = Trace()
                tree[elem] = self.trace.attributes
                self.reading_trace = True
                continue
            if elem.tag.endswith(xes_constants.TAG_EVENT):
                self.event = Event()
                tree[elem] = self.event
                self.reading_event = True
                continue
            # attributes are only parsed while inside a trace or event;
            # log-level attributes are skipped by this streaming reader
            if self.reading_event or self.reading_trace:
                if elem.tag.endswith(xes_constants.TAG_STRING):
                    if parent is not None:
                        tree = parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY),
                            elem.get(xes_constants.KEY_VALUE), tree)
                    continue
                elif elem.tag.endswith(xes_constants.TAG_DATE):
                    # NOTE(review): unlike the other branches, no guard on
                    # parent being non-None before parsing
                    try:
                        dt = self.date_parser.apply(
                            elem.get(xes_constants.KEY_VALUE))
                        tree = parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY),
                            dt, tree)
                    except TypeError:
                        logging.info(
                            "failed to parse date: " +
                            str(elem.get(xes_constants.KEY_VALUE)))
                    except ValueError:
                        logging.info(
                            "failed to parse date: " +
                            str(elem.get(xes_constants.KEY_VALUE)))
                    continue
                elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                    if parent is not None:
                        try:
                            val = float(elem.get(xes_constants.KEY_VALUE))
                            tree = parse_attribute(
                                elem, parent,
                                elem.get(xes_constants.KEY_KEY), val, tree)
                        except ValueError:
                            logging.info(
                                "failed to parse float: " +
                                str(elem.get(xes_constants.KEY_VALUE)))
                    continue
                elif elem.tag.endswith(xes_constants.TAG_INT):
                    if parent is not None:
                        try:
                            val = int(elem.get(xes_constants.KEY_VALUE))
                            tree = parse_attribute(
                                elem, parent,
                                elem.get(xes_constants.KEY_KEY), val, tree)
                        except ValueError:
                            logging.info(
                                "failed to parse int: " +
                                str(elem.get(xes_constants.KEY_VALUE)))
                    continue
                elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                    if parent is not None:
                        try:
                            val0 = elem.get(xes_constants.KEY_VALUE)
                            # anything other than a case-insensitive "true" is False
                            val = False
                            if str(val0).lower() == "true":
                                val = True
                            tree = parse_attribute(
                                elem, parent,
                                elem.get(xes_constants.KEY_KEY), val, tree)
                        except ValueError:
                            logging.info(
                                "failed to parse boolean: " +
                                str(elem.get(xes_constants.KEY_VALUE)))
                    continue
                elif elem.tag.endswith(xes_constants.TAG_LIST):
                    if parent is not None:
                        # lists have no value, hence we put None as a value
                        tree = parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY),
                            None, tree)
                    continue
                elif elem.tag.endswith(xes_constants.TAG_ID):
                    if parent is not None:
                        tree = parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY),
                            elem.get(xes_constants.KEY_VALUE), tree)
                    continue
        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            # free the parsed element (and already-consumed siblings)
            # to keep memory bounded while streaming
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass
            if elem.tag.endswith(xes_constants.TAG_EVENT):
                self.reading_event = False
                if self.acceptance_condition(self.event):
                    # copy the trace attributes onto the event, prefixed,
                    # so the event is self-contained for the consumer
                    for attr in self.trace.attributes:
                        self.event[constants.CASE_ATTRIBUTE_PREFIX +
                                   attr] = self.trace.attributes[attr]
                    return self.event
                continue
            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                self.reading_trace = False
                continue
            elif elem.tag.endswith(xes_constants.TAG_LOG):
                self.reading_log = False
                break
def apply(dfg, start_activities, end_activities, parameters=None):
    """
    Applies the playout algorithm on a DFG, extracting the most likely traces according to the DFG

    Parameters
    ---------------
    dfg
        *Complete* DFG
    start_activities
        Start activities
    end_activities
        End activities
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY => the activity key of the simulated log
        - Parameters.TIMESTAMP_KEY => the timestamp key of the simulated log
        - Parameters.MAX_NO_VARIANTS => the maximum number of variants generated by the method (default: 3000)
        - Parameters.MIN_WEIGHTED_PROBABILITY => the minimum overall weighted probability that makes the method stop
        (default: 1)
        - Parameters.MAX_NO_OCC_PER_ACTIVITY => the maximum number of occurrences per activity in the traces of the log
        (default: 2)
        - Parameters.INTERRUPT_SIMULATION_WHEN_DFG_COMPLETE => interrupts the simulation when the DFG of the simulated
        log has the same keys to the DFG of the original log (all behavior is contained) (default: False)
        - Parameters.ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG => adds a simulated trace to the simulated log only if it adds
        elements to the simulated DFG, e.g., it adds behavior; skip insertion otherwise (default: False)
        - Parameters.RETURN_VARIANTS => returns the traces as variants with a likely number of occurrences

    Returns
    ---------------
    simulated_log
        Simulated log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    max_no_variants = exec_utils.get_param_value(Parameters.MAX_NO_VARIANTS,
                                                 parameters, 3000)
    min_weighted_probability = exec_utils.get_param_value(
        Parameters.MIN_WEIGHTED_PROBABILITY, parameters, 1.0)
    interrupt_simulation_when_dfg_complete = exec_utils.get_param_value(
        Parameters.INTERRUPT_SIMULATION_WHEN_DFG_COMPLETE, parameters, False)
    add_trace_if_takes_new_els_to_dfg = exec_utils.get_param_value(
        Parameters.ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG, parameters, False)
    return_variants = exec_utils.get_param_value(Parameters.RETURN_VARIANTS,
                                                 parameters, False)

    # keep track of the DFG, start activities and end activities of the (ongoing) simulation
    simulated_traces_dfg = set()
    simulated_traces_sa = set()
    simulated_traces_ea = set()
    interrupt_break_condition = False
    overall_probability = 0.0

    final_traces = []

    # traces arrive from get_traces together with their probability p
    for tr, p in get_traces(dfg,
                            start_activities,
                            end_activities,
                            parameters=parameters):
        # stop when the simulated DFG is complete (if requested), or when the
        # variant budget / cumulative probability target has been reached
        if (interrupt_simulation_when_dfg_complete and interrupt_break_condition
            ) or not (len(final_traces) < max_no_variants
                      and overall_probability <= min_weighted_probability):
            break
        overall_probability += p
        # elements of this trace not yet covered by the simulation
        diff_sa = {tr[0]}.difference(simulated_traces_sa)
        diff_ea = {tr[-1]}.difference(simulated_traces_ea)
        diff_dfg = {(tr[i], tr[i + 1])
                    for i in range(len(tr) - 1)
                    }.difference(simulated_traces_dfg)
        adds_something = len(diff_sa) > 0 or len(diff_ea) > 0 or len(
            diff_dfg) > 0
        if add_trace_if_takes_new_els_to_dfg and not adds_something:
            # interrupt the addition if the ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG is set to True,
            # and the trace does not really change the information on the DFG, start activities,
            # end activities
            continue
        # update the start activities, end activities, DFG of the original log
        simulated_traces_sa = simulated_traces_sa.union(diff_sa)
        simulated_traces_ea = simulated_traces_ea.union(diff_ea)
        simulated_traces_dfg = simulated_traces_dfg.union(diff_dfg)
        # memorize the difference between the original DFG and the DFG of the simulated log
        diff_original_sa = set(start_activities).difference(
            simulated_traces_sa)
        diff_original_ea = set(end_activities).difference(simulated_traces_ea)
        diff_original_dfg = set(dfg).difference(simulated_traces_dfg)
        interrupt_break_condition = len(diff_original_sa) == 0 and len(
            diff_original_ea) == 0 and len(diff_original_dfg) == 0

        # negated probability so that sorted() yields most likely traces first
        final_traces.append((-p, tr))

        if interrupt_simulation_when_dfg_complete and interrupt_break_condition:
            break

    # make sure that the traces are strictly ordered by their probability
    # (generally, the order is already pretty good, since the states are visited in the queue based on their order,
    # but not always 100% consistent)
    final_traces = sorted(final_traces)

    if return_variants:
        # returns the variants instead of the log
        variants = []
        for p, tr in final_traces:
            variants.append({
                "variant": ",".join(tr),
                "count": math.ceil(-p * max_no_variants)
            })
        return variants
    else:
        event_log = EventLog()
        # assigns to each event an increased timestamp from 1970
        curr_timestamp = 10000000
        for index, tr in enumerate(final_traces):
            log_trace = Trace(
                attributes={
                    xes_constants.DEFAULT_TRACEID_KEY: str(index),
                    "probability": -tr[0]
                })
            for act in tr[1]:
                log_trace.append(
                    Event({
                        activity_key: act,
                        timestamp_key:
                        datetime.datetime.fromtimestamp(curr_timestamp)
                    }))
                # increases by 1 second
                curr_timestamp += 1
            event_log.append(log_trace)
        return event_log
def to_lifecycle(log, parameters=None):
    """
    Converts a log from interval format (e.g. an event has two timestamps)
    to lifecycle format (an event has only a timestamp, and a transition lifecycle)

    Each interval event is split into a "start" and a "complete" lifecycle
    event; the resulting events of a trace are re-sorted by timestamp.
    If the log is already in lifecycle format (detected via the PM4PY_TYPE
    attribute or the presence of the transition key on the first event),
    it is returned unchanged.

    Parameters
    -------------
    log
        Log (expressed in the interval format)
    parameters
        Possible parameters of the method (activity, timestamp key, start timestamp key, transition ...)

    Returns
    -------------
    log
        Lifecycle event log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes.DEFAULT_START_TIMESTAMP_KEY)
    transition_key = exec_utils.get_param_value(Parameters.TRANSITION_KEY,
                                                parameters,
                                                xes.DEFAULT_TRANSITION_KEY)

    if log is not None and len(log) > 0:
        # already lifecycle: nothing to do
        if "PM4PY_TYPE" in log.attributes and log.attributes[
                "PM4PY_TYPE"] == "lifecycle":
            return log
        if log[0] is not None and len(log[0]) > 0:
            first_event = log[0][0]
            if transition_key in first_event:
                return log

        new_log = EventLog(attributes=copy(log.attributes),
                           extensions=copy(log.extensions),
                           classifiers=copy(log.classifiers),
                           omni_present=copy(log.omni_present),
                           properties=copy(log.properties))
        new_log.attributes["PM4PY_TYPE"] = "lifecycle"

        for trace in log:
            new_trace = Trace()
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            list_events = []
            for index, event in enumerate(trace):
                new_event_start = Event()
                new_event_complete = Event()
                # copy every attribute except the two timestamps,
                # which are redistributed between the two new events
                for attr in event:
                    if not attr == timestamp_key and not attr == start_timestamp_key:
                        new_event_start[attr] = event[attr]
                        new_event_complete[attr] = event[attr]
                new_event_start[timestamp_key] = event[start_timestamp_key]
                new_event_start[transition_key] = "start"
                # helper attributes used below to obtain a stable sort
                # (start before complete for the same origin event)
                new_event_start["@@custom_lif_id"] = 0
                new_event_start["@@origin_ev_idx"] = index
                new_event_complete[timestamp_key] = event[timestamp_key]
                new_event_complete[transition_key] = "complete"
                new_event_complete["@@custom_lif_id"] = 1
                new_event_complete["@@origin_ev_idx"] = index
                list_events.append(new_event_start)
                list_events.append(new_event_complete)
            list_events = sorted(
                list_events,
                key=lambda x: (x[timestamp_key], x["@@origin_ev_idx"],
                               x["@@custom_lif_id"]))
            for ev in list_events:
                new_trace.append(ev)
            new_log.append(new_trace)
        return new_log
    return log
def to_interval(log, parameters=None):
    """
    Converts a log to interval format (e.g. an event has two timestamps)
    from lifecycle format (an event has only a timestamp, and a transition lifecycle)

    "start" events are queued per (activity, instance) pair; each "complete"
    event is matched FIFO with the oldest pending start to form one interval
    event. A "complete" without a matching "start" yields a zero-duration
    interval. If the log is already in interval format (detected via the
    PM4PY_TYPE attribute or the presence of the start timestamp key on the
    first event), it is returned unchanged.

    Parameters
    -------------
    log
        Log (expressed in the lifecycle format)
    parameters
        Possible parameters of the method (activity, timestamp key, start timestamp key, transition ...)

    Returns
    -------------
    log
        Interval event log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes.DEFAULT_START_TIMESTAMP_KEY)
    transition_key = exec_utils.get_param_value(Parameters.TRANSITION_KEY,
                                                parameters,
                                                xes.DEFAULT_TRANSITION_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes.DEFAULT_NAME_KEY)
    lifecycle_instance_key = exec_utils.get_param_value(
        Parameters.LIFECYCLE_INSTANCE_KEY, parameters,
        xes.DEFAULT_INSTANCE_KEY)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS,
                                                parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters,
                                            [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters,
                                          [6, 7])

    if log is not None and len(log) > 0:
        # already interval: nothing to do
        if "PM4PY_TYPE" in log.attributes and log.attributes[
                "PM4PY_TYPE"] == "interval":
            return log
        if log[0] is not None and len(log[0]) > 0:
            first_event = log[0][0]
            if start_timestamp_key in first_event:
                return log

        new_log = EventLog(attributes=copy(log.attributes),
                           extensions=copy(log.extensions),
                           classifiers=copy(log.classifiers),
                           omni_present=copy(log.omni_present),
                           properties=copy(log.properties))
        new_log.attributes["PM4PY_TYPE"] = "interval"
        new_log.properties[
            constants.
            PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = xes.DEFAULT_START_TIMESTAMP_KEY

        for trace in log:
            new_trace = Trace()
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            # pending "start" events, keyed by (activity, lifecycle instance)
            activities_start = {}
            for event in trace:
                activity = event[activity_key]
                instance = event[
                    lifecycle_instance_key] if lifecycle_instance_key in event else None
                activity = (activity, instance)
                # events without a transition default to "complete"
                transition = event[
                    transition_key] if transition_key in event else "complete"
                timestamp = event[timestamp_key]
                if transition.lower() == "start":
                    if activity not in activities_start:
                        activities_start[activity] = list()
                    activities_start[activity].append(event)
                elif transition.lower() == "complete":
                    start_event = None
                    # without a matching start, the interval degenerates to
                    # [timestamp, timestamp] (zero duration)
                    start_timestamp = event[timestamp_key]
                    if activity in activities_start and len(
                            activities_start[activity]) > 0:
                        # FIFO match with the oldest pending start
                        start_event = activities_start[activity].pop(0)
                        start_timestamp = start_event[timestamp_key]
                    new_event = Event()
                    for attr in event:
                        if not attr == timestamp_key and not attr == transition_key:
                            new_event[attr] = event[attr]
                    if start_event is not None:
                        # keep the start event's own attributes, prefixed
                        for attr in start_event:
                            if not attr == timestamp_key and not attr == transition_key:
                                new_event["@@startevent_" +
                                          attr] = start_event[attr]
                    new_event[start_timestamp_key] = start_timestamp
                    new_event[timestamp_key] = timestamp
                    new_event["@@duration"] = (
                        timestamp - start_timestamp).total_seconds()
                    if business_hours:
                        # timezone info is dropped before the business-hours
                        # computation (BusinessHours expects naive datetimes
                        # — TODO confirm)
                        bh = BusinessHours(
                            start_timestamp.replace(tzinfo=None),
                            timestamp.replace(tzinfo=None),
                            worktiming=worktiming,
                            weekends=weekends)
                        new_event["@@approx_bh_duration"] = bh.getseconds()
                    new_trace.append(new_event)
            new_trace = sorting.sort_timestamp_trace(new_trace,
                                                     start_timestamp_key)
            new_log.append(new_trace)
        return new_log
    return log
def empty_sequence_accepted(pt: ProcessTree) -> bool:
    """Check whether the process tree accepts the empty trace.

    The tree is aligned against an empty trace; it accepts the empty
    sequence exactly when the optimal alignment costs less than a single
    standard model/log move.
    """
    empty_trace_alignment = calculate_optimal_alignment(pt, Trace())
    cost = empty_trace_alignment["cost"]
    return cost < STD_MODEL_LOG_MOVE_COST
def concatenate_traces(t1: Trace, t2: Trace) -> Trace:
    """In-place concatenation of two traces.

    Every event of ``t2`` is appended to ``t1`` (``t1`` is mutated,
    ``t2`` is left untouched).

    Returns
    -------
    t1
        The first trace, extended with the events of the second one.
    """
    for ev in t2:
        t1.append(ev)
    return t1
def apply(log, net, im, fm, parameters=None):
    """
    Performs a Monte Carlo simulation of an accepting Petri net without duplicate transitions and where the preset is
    always distinct from the postset (FIFO variant; the semaphores pile up if waiting is needed, and the first in is
    the first to win the semaphore)

    Parameters
    -------------
    log
        Event log
    net
        Accepting Petri net without duplicate transitions and where the preset is always distinct from the postset
    im
        Initial marking
    fm
        Final marking
    parameters
        Parameters of the algorithm:
            PARAM_NUM_SIMULATIONS => (default: 100)
            PARAM_FORCE_DISTRIBUTION => Force a particular stochastic distribution (e.g. normal) when the stochastic map
            is discovered from the log (default: None; no distribution is forced)
            PARAM_ENABLE_DIAGNOSTICS => Enable the printing of diagnostics (default: True)
            PARAM_DIAGN_INTERVAL => Interval of time in which diagnostics of the simulation are printed (default: 32)
            PARAM_CASE_ARRIVAL_RATIO => Case arrival of new cases (default: None; inferred from the log)
            PARAM_PROVIDED_SMAP => Stochastic map that is used in the simulation (default: None; inferred from the log)
            PARAM_MAP_RESOURCES_PER_PLACE => Specification of the number of resources available per place
            (default: None; each place gets the default number of resources)
            PARAM_DEFAULT_NUM_RESOURCES_PER_PLACE => Default number of resources per place when not specified
            (default: 1; each place gets 1 resource and has to wait for the resource to finish)
            PARAM_SMALL_SCALE_FACTOR => Scale factor for the sleeping time of the actual simulation
            (default: 864000.0, 10gg)
            PARAM_MAX_THREAD_EXECUTION_TIME => Maximum execution time per thread (default: 60.0, 1 minute)

    Returns
    ------------
    simulated_log
        Simulated event log
    simulation_result
        Result of the simulation:
            Outputs.OUTPUT_PLACES_INTERVAL_TREES => inteval trees that associate to each place the times in which it was occupied.
            Outputs.OUTPUT_TRANSITIONS_INTERVAL_TREES => interval trees that associate to each transition the intervals
            of time in which it could not fire because some token was in the output.
            Outputs.OUTPUT_CASES_EX_TIME => Throughput time of the cases included in the simulated log
            Outputs.OUTPUT_MEDIAN_CASES_EX_TIME => Median of the throughput times
            Outputs.OUTPUT_CASE_ARRIVAL_RATIO => Case arrival ratio that was specified in the simulation
            Outputs.OUTPUT_TOTAL_CASES_TIME => Total time occupied by cases of the simulated log
    """
    if parameters is None:
        parameters = {}

    from intervaltree import IntervalTree

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    no_simulations = exec_utils.get_param_value(
        Parameters.PARAM_NUM_SIMULATIONS, parameters, 100)
    force_distribution = exec_utils.get_param_value(
        Parameters.PARAM_FORCE_DISTRIBUTION, parameters, None)
    enable_diagnostics = exec_utils.get_param_value(
        Parameters.PARAM_ENABLE_DIAGNOSTICS, parameters, True)
    diagn_interval = exec_utils.get_param_value(
        Parameters.PARAM_DIAGN_INTERVAL, parameters, 32.0)
    case_arrival_ratio = exec_utils.get_param_value(
        Parameters.PARAM_CASE_ARRIVAL_RATIO, parameters, None)
    smap = exec_utils.get_param_value(Parameters.PARAM_PROVIDED_SMAP,
                                      parameters, None)
    resources_per_places = exec_utils.get_param_value(
        Parameters.PARAM_MAP_RESOURCES_PER_PLACE, parameters, None)
    default_num_resources_per_places = exec_utils.get_param_value(
        Parameters.PARAM_DEFAULT_NUM_RESOURCES_PER_PLACE, parameters, 1)
    small_scale_factor = exec_utils.get_param_value(
        Parameters.PARAM_SMALL_SCALE_FACTOR, parameters, 864000)
    max_thread_exec_time = exec_utils.get_param_value(
        Parameters.PARAM_MAX_THREAD_EXECUTION_TIME, parameters, 60.0)

    if case_arrival_ratio is None:
        # no explicit ratio: infer the average case arrival from the log
        case_arrival_ratio = case_arrival.get_case_arrival_avg(
            log, parameters=parameters)
    if resources_per_places is None:
        resources_per_places = {}

    logging.basicConfig()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    places_interval_trees = {}
    transitions_interval_trees = {}
    cases_ex_time = []
    # simulation index -> trace being produced by the corresponding thread
    list_cases = {}

    for place in net.places:
        # assign a semaphore to each place.
        if place in resources_per_places:
            place.semaphore = Semaphore(resources_per_places[place])
        else:
            # if the user does not specify the number of resources per place,
            # the default number is used
            place.semaphore = Semaphore(default_num_resources_per_places)
        place.assigned_time = []
        places_interval_trees[place] = IntervalTree()
    for trans in net.transitions:
        transitions_interval_trees[trans] = IntervalTree()

    # when the user does not specify any map from transitions to random variables,
    # a replay operation is performed
    if smap is None:
        if enable_diagnostics:
            logger.info(str(time()) + " started the replay operation.")
        if force_distribution is not None:
            smap = replay.get_map_from_log_and_net(
                log,
                net,
                im,
                fm,
                force_distribution=force_distribution,
                parameters=parameters)
        else:
            smap = replay.get_map_from_log_and_net(log,
                                                   net,
                                                   im,
                                                   fm,
                                                   parameters=parameters)
        if enable_diagnostics:
            logger.info(str(time()) + " ended the replay operation.")

    # the start timestamp is set to 1000000 instead of 0 to avoid problems with 32 bit machines
    start_time = 1000000
    threads = []
    for i in range(no_simulations):
        list_cases[i] = Trace()
        t = SimulationThread(i, net, im, fm, smap, start_time,
                             places_interval_trees,
                             transitions_interval_trees, cases_ex_time,
                             list_cases, enable_diagnostics, diagn_interval,
                             small_scale_factor, max_thread_exec_time)
        t.start()
        threads.append(t)
        start_time = start_time + case_arrival_ratio
        # wait a factor before opening a thread and the next one
        sleep(case_arrival_ratio / small_scale_factor)

    for t in threads:
        t.join()

    # discard the results of threads that did not terminate correctly;
    # manual index loop because items are deleted from three parallel
    # collections while scanning
    i = 0
    while i < len(threads):
        if threads[i].terminated_correctly is False:
            del list_cases[threads[i].id]
            del threads[i]
            del cases_ex_time[i]
            continue
        i = i + 1

    if enable_diagnostics:
        logger.info(str(time()) + " ended the Monte carlo simulation.")

    log = EventLog(list(list_cases.values()))
    min_timestamp = log[0][0][timestamp_key].timestamp()
    max_timestamp = max(y[timestamp_key].timestamp() for x in log for y in x)

    transitions_interval_trees = {
        t.name: y
        for t, y in transitions_interval_trees.items()
    }

    return log, {
        Outputs.OUTPUT_PLACES_INTERVAL_TREES.value: places_interval_trees,
        Outputs.OUTPUT_TRANSITIONS_INTERVAL_TREES.value:
        transitions_interval_trees,
        Outputs.OUTPUT_CASES_EX_TIME.value: cases_ex_time,
        Outputs.OUTPUT_MEDIAN_CASES_EX_TIME.value: median(cases_ex_time),
        Outputs.OUTPUT_CASE_ARRIVAL_RATIO.value: case_arrival_ratio,
        Outputs.OUTPUT_TOTAL_CASES_TIME.value: max_timestamp - min_timestamp
    }
def filter_log_by_paths(log, paths, variants, vc, threshold, attribute_key="concept:name"):
    """
    Keep only paths which number of occurrences is above the threshold (or they belong to the first variant)

    Parameters
    ----------
    log
        Log
    paths
        Dictionary of paths associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (remove paths which number of occurrences is below the threshold)
    attribute_key
        (If specified) Specify the attribute key to use (default concept:name)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions,
                            classifiers=log.classifiers, omni_present=log.omni_present, properties=log.properties)

    # paths of the first (most common) variant are always admitted,
    # regardless of the threshold
    fvft = variants[vc[0][0]][0]
    fvp = set()
    for i in range(0, len(fvft) - 1):
        path = fvft[i][attribute_key] + "," + fvft[i + 1][attribute_key]
        fvp.add(path)

    for trace in log:
        new_trace = Trace()
        jj = 0
        if len(trace) > 0:
            # the first event is always kept
            new_trace.append(trace[0])
            for j in range(1, len(trace) - 1):
                jj = j
                # (removed an unreachable `if j >= len(trace): break` guard:
                # j is bounded by len(trace) - 2 inside this range)
                if attribute_key in trace[j] and attribute_key in trace[j + 1]:
                    path = trace[j][attribute_key] + "," + trace[j + 1][attribute_key]
                    if path in paths:
                        # keep the pair when the path is frequent enough
                        # or belongs to the first variant
                        if path in fvp or paths[path] >= threshold:
                            new_trace.append(trace[j])
                            new_trace.append(trace[j + 1])
        # jj never reaches len(trace), so the last event of multi-event
        # traces is always (re-)appended
        if len(trace) > 1 and not jj == len(trace):
            new_trace.append(trace[-1])
        if len(new_trace) > 0:
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            filtered_log.append(new_trace)
    return filtered_log
def preprocessing(log, parameters=None):
    """
    Preprocessing step for the Alpha+ algorithm: removes from the log all
    activities that form a loop of length one (an activity directly followed
    by itself) and records the context of those loops.

    NOTE: the input log is mutated in place — an artificial start and end
    activity is inserted into every trace (loops are not allowed at the
    source place, according to the paper).

    Parameters
    ------------
    log
        Event log
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute to be used as activity

    Returns
    -------------
    filtered_log
        Log (of activity names) without the length-one-loop activities
    loop_one_list
        List of the activities having a loop of length one
    A_filtered
        Dictionary: set of activities occurring directly before each
        length-one-loop activity
    B_filtered
        Dictionary: set of activities occurring directly after each
        length-one-loop activity
    loops_in_first_place
        Loops in source place
    loops_in_last_place
        Loops in sink place
    """
    loops_in_first_place = set()
    loops_in_last_place = set()

    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_util.DEFAULT_NAME_KEY)

    # list of activity values that have a loop of length one
    loop_one_list = []
    # log without the activities that have a loop of length one
    filtered_log = EventLog()
    # dictionary A: activities directly before the loop-length-one activity
    A = {}
    # dictionary B: activities directly after the loop-length-one activity
    B = {}
    A_filtered = {}
    B_filtered = {}

    # inserting artificial start and end activity, since it is not allowed to have a loop at the source place
    # (according to the paper)
    for trace in log:
        trace.insert(0, {activity_key: 'artificial_start'})
        trace.append({activity_key: 'artificial_end'})

    # detect every activity that is directly followed by itself
    for trace in log:
        i = 0
        while i < len(trace) - 1:
            current = trace[i][activity_key]
            successor = trace[i + 1][activity_key]
            if current == successor:
                if current not in loop_one_list:
                    loop_one_list.append(current)
            i += 1

    for trace in log:
        i = 0
        filtered_trace = Trace()
        while i < len(trace) - 1:
            current = trace[i][activity_key]
            successor = trace[i + 1][activity_key]
            if current not in loop_one_list:
                # the filtered trace stores activity names (strings), not events
                filtered_trace.append(current)
            if successor in loop_one_list:
                if current not in loop_one_list:
                    # BUG FIX: the guard previously tested "current in A" while
                    # appending to A[successor]; since a non-loop activity can never
                    # be a key of A, the list was overwritten on every occurrence and
                    # only the last predecessor was kept
                    if successor in A:
                        A[successor].append(current)
                    else:
                        A[successor] = [current]
            if current in loop_one_list:
                if successor not in loop_one_list:
                    if current in B:
                        B[current].append(successor)
                    else:
                        B[current] = [successor]
            if i == len(trace) - 2:
                # flush the final activity of the trace, unless it loops
                if successor not in loop_one_list:
                    filtered_trace.append(successor)
            i += 1
        filtered_log.append(filtered_trace)

    # making sets instead of lists
    for key, value in A.items():
        A_filtered[key] = set(value)
    # making sets instead of lists
    for key, value in B.items():
        B_filtered[key] = set(value)

    # NOTE(review): trace[0] / trace[-1] are event objects while loop_one_list holds
    # activity strings, so these membership tests look like they can never succeed —
    # behavior preserved, confirm against callers before changing
    for trace in log:
        if trace[0] in loop_one_list:
            loops_in_first_place.add(trace[0])
        if trace[len(trace) - 1] in loop_one_list:
            loops_in_last_place.add(trace[len(trace) - 1])

    loops_in_first_place = list(loops_in_first_place)
    loops_in_last_place = list(loops_in_last_place)

    return (filtered_log, loop_one_list, A_filtered, B_filtered, loops_in_first_place, loops_in_last_place)
def __approximate_alignment_on_parallel(pt: ProcessTree, trace: Trace, a_sets: Dict[ProcessTree, Set[str]],
                                        sa_sets: Dict[ProcessTree, Set[str]],
                                        ea_sets: Dict[ProcessTree, Set[str]], tau_flags: Dict[ProcessTree, bool],
                                        tl: int, th: int,
                                        parameters=None):
    """
    Approximates an alignment of the given trace on a process tree whose root is a
    PARALLEL operator.

    An ILP assigns every trace activity to exactly one child subtree, minimizing the
    mismatch with the child's activity / start-activity / end-activity sets; the
    trace is then split accordingly and each child is aligned recursively via
    __approximate_alignment_for_trace.

    Parameters
    ----------
    pt
        Process tree (root must be a PARALLEL operator with at least one child)
    trace
        Non-empty trace to align
    a_sets, sa_sets, ea_sets
        Per-subtree activity, start-activity and end-activity sets
    tau_flags
        Per-subtree flag: True if the subtree can produce the empty trace
    tl, th
        Trace-length thresholds steering the recursive approximation
    parameters
        Parameters of the algorithm (Parameters.ACTIVITY_KEY)

    Returns
    -------
    Alignment (list of moves), or None if a recursive alignment did not terminate
    correctly.
    """
    if parameters is None:
        parameters = {}

    from pulp import lpSum, LpVariable, LpProblem, LpMinimize

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY)

    assert pt.operator == Operator.PARALLEL
    assert len(pt.children) > 0
    assert len(trace) > 0

    ilp = LpProblem(sense=LpMinimize)

    # x_i_j = 1 <=> assigns activity i to subtree j
    x_variables: Dict[int, Dict[int, LpVariable]] = {}
    # s_i_j = 1 <=> activity i is a start activity in the current sub-trace assigned to subtree j
    s_variables: Dict[int, Dict[int, LpVariable]] = {}
    # e_i_j = 1 <=> activity i is an end activity in the current sub-trace assigned to subtree j
    e_variables: Dict[int, Dict[int, LpVariable]] = {}
    # auxiliary u_j <=> u_j=1 if an activity is assigned to subtree j
    u_variables: Dict[int, LpVariable] = {}
    # v_i_j = 1 <=> activity i is neither a start nor end-activity in the current sub-trace assigned to subtree j
    v_variables: Dict[int, Dict[int, LpVariable]] = {}

    s_costs = {}
    e_costs = {}
    u_costs = {}
    v_costs = {}

    for i, a in enumerate(trace):
        x_variables[i] = {}
        s_variables[i] = {}
        s_costs[i] = {}
        e_variables[i] = {}
        e_costs[i] = {}
        v_variables[i] = {}
        v_costs[i] = {}
        for j, subtree in enumerate(pt.children):
            x_variables[i][j] = LpVariable('x_' + str(i) + '_' + str(j), cat='Binary')
            s_variables[i][j] = LpVariable('s_' + str(i) + '_' + str(j), cat='Binary')
            s_costs[i][j] = 0 if a[activity_key] in sa_sets[subtree] else 1
            e_variables[i][j] = LpVariable('e_' + str(i) + '_' + str(j), cat='Binary')
            e_costs[i][j] = 0 if a[activity_key] in ea_sets[subtree] else 1
            v_variables[i][j] = LpVariable('v_' + str(i) + '_' + str(j), cat='Binary')
            v_costs[i][j] = 0 if a[activity_key] in a_sets[subtree] else 1

    for j in range(len(pt.children)):
        u_variables[j] = LpVariable('u_' + str(j), cat='Binary')
        # define costs to not assign anything to subtree j
        if tau_flags[pt.children[j]]:
            u_costs[j] = 0
        elif sa_sets[pt.children[j]] & ea_sets[pt.children[j]]:
            # intersection of start-activities and end-activities is not empty
            u_costs[j] = 1
        else:
            # intersection of start-activities and end-activities is empty
            u_costs[j] = 2

    # objective function
    ilp += lpSum(
        [v_variables[i][j] * v_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [s_variables[i][j] * s_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [e_variables[i][j] * e_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [(1 - u_variables[j]) * u_costs[j] for j in range(len(pt.children))]), "objective_function"

    # constraints
    for i in range(len(trace)):
        # every activity is assigned to one subtree
        ilp += lpSum([x_variables[i][j] * 1 for j in range(len(pt.children))]) == 1

    for j in range(len(pt.children)):
        # first activity is a start activity
        ilp += x_variables[0][j] <= s_variables[0][j]
        # last activity is an end-activity
        ilp += x_variables[len(trace) - 1][j] <= e_variables[len(trace) - 1][j]

    # define s_i_j variables
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            ilp += s_variables[i][j] <= x_variables[i][j]
            # a start activity has no earlier activity assigned to the same subtree
            for k in range(i):
                ilp += s_variables[i][j] <= 1 - x_variables[k][j]
        # activity can be only a start-activity for one subtree
        ilp += lpSum(s_variables[i][j] for j in range(len(pt.children))) <= 1

    # define e_i_j variables
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            ilp += e_variables[i][j] <= x_variables[i][j]
            # an end activity has no later activity assigned to the same subtree
            for k in range(i + 1, len(trace)):
                ilp += e_variables[i][j] <= 1 - x_variables[k][j]
        # activity can be only an end-activity for one subtree
        ilp += lpSum(e_variables[i][j] for j in range(len(pt.children))) <= 1

    for j in range(len(pt.children)):
        for i in range(len(trace)):
            # define u_j variables
            ilp += u_variables[j] >= x_variables[i][j]
        # if u_j variable = 1 ==> a start activity must exist
        ilp += u_variables[j] <= lpSum(s_variables[i][j] for i in range(len(trace)))
        # if u_j variable = 1 ==> an end activity must exist
        ilp += u_variables[j] <= lpSum(e_variables[i][j] for i in range(len(trace)))

    # define v_i_j variables
    # BUG FIX: this loop previously iterated "for j in range(2)" and therefore only
    # constrained the v variables of the first two subtrees; for PARALLEL nodes with
    # more than two children the remaining v variables were unconstrained, letting the
    # solver drop the cost of intermediate activities assigned to those subtrees
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            ilp += v_variables[i][j] >= 1 - s_variables[i][j] + 1 - e_variables[i][j] + x_variables[i][j] - 2
            ilp += v_variables[i][j] <= x_variables[i][j]
            ilp += v_variables[i][j] <= 1 - e_variables[i][j]
            ilp += v_variables[i][j] <= 1 - s_variables[i][j]

    status = ilp.solve()
    assert status == 1

    # trace_parts list contains trace parts mapped onto the determined subtree
    trace_parts: List[Tuple[ProcessTree, Trace]] = []
    last_subtree: ProcessTree = None
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            subtree = pt.children[j]
            if x_variables[i][j].varValue == 1:
                if last_subtree and subtree == last_subtree:
                    # extend the current part while consecutive activities share a subtree
                    trace_parts[-1][1].append(trace[i])
                else:
                    assert last_subtree is None or subtree != last_subtree
                    t = Trace()
                    t.append(trace[i])
                    trace_parts.append((subtree, t))
                last_subtree = subtree
                continue

    # calculate an alignment for each subtree
    alignments_per_subtree: Dict[ProcessTree] = {}
    for j in range(len(pt.children)):
        subtree: ProcessTree = pt.children[j]
        # concatenate the parts mapped onto this subtree into one sub-trace
        sub_trace = Trace()
        for trace_part in trace_parts:
            if subtree == trace_part[0]:
                sub_trace = concatenate_traces(sub_trace, trace_part[1])
        align_result = __approximate_alignment_for_trace(subtree, a_sets, sa_sets, ea_sets, tau_flags, sub_trace,
                                                         tl, th,
                                                         parameters=parameters)
        if align_result is None:
            # the alignment did not terminate correctly
            return None
        alignments_per_subtree[subtree] = align_result

    # compose alignments from subtree alignments
    res = []
    for trace_part in trace_parts:
        activities_to_cover = trace_to_list_of_str(trace_part[1])
        activities_covered_so_far = []
        alignment = alignments_per_subtree[trace_part[0]]
        while activities_to_cover != activities_covered_so_far:
            move = alignment.pop(0)
            res.append(move)
            # if the alignment move is NOT a model move add activity to activities_covered_so_far
            if move[0] != SKIP:
                activities_covered_so_far.append(move[0])
    # add possible remaining alignment moves to resulting alignment, the order does not matter (parallel operator)
    for subtree in alignments_per_subtree:
        if len(alignments_per_subtree[subtree]) > 0:
            res.extend(alignments_per_subtree[subtree])
    return res
def __approximate_alignment_on_sequence(pt: ProcessTree, trace: Trace, a_sets: Dict[ProcessTree, Set[str]],
                                        sa_sets: Dict[ProcessTree, Set[str]],
                                        ea_sets: Dict[ProcessTree, Set[str]], tau_flags: Dict[ProcessTree, bool],
                                        tl: int, th: int,
                                        parameters=None):
    """
    Approximates an alignment of the given trace on a process tree whose root is a
    SEQUENCE operator.

    An ILP assigns every trace activity to exactly one child subtree while
    preserving the child order (an activity may only be followed by activities
    assigned to the same or a later subtree); the trace is then cut into
    consecutive sub-traces and each child is aligned recursively via
    __approximate_alignment_for_trace.

    Parameters
    ----------
    pt
        Process tree (root must be a SEQUENCE operator with at least one child)
    trace
        Non-empty trace to align
    a_sets, sa_sets, ea_sets
        Per-subtree activity, start-activity and end-activity sets
    tau_flags
        Per-subtree flag: True if the subtree can produce the empty trace
    tl, th
        Trace-length thresholds steering the recursive approximation
    parameters
        Parameters of the algorithm (Parameters.ACTIVITY_KEY)

    Returns
    -------
    Alignment (list of moves), or None if a recursive alignment did not terminate
    correctly.
    """
    if parameters is None:
        parameters = {}

    from pulp import lpSum, LpVariable, LpProblem, LpMinimize

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY)

    assert pt.operator == Operator.SEQUENCE
    assert len(pt.children) > 0
    assert len(trace) > 0

    ilp = LpProblem(sense=LpMinimize)

    # x_i_j = 1 <=> assigns activity i to subtree j
    x_variables: Dict[int, Dict[int, LpVariable]] = {}
    # s_i_j = 1 <=> activity i is a start activity in the current sub-trace assigned to subtree j
    s_variables: Dict[int, Dict[int, LpVariable]] = {}
    # e_i_j = 1 <=> activity i is an end activity in the current sub-trace assigned to subtree j
    e_variables: Dict[int, Dict[int, LpVariable]] = {}
    # auxiliary u_j <=> u_j=1 if an activity is assigned to subtree j
    u_variables: Dict[int, LpVariable] = {}
    # v_i_j = 1 <=> activity i is neither a start nor end-activity in the current sub-trace assigned to subtree j
    v_variables: Dict[int, Dict[int, LpVariable]] = {}

    s_costs = {}
    e_costs = {}
    u_costs = {}
    v_costs = {}

    # trace <a_0,...,a_n>
    for i, a in enumerate(trace):
        x_variables[i] = {}
        s_variables[i] = {}
        s_costs[i] = {}
        e_variables[i] = {}
        e_costs[i] = {}
        v_variables[i] = {}
        v_costs[i] = {}
        for j, subtree in enumerate(pt.children):
            x_variables[i][j] = LpVariable('x_' + str(i) + '_' + str(j), cat='Binary')
            s_variables[i][j] = LpVariable('s_' + str(i) + '_' + str(j), cat='Binary')
            s_costs[i][j] = 0 if a[activity_key] in sa_sets[subtree] else 1
            e_variables[i][j] = LpVariable('e_' + str(i) + '_' + str(j), cat='Binary')
            e_costs[i][j] = 0 if a[activity_key] in ea_sets[subtree] else 1
            v_variables[i][j] = LpVariable('v_' + str(i) + '_' + str(j), cat='Binary')
            v_costs[i][j] = 0 if a[activity_key] in a_sets[subtree] else 1

    for j in range(len(pt.children)):
        u_variables[j] = LpVariable('u_' + str(j), cat='Binary')
        # define costs to not assign anything to subtree j
        if tau_flags[pt.children[j]]:
            u_costs[j] = 0
        elif sa_sets[pt.children[j]] & ea_sets[pt.children[j]]:
            # intersection of start-activities and end-activities is not empty
            u_costs[j] = 1
        else:
            # intersection of start-activities and end-activities is empty
            u_costs[j] = 2

    # objective function
    ilp += lpSum(
        [v_variables[i][j] * v_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [s_variables[i][j] * s_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [e_variables[i][j] * e_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [(1 - u_variables[j]) * u_costs[j] for j in range(len(pt.children))]), "objective_function"

    # constraints
    for i in range(len(trace)):
        # every activity is assigned to one subtree
        ilp += lpSum([x_variables[i][j] * 1 for j in range(len(pt.children))]) == 1

    for j in range(len(pt.children)):
        # first activity is start activity
        ilp += x_variables[0][j] <= s_variables[0][j]
        # last activity is end-activity
        ilp += x_variables[len(trace) - 1][j] <= e_variables[len(trace) - 1][j]

    # define s_i_j variables: activity i starts a sub-trace iff it is assigned to
    # subtree j and its predecessor is not
    for i in range(1, len(trace)):
        for j in range(len(pt.children)):
            ilp += s_variables[i][j] >= x_variables[i][j] + 1 - x_variables[i - 1][j] - 1
            ilp += s_variables[i][j] <= x_variables[i][j]
            ilp += s_variables[i][j] <= 1 - x_variables[i - 1][j]
    for i in range(len(trace)):
        # activity can be only for one subtree a start-activity
        ilp += lpSum(s_variables[i][j] for j in range(len(pt.children))) <= 1

    # define e_i_j variables: activity i ends a sub-trace iff it is assigned to
    # subtree j and its successor is not
    for i in range(len(trace) - 1):
        for j in range(len(pt.children)):
            ilp += e_variables[i][j] >= x_variables[i][j] + 1 - x_variables[i + 1][j] - 1
            ilp += e_variables[i][j] <= x_variables[i][j]
            ilp += e_variables[i][j] <= 1 - x_variables[i + 1][j]
    for i in range(len(trace)):
        # activity can be only for one subtree an end-activity
        ilp += lpSum(e_variables[i][j] for j in range(len(pt.children))) <= 1

    # constraint - preserving sequence when assigning activities to subtrees
    for i in range(len(trace) - 1):
        for j in range(len(pt.children)):
            ilp += lpSum(x_variables[i + 1][k] for k in range(j, len(pt.children))) >= x_variables[i][j]

    for j in range(len(pt.children)):
        for i in range(len(trace)):
            # define u_j variables
            ilp += u_variables[j] >= x_variables[i][j]
        # if u_j variable = 1 ==> a start activity must exist
        ilp += u_variables[j] <= lpSum(s_variables[i][j] for i in range(len(trace)))
        # if u_j variable = 1 ==> an end activity must exist
        ilp += u_variables[j] <= lpSum(e_variables[i][j] for i in range(len(trace)))

    # define v_i_j variables
    # BUG FIX: this loop previously iterated "for j in range(2)" and therefore only
    # constrained the v variables of the first two subtrees; for SEQUENCE nodes with
    # more than two children the remaining v variables were unconstrained, letting the
    # solver drop the cost of intermediate activities assigned to those subtrees
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            ilp += v_variables[i][j] >= 1 - s_variables[i][j] + 1 - e_variables[i][j] + x_variables[i][j] - 2
            ilp += v_variables[i][j] <= x_variables[i][j]
            ilp += v_variables[i][j] <= 1 - e_variables[i][j]
            ilp += v_variables[i][j] <= 1 - s_variables[i][j]

    status = ilp.solve()
    assert status == 1

    # cut the trace into one (possibly empty) sub-trace per child, in order
    alignments_to_calculate: List[Tuple[ProcessTree, Trace]] = []
    for j in range(len(pt.children)):
        sub_trace = Trace()
        for i in range(len(trace)):
            if x_variables[i][j].varValue == 1:
                sub_trace.append(trace[i])
        alignments_to_calculate.append((pt.children[j], sub_trace))

    # calculate and compose alignments
    res = []
    for subtree, sub_trace in alignments_to_calculate:
        align_result = __approximate_alignment_for_trace(subtree, a_sets, sa_sets, ea_sets, tau_flags, sub_trace,
                                                         tl, th,
                                                         parameters=parameters)
        if align_result is None:
            # the alignment did not terminate correctly
            return None
        res.extend(align_result)
    return res
def __approximate_alignment_on_loop(pt: ProcessTree, trace: Trace, a_sets: Dict[ProcessTree, Set[str]],
                                    sa_sets: Dict[ProcessTree, Set[str]], ea_sets: Dict[ProcessTree, Set[str]],
                                    tau_flags: Dict[ProcessTree, bool], tl: int, th: int,
                                    parameters=None):
    """
    Approximates an alignment of the given trace on a process tree whose root is a
    LOOP operator with exactly two children (do-part, redo-part).

    An ILP assigns every trace activity alternately to the two children and may
    insert "tau" executions (empty sub-traces) between them; the trace is then cut
    accordingly and each piece is aligned recursively via
    __approximate_alignment_for_trace.

    Parameters
    ----------
    pt
        Process tree (root must be a LOOP operator with exactly 2 children)
    trace
        Non-empty trace to align
    a_sets, sa_sets, ea_sets
        Per-subtree activity, start-activity and end-activity sets
    tau_flags
        Per-subtree flag: True if the subtree can produce the empty trace
    tl, th
        Trace-length thresholds steering the recursive approximation
    parameters
        Parameters of the algorithm (Parameters.ACTIVITY_KEY)

    Returns
    -------
    Alignment (list of moves), or None if a recursive alignment did not terminate
    correctly.
    """
    if parameters is None:
        parameters = {}

    from pulp import lpSum, LpVariable, LpProblem, LpMinimize

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY)

    assert pt.operator == Operator.LOOP
    assert len(pt.children) == 2
    assert len(trace) > 0

    ilp = LpProblem(sense=LpMinimize)

    # x_i_j = 1 <=> assigns activity i to subtree j
    x_variables: Dict[int, Dict[int, LpVariable]] = {}
    # t_i_j = 1 <=> inserts a tau at position i and assigns it to subtree j
    t_variables: Dict[int, Dict[int, LpVariable]] = {}
    # s_i_j = 1 <=> activity i is a start activity in the current sub-trace assigned to subtree j
    s_variables: Dict[int, Dict[int, LpVariable]] = {}
    # e_i_j = 1 <=> activity i is an end activity in the current sub-trace assigned to subtree j
    e_variables: Dict[int, Dict[int, LpVariable]] = {}
    # v_i_j = 1 <=> activity i is neither a start nor end-activity in the current sub-trace assigned to subtree j
    v_variables: Dict[int, Dict[int, LpVariable]] = {}

    # auxiliary variables
    # p_i_j = 1 <=> previous activity i-1 is assigned to the other subtree or t_1_other-subtree is 1
    p_variables: Dict[int, Dict[int, LpVariable]] = {}
    # n_i_j = 1 <=> next activity i+1 is assigned to the other subtree or t_1_other-subtree is 1
    n_variables: Dict[int, Dict[int, LpVariable]] = {}

    t_costs = {}
    s_costs = {}
    e_costs = {}
    v_costs = {}

    for i, a in enumerate(trace):
        x_variables[i] = {}
        s_variables[i] = {}
        s_costs[i] = {}
        e_variables[i] = {}
        e_costs[i] = {}
        v_variables[i] = {}
        v_costs[i] = {}
        p_variables[i] = {}
        n_variables[i] = {}
        for j, subtree in enumerate(pt.children):
            x_variables[i][j] = LpVariable('x_' + str(i) + '_' + str(j), cat='Binary')
            s_variables[i][j] = LpVariable('s_' + str(i) + '_' + str(j), cat='Binary')
            s_costs[i][j] = 0 if a[activity_key] in sa_sets[subtree] else 1
            e_variables[i][j] = LpVariable('e_' + str(i) + '_' + str(j), cat='Binary')
            e_costs[i][j] = 0 if a[activity_key] in ea_sets[subtree] else 1
            v_variables[i][j] = LpVariable('v_' + str(i) + '_' + str(j), cat='Binary')
            v_costs[i][j] = 0 if a[activity_key] in a_sets[subtree] else 1
            p_variables[i][j] = LpVariable('p_' + str(i) + '_' + str(j), cat='Binary')
            n_variables[i][j] = LpVariable('n_' + str(i) + '_' + str(j), cat='Binary')

    # taus can be inserted at len(trace)+1 positions (before/after every activity)
    for i in range(len(trace) + 1):
        t_variables[i] = {}
        t_costs[i] = {}
        for j, subtree in enumerate(pt.children):
            t_variables[i][j] = LpVariable('t_' + str(i) + '_' + str(j), cat='Binary')
            if tau_flags[subtree]:
                t_costs[i][j] = -0.00001  # favour to add a cut if possible over not putting a cut
            else:
                if len(sa_sets[subtree].intersection(ea_sets[subtree])) != 0:
                    t_costs[i][j] = 1
                else:
                    t_costs[i][j] = 2

    # objective function
    ilp += lpSum(
        [s_variables[i][j] * s_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [e_variables[i][j] * e_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [v_variables[i][j] * v_costs[i][j] for i in range(len(trace)) for j in range(len(pt.children))] +
        [t_variables[i][j] * t_costs[i][j] for i in range(len(trace) + 1) for j in
         range(len(pt.children))]), "objective_function"

    # constraints
    # universe j {0,1}
    # universe i for t_i_j variables {0,...,len(trace)}
    # universe i else {0,...,len(trace)-1}

    # first tau can never be assigned to the 2nd subtree
    ilp += t_variables[0][1] == 0
    # last tau can never be assigned to the 2nd subtree
    ilp += t_variables[len(trace)][1] == 0

    # if first/last tau is not used --> first/last activity is assigned to 1st subtree
    ilp += 1 - t_variables[0][0] <= x_variables[0][0]
    ilp += 1 - t_variables[len(trace)][0] <= x_variables[len(trace) - 1][0]

    for i in range(len(trace)):
        # every activity is assigned to one subtree
        ilp += lpSum([x_variables[i][j] * 1 for j in range(len(pt.children))]) == 1
        # start/end/intermediate-activity at position i can only be assigned to one subtree
        ilp += lpSum([s_variables[i][j] * 1 for j in range(len(pt.children))]) <= 1
        ilp += lpSum([e_variables[i][j] * 1 for j in range(len(pt.children))]) <= 1
        ilp += lpSum([v_variables[i][j] * 1 for j in range(len(pt.children))]) <= 1

    for i in range(len(trace) + 1):
        # max one tau is used per index
        ilp += lpSum([t_variables[i][j] for j in range(2)]) <= 1

    # if tau is used and hence, assigned to a subtree, the surrounding activities are assigned to the other subtree
    for i in range(1, len(trace)):
        # if tau at position i is assigned to 1st subtree, the previous activity is assigned to 2nd subtree
        ilp += t_variables[i][0] <= x_variables[i - 1][1]
        # if tau at position i is assigned to 2nd subtree, the previous activity is assigned to 1st subtree
        ilp += t_variables[i][1] <= x_variables[i - 1][0]
    for i in range(len(trace)):
        # if tau at position i is assigned to 1st subtree, the next activity is assigned to 2nd subtree
        ilp += t_variables[i][0] <= x_variables[i][1]
        # if tau at position i is assigned to 2nd subtree, the next activity is assigned to 1st subtree
        ilp += t_variables[i][1] <= x_variables[i][0]
    # if last tau is used and assigned to 1st subtree (assigning it to the 2nd subtree is already forbidden by another
    # constraint) --> last activity must be assigned to 2nd subtree
    ilp += t_variables[len(trace)][0] <= x_variables[len(trace) - 1][1]

    # define auxiliary variables n: n_i_1 = 1 <=> next activity i+1 is assigned to 2nd subtree or t_i+1_2 = 1
    for i in range(len(trace) - 1):
        ilp += n_variables[i][0] <= x_variables[i + 1][1] + t_variables[i + 1][1]
        ilp += n_variables[i][0] >= x_variables[i + 1][1]
        ilp += n_variables[i][0] >= t_variables[i + 1][1]
        ilp += n_variables[i][1] <= x_variables[i + 1][0] + t_variables[i + 1][0]
        ilp += n_variables[i][1] >= x_variables[i + 1][0]
        ilp += n_variables[i][1] >= t_variables[i + 1][0]
    ilp += t_variables[len(trace)][1] <= n_variables[len(trace) - 1][0]
    ilp += t_variables[len(trace)][0] <= n_variables[len(trace) - 1][1]

    # define e_i_j variables: e = n AND x
    for i in range(len(trace)):
        for j in range(2):
            ilp += e_variables[i][j] <= n_variables[i][j]
            ilp += e_variables[i][j] <= x_variables[i][j]
            ilp += e_variables[i][j] >= n_variables[i][j] + x_variables[i][j] - 1

    # define auxiliary variables p: p_i_1 = 1 <=> previous activity i-1 is assigned to 2nd subtree or t_i-1_2 = 1
    ilp += t_variables[0][1] <= p_variables[0][0]
    ilp += p_variables[0][1] <= t_variables[0][0]
    for i in range(1, len(trace)):
        ilp += p_variables[i][0] <= t_variables[i][1] + x_variables[i - 1][1]
        ilp += p_variables[i][0] >= t_variables[i][1]
        ilp += p_variables[i][0] >= x_variables[i - 1][1]
        ilp += p_variables[i][1] <= t_variables[i][0] + x_variables[i - 1][0]
        ilp += p_variables[i][1] >= t_variables[i][0]
        ilp += p_variables[i][1] >= x_variables[i - 1][0]

    # define s_i_j variables: s = p AND x
    # BUG FIX: the constraint "s <= p_variables[i][j]" was emitted twice and
    # "s <= x_variables[i][j]" was missing, so s was never forced to imply x
    for i in range(len(trace)):
        for j in range(2):
            ilp += s_variables[i][j] >= p_variables[i][j] + x_variables[i][j] - 1
            ilp += s_variables[i][j] <= p_variables[i][j]
            ilp += s_variables[i][j] <= x_variables[i][j]
    ilp += 1 - t_variables[0][0] <= s_variables[0][0]

    # define v_i_j variables
    for i in range(len(trace)):
        for j in range(2):
            ilp += v_variables[i][j] >= 1 - s_variables[i][j] + 1 - e_variables[i][j] + x_variables[i][j] - 2
            ilp += v_variables[i][j] <= x_variables[i][j]
            ilp += v_variables[i][j] <= 1 - e_variables[i][j]
            ilp += v_variables[i][j] <= 1 - s_variables[i][j]

    status = ilp.solve()
    assert status == 1

    # cut the trace along the solved assignment/tau structure
    alignments_to_calculate: List[Tuple[ProcessTree, Trace]] = []
    sub_trace = Trace()
    current_subtree_idx = 0
    for i in range(len(trace)):
        for j in range(2):
            if t_variables[i][j].varValue:
                if i == 0:
                    # first tau can be only assigned to first subtree
                    assert j == 0
                    alignments_to_calculate.append((pt.children[j], Trace()))
                    current_subtree_idx = 1
                else:
                    # flush the sub-trace collected so far, then insert the tau execution
                    alignments_to_calculate.append((pt.children[current_subtree_idx], sub_trace))
                    alignments_to_calculate.append((pt.children[j], Trace()))
                    sub_trace = Trace()
        for j in range(2):
            if x_variables[i][j].varValue:
                if j == current_subtree_idx:
                    sub_trace.append(trace[i])
                else:
                    # subtree switch without an intervening tau
                    alignments_to_calculate.append((pt.children[current_subtree_idx], sub_trace))
                    sub_trace = Trace()
                    sub_trace.append(trace[i])
                    current_subtree_idx = j
    if len(sub_trace) > 0:
        alignments_to_calculate.append((pt.children[current_subtree_idx], sub_trace))
    if t_variables[len(trace)][0].varValue:
        alignments_to_calculate.append((pt.children[0], Trace()))

    # calculate and compose alignments
    res = []
    for subtree, sub_trace in alignments_to_calculate:
        align_result = __approximate_alignment_for_trace(subtree, a_sets, sa_sets, ea_sets, tau_flags, sub_trace,
                                                         tl, th,
                                                         parameters=parameters)
        if align_result is None:
            # the alignment did not terminate correctly
            return None
        res.extend(align_result)
    return res
def apply(
        log: EventLog, values: List[str],
        parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
    """
    Filter log by keeping only traces that has/has not events with an attribute value that belongs to the
    provided values list

    Parameters
    -----------
    log
        Trace log
    values
        Allowed attributes
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Attribute identifying the activity in the log
            Parameters.POSITIVE -> Indicate if events should be kept/removed

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions,
                            classifiers=log.classifiers, omni_present=log.omni_present, properties=log.properties)

    for trace in log:
        # does any event of this trace carry an allowed attribute value?
        has_allowed_value = any(
            event[attribute_key] in values for event in trace if attribute_key in event)
        if has_allowed_value == bool(positive):
            # keep the whole trace untouched
            kept_trace = trace
        else:
            # keep only the trace-level attributes (the resulting empty trace is dropped below)
            kept_trace = Trace()
            for attr_name, attr_value in trace.attributes.items():
                kept_trace.attributes[attr_name] = attr_value
        if len(kept_trace) > 0:
            filtered_log.append(kept_trace)

    return filtered_log