def variant_to_trace(variant, parameters=None):
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    variant_delimiter = exec_utils.get_param_value(Parameters.PARAMETER_VARIANT_DELIMITER, parameters,
                                                   constants.DEFAULT_VARIANT_SEP)

    from pm4py.objects.log.obj import Trace, Event

    trace = Trace()
    if type(variant) is tuple or type(variant) is list:
        for act in variant:
            event = Event({activity_key: act})
            trace.append(event)
    elif type(variant) is str:
        var_act = variant.split(variant_delimiter)
        for act in var_act:
            event = Event({activity_key: act})
            trace.append(event)

    return trace
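
# Usage sketch (illustrative, not part of the original module): builds a trace
# from a comma-separated variant string, assuming the default activity key
# "concept:name" and the default "," variant separator.
def _demo_variant_to_trace():
    trace = variant_to_trace("A,B,C")
    assert [ev["concept:name"] for ev in trace] == ["A", "B", "C"]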
def form_log_from_dictio_couple(first_cases_repr, second_cases_repr,
                                enable_multiplier=False):
    """
    Form a log from a pair of dictionaries, to use for root cause analysis

    Parameters
    -------------
    first_cases_repr
        First cases representation
    second_cases_repr
        Second cases representation
    enable_multiplier
        Enable balancing of classes

    Returns
    ------------
    log
        Event log object
    """
    log = EventLog()

    if enable_multiplier:
        multiplier_first = int(max(float(len(second_cases_repr)) / float(len(first_cases_repr)), 1))
        multiplier_second = int(max(float(len(first_cases_repr)) / float(len(second_cases_repr)), 1))
    else:
        multiplier_first = 1
        multiplier_second = 1

    for j in range(multiplier_first):
        for i in range(len(first_cases_repr)):
            trace = Trace()
            event = Event(first_cases_repr[i])
            trace.append(event)
            log.append(trace)

    for j in range(multiplier_second):
        for i in range(len(second_cases_repr)):
            trace = Trace()
            event = Event(second_cases_repr[i])
            trace.append(event)
            log.append(trace)

    return log
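
# Usage sketch (illustrative): with enable_multiplier=True, the smaller class is
# replicated so both classes end up with a comparable number of traces.
def _demo_form_log_from_dictio_couple():
    first = [{"concept:name": "A"}]        # 1 case
    second = [{"concept:name": "B"}] * 3   # 3 cases
    log = form_log_from_dictio_couple(first, second, enable_multiplier=True)
    # the single first-class case is replicated 3 times -> 3 + 3 = 6 traces
    assert len(log) == 6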
def apply_tree_variants(variants, parameters=None):
    """
    Apply the IM algorithm to a dictionary of variants obtaining a process tree

    Parameters
    ----------
    variants
        Variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    log = EventLog()
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)

    var_keys = list(variants.keys())
    for var in var_keys:
        trace = Trace()
        activities = variants_util.get_activities_from_variant(var)
        for act in activities:
            trace.append(Event({activity_key: act}))
        log.append(trace)

    return apply_tree(log, parameters=parameters)
def list_of_str_to_trace(activities: List[str]) -> Trace:
    t = Trace()
    for a in activities:
        e = Event()
        e["concept:name"] = a
        t.append(e)
    return t
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing 1 case per variant (only control-flow
    perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.pandas import case_statistics

    if parameters is None:
        parameters = {}

    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    activity_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY

    log = EventLog()
    for vd in variant_stats:
        variant = vd['variant'].split(",")
        trace = Trace()
        for activity in variant:
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        log.append(trace)

    return log
def apply(log, parameters=None):
    """
    Converts the event log to an event stream

    Parameters
    ----------
    log: :class:`pm4py.log.log.EventLog`
        An event log
    parameters
        Parameters of the algorithm, including:
            Parameters.INCLUDE_CASE_ATTRIBUTES -> default is True
            Parameters.CASE_ATTRIBUTE_PREFIX -> default is 'case:'
            Parameters.DEEP_COPY -> enables deepcopy (avoids references between
            input and output objects)

    Returns
    -------
    stream: :class:`pm4py.log.log.EventStream`
        An event stream
    """
    if parameters is None:
        parameters = {}

    stream_post_processing = exec_utils.get_param_value(Parameters.STREAM_POST_PROCESSING, parameters, False)
    case_pref = exec_utils.get_param_value(Parameters.CASE_ATTRIBUTE_PREFIX, parameters, 'case:')
    enable_deepcopy = exec_utils.get_param_value(Parameters.DEEP_COPY, parameters, True)
    include_case_attributes = exec_utils.get_param_value(Parameters.INCLUDE_CASE_ATTRIBUTES, parameters, True)
    compress = exec_utils.get_param_value(Parameters.COMPRESS, parameters, True)

    if pkgutil.find_loader("pandas"):
        import pandas
        if isinstance(log, pandas.DataFrame):
            extensions = __detect_extensions(log)
            list_events = pandas_utils.to_dict_records(log)
            if stream_post_processing:
                list_events = __postprocess_stream(list_events)
            if compress:
                list_events = __compress(list_events)
            for i in range(len(list_events)):
                list_events[i] = Event(list_events[i])
            log = log_instance.EventStream(list_events, attributes={'origin': 'csv'})
            for ex in extensions:
                log.extensions[ex.name] = {xes_constants.KEY_PREFIX: ex.prefix,
                                           xes_constants.KEY_URI: ex.uri}

    if isinstance(log, EventLog):
        return __transform_event_log_to_event_stream(log, include_case_attributes=include_case_attributes,
                                                     case_attribute_prefix=case_pref,
                                                     enable_deepcopy=enable_deepcopy)
    return log
def apply(tree: ProcessTree, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
    """
    Performs an extensive playout of the process tree

    Parameters
    -------------
    tree
        Process tree
    parameters
        Possible parameters, including:
        - Parameters.MIN_TRACE_LENGTH => minimum length of a trace (default: 1)
        - Parameters.MAX_TRACE_LENGTH => maximum length of a trace (default: min_allowed_trace_length)
        - Parameters.MAX_LOOP_OCC => maximum number of occurrences for a loop (default: MAX_TRACE_LENGTH)
        - Parameters.ACTIVITY_KEY => activity key
        - Parameters.MAX_LIMIT_NUM_TRACES => maximum limit to the number of traces; the playout shall stop
          when the number is reached (default: 100000)

    Returns
    -------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)

    # to save memory in the returned log, allocate each activity once. to know the list of activities of the
    # process tree, use the footprints module
    fp_tree = fp_discovery.apply(tree, parameters=parameters)
    activities = fp_tree["activities"]
    activities = {act: Event({activity_key: act}) for act in activities}

    min_allowed_trace_length = bottomup_discovery.get_min_trace_length(tree, parameters=parameters)
    min_trace_length = exec_utils.get_param_value(Parameters.MIN_TRACE_LENGTH, parameters, 1)
    max_trace_length = exec_utils.get_param_value(Parameters.MAX_TRACE_LENGTH, parameters, min_allowed_trace_length)
    max_loop_occ = exec_utils.get_param_value(Parameters.MAX_LOOP_OCC, parameters, int(max_trace_length / 2))
    max_limit_num_traces = exec_utils.get_param_value(Parameters.MAX_LIMIT_NUM_TRACES, parameters, 100000)
    return_set_strings = exec_utils.get_param_value(Parameters.RETURN_SET_STRINGS, parameters, False)

    bottomup = bottomup_discovery.get_bottomup_nodes(tree, parameters=parameters)
    min_rem_dict = bottomup_discovery.get_min_rem_dict(tree, parameters=parameters)
    max_rem_dict = bottomup_discovery.get_max_rem_dict(tree, parameters=parameters)

    playout_dictio = {}
    for i in range(len(bottomup)):
        get_playout(bottomup[i], playout_dictio, min_trace_length, max_trace_length, max_loop_occ, min_rem_dict,
                    max_rem_dict, max_limit_num_traces)
    tree_playout_traces = playout_dictio[tree][TRACES]

    if return_set_strings:
        return tree_playout_traces

    log = EventLog()
    for tr0 in tree_playout_traces:
        trace = Trace()
        for act in tr0:
            trace.append(activities[act])
        log.append(trace)

    return log
def execute_script():
    L = EventLog()
    e1 = Event()
    e1["concept:name"] = "A"
    e2 = Event()
    e2["concept:name"] = "B"
    e3 = Event()
    e3["concept:name"] = "C"
    e4 = Event()
    e4["concept:name"] = "D"
    t = Trace()
    t.append(e1)
    t.append(e2)
    t.append(e3)
    t.append(e4)
    for i in range(10000):
        L.append(deepcopy(t))
    print(len(L))
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing N cases per variant (only control-flow
    perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.pandas import case_statistics

    if parameters is None:
        parameters = {}

    return_variants = parameters[RETURN_VARIANTS] if RETURN_VARIANTS in parameters else False
    case_glue = parameters[
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY] if pm4_constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else pm4_constants.CASE_CONCEPT_NAME
    activity_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY

    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)

    log = EventLog()
    all_variants_log = {}
    for vd in variant_stats:
        variant = vd['variant'].split(",")
        variant_count = vd[case_glue]
        trace = Trace()
        for activity in variant:
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        all_variants_log[vd['variant']] = []
        # note: the same Trace object is appended once per case of the variant
        for i in range(variant_count):
            log.append(trace)
            all_variants_log[vd['variant']].append(len(log) - 1)

    if return_variants:
        return log, all_variants_log
    return log
def insert_artificial_start_end(log: EventLog, parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
    """
    Inserts the artificial start/end activities in an event log

    Parameters
    -------------------
    log
        Event log
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY: the activity key
        - Parameters.TIMESTAMP_KEY: the timestamp key

    Returns
    ------------------
    log
        Enriched log
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    artificial_start_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_START_ACTIVITY, parameters,
                                                           constants.DEFAULT_ARTIFICIAL_START_ACTIVITY)
    artificial_end_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_END_ACTIVITY, parameters,
                                                         constants.DEFAULT_ARTIFICIAL_END_ACTIVITY)

    for trace in log:
        start_event = Event({activity_key: artificial_start_activity})
        end_event = Event({activity_key: artificial_end_activity})
        if trace:
            if timestamp_key in trace[0]:
                start_event[timestamp_key] = trace[0][timestamp_key] - datetime.timedelta(seconds=1)
            if timestamp_key in trace[-1]:
                end_event[timestamp_key] = trace[-1][timestamp_key] + datetime.timedelta(seconds=1)
        trace.insert(0, start_event)
        trace.append(end_event)

    return log
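
# Usage sketch (illustrative): the artificial start/end events inherit shifted
# timestamps from the first/last real events, when those carry a timestamp.
def _demo_insert_artificial_start_end():
    import datetime as _dt
    trace = Trace()
    trace.append(Event({"concept:name": "A", "time:timestamp": _dt.datetime(2024, 1, 1, 12, 0, 0)}))
    enriched = insert_artificial_start_end(EventLog([trace]))
    # the first event is the artificial start, one second before the first real event
    assert enriched[0][0]["time:timestamp"] < enriched[0][1]["time:timestamp"]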
def acyclic_net_variants(net, initial_marking, final_marking, activity_key=xes_util.DEFAULT_NAME_KEY):
    """
    Given an acyclic accepting Petri net, initial and final marking, extracts a set of variants (in the form
    of traces) replayable on the net.
    Warning: this function is based on a marking exploration. If the accepting Petri net contains loops,
    the method will not work properly, as it stops the search when a specific marking has already been encountered.

    Parameters
    ----------
    :param net: An acyclic workflow net
    :param initial_marking: The initial marking of the net.
    :param final_marking: The final marking of the net.
    :param activity_key: activity key to use

    Returns
    -------
    :return: variants: :class:`list` Set of variants - in the form of Trace objects - obtainable by executing the net
    """
    active = {(initial_marking, ())}
    visited = set()
    variants = set()
    while active:
        curr_marking, curr_partial_trace = active.pop()
        curr_pair = (curr_marking, curr_partial_trace)
        enabled_transitions = semantics.enabled_transitions(net, curr_marking)
        for transition in enabled_transitions:
            if transition.label is not None:
                next_partial_trace = curr_partial_trace + (transition.label,)
            else:
                next_partial_trace = curr_partial_trace
            next_marking = semantics.execute(transition, net, curr_marking)
            next_pair = (next_marking, next_partial_trace)

            if next_marking == final_marking:
                variants.add(next_partial_trace)
            else:
                # add the next marking+partial trace pair to the active set only if it
                # has not been visited yet and differs from the current pair
                if next_pair not in visited and curr_pair != next_pair:
                    active.add(next_pair)
        visited.add(curr_pair)
    trace_variants = []
    for variant in variants:
        trace = Trace()
        for activity_label in variant:
            trace.append(Event({activity_key: activity_label}))
        trace_variants.append(trace)
    return trace_variants
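
# Usage sketch (illustrative): a two-transition sequential workflow net has the
# single variant <A, B>. The import paths and helper functions assume a recent
# pm4py layout (pm4py.objects.petri_net).
def _demo_acyclic_net_variants():
    from pm4py.objects.petri_net.obj import PetriNet, Marking
    from pm4py.objects.petri_net.utils import petri_utils

    net = PetriNet("seq")
    p1, p2, p3 = PetriNet.Place("p1"), PetriNet.Place("p2"), PetriNet.Place("p3")
    t_a = PetriNet.Transition("t_a", "A")
    t_b = PetriNet.Transition("t_b", "B")
    for p in [p1, p2, p3]:
        net.places.add(p)
    for t in [t_a, t_b]:
        net.transitions.add(t)
    petri_utils.add_arc_from_to(p1, t_a, net)
    petri_utils.add_arc_from_to(t_a, p2, net)
    petri_utils.add_arc_from_to(p2, t_b, net)
    petri_utils.add_arc_from_to(t_b, p3, net)

    variants = acyclic_net_variants(net, Marking({p1: 1}), Marking({p3: 1}))
    assert [[ev["concept:name"] for ev in tr] for tr in variants] == [["A", "B"]]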
def read_trace(self) -> Trace:
    if self.i < self.no_traces:
        case_id = self.c_unq[self.i]
        si = self.c_ind[self.i]
        ei = si + self.c_counts[self.i]
        trace = Trace(attributes={xes_constants.DEFAULT_TRACEID_KEY: case_id})
        for j in range(si, ei):
            event = Event({xes_constants.DEFAULT_NAME_KEY: self.activities[j],
                           xes_constants.DEFAULT_TIMESTAMP_KEY: self.timestamps[j]})
            trace.append(event)
        self.i = self.i + 1
        return trace
def parse_event_log_string(traces: Collection[str], sep: str = ",",
                           activity_key: str = xes_constants.DEFAULT_NAME_KEY,
                           timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
                           case_id_key: str = xes_constants.DEFAULT_TRACEID_KEY) -> EventLog:
    """
    Parse a collection of traces expressed as strings
    (e.g., ["A,B,C,D", "A,C,B,D", "A,D"]) to an event log

    Parameters
    ------------------
    traces
        Collection of traces expressed as strings
    sep
        Separator used to split the activities of a string trace
    activity_key
        The attribute that should be used as activity
    timestamp_key
        The attribute that should be used as timestamp
    case_id_key
        The attribute that should be used as case identifier

    Returns
    -----------------
    log
        Event log
    """
    log = EventLog()
    this_timest = 10000000
    for index, trace_str in enumerate(traces):
        activities = trace_str.split(sep)
        trace = Trace()
        trace.attributes[case_id_key] = str(index)
        for act in activities:
            event = Event({activity_key: act,
                           timestamp_key: datetime.datetime.fromtimestamp(this_timest)})
            trace.append(event)
            this_timest = this_timest + 1
        log.append(trace)
    return log
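
# Usage sketch (illustrative): each string becomes one trace; timestamps are
# synthetic and strictly increasing across the whole log.
def _demo_parse_event_log_string():
    log = parse_event_log_string(["A,B,C", "A,C"])
    assert len(log) == 2
    assert [ev["concept:name"] for ev in log[0]] == ["A", "B", "C"]
    # the case id is stored under the default trace id key ("concept:name")
    assert log[1].attributes["concept:name"] == "1"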
def generate_log(pt0, no_traces=100):
    """
    Generate a log out of a process tree

    Parameters
    ------------
    pt0
        Process tree
    no_traces
        Number of traces contained in the generated log

    Returns
    ------------
    log
        Event log object
    """
    pt = deepcopy(pt0)
    # different taus must get different IDs during log generation,
    # so we cannot use the default process tree class; we use GenerationTree instead
    pt = GenerationTree(pt)
    log = EventLog()

    # assigns to each event an increasing timestamp starting from 1970
    curr_timestamp = 10000000
    for i in range(no_traces):
        ex_seq = execute(pt)
        ex_seq_labels = pt_util.project_execution_sequence_to_labels(ex_seq)
        trace = Trace()
        trace.attributes[xes.DEFAULT_NAME_KEY] = str(i)
        for label in ex_seq_labels:
            event = Event()
            event[xes.DEFAULT_NAME_KEY] = label
            event[xes.DEFAULT_TIMESTAMP_KEY] = datetime.datetime.fromtimestamp(curr_timestamp)
            trace.append(event)
            curr_timestamp = curr_timestamp + 1
        log.append(trace)

    return log
def apply(tree: ProcessTree, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
    """
    Gets the top-bottom playout of a process tree

    Parameters
    ---------------
    tree
        Process tree
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY: activity key
        - Parameters.NO_TRACES: number of traces that should be returned

    Returns
    ---------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    no_traces = exec_utils.get_param_value(Parameters.NO_TRACES, parameters, 1000)

    execution_sequences = get_num_ex_sequences(tree, no_traces)

    log = EventLog()
    for seq in execution_sequences:
        trace = Trace()
        for el in seq:
            if el.label is not None:
                event = Event({activity_key: el.label})
                trace.append(event)
        log.append(trace)

    return log
def keep_only_one_attribute_per_event(log, attribute_key):
    """
    Keeps only one attribute per event

    Parameters
    ---------------
    log
        Event log
    attribute_key
        Attribute key
    """
    new_log = EventLog()
    if log is not None:
        for trace in log:
            new_trace = Trace()
            for ev in trace:
                new_trace.append(Event({attribute_key: ev[attribute_key]}))
            new_log.append(new_trace)
    return new_log
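
# Usage sketch (illustrative): projects every event onto a single attribute,
# dropping everything else (including timestamps).
def _demo_keep_only_one_attribute_per_event():
    trace = Trace()
    trace.append(Event({"concept:name": "A", "org:resource": "Alice"}))
    projected = keep_only_one_attribute_per_event(EventLog([trace]), "concept:name")
    ev = projected[0][0]
    assert ev["concept:name"] == "A" and "org:resource" not in ev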
def form_fake_log(prefixes_keys, activity_key=xes_util.DEFAULT_NAME_KEY):
    """
    Form a fake log for replay (putting each prefix as a separate trace to align)

    Parameters
    ----------
    prefixes_keys
        Keys of the prefixes (to form a log with a given order)
    activity_key
        Activity key (must be provided if different from concept:name)
    """
    fake_log = EventLog()
    for prefix in prefixes_keys:
        trace = Trace()
        prefix_activities = prefix.split(constants.DEFAULT_VARIANT_SEP)
        for activity in prefix_activities:
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        fake_log.append(trace)
    return fake_log
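
# Usage sketch (illustrative): each prefix string becomes its own trace, in the
# order in which the prefixes are supplied (assuming the default "," separator).
def _demo_form_fake_log():
    fake_log = form_fake_log(["A", "A,B"])
    assert [[ev["concept:name"] for ev in tr] for tr in fake_log] == [["A"], ["A", "B"]]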
def check_is_fitting(*args, activity_key=xes_constants.DEFAULT_NAME_KEY):
    """
    Checks if a trace object fits a process model

    Parameters
    -----------------
    trace
        Trace object (trace / variant), passed as the first positional argument
    model
        Model (process tree, Petri net, BPMN, ...), passed as the remaining positional arguments
    activity_key
        Activity key (optional)

    Returns
    -----------------
    is_fit
        Boolean value (True if the trace fits; False if the trace does not)
    """
    from pm4py.util import variants_util
    from pm4py.convert import convert_to_process_tree, convert_to_petri_net

    trace = args[0]
    model = args[1:]

    try:
        model = convert_to_process_tree(*model)
    except Exception:
        # the model cannot be expressed as a process tree; check whether it can
        # at least be expressed as a Petri net
        model = convert_to_petri_net(*model)

    if not isinstance(trace, Trace):
        activities = variants_util.get_activities_from_variant(trace)
        trace = Trace()
        for act in activities:
            trace.append(Event({activity_key: act}))

    if isinstance(model, ProcessTree):
        return __check_is_fit_process_tree(trace, model, activity_key=activity_key)
    elif isinstance(model, tuple) and isinstance(model[0], PetriNet):
        return __check_is_fit_petri_net(trace, model[0], model[1], model[2], activity_key=activity_key)
def import_from_context(context, num_traces, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context
    num_traces
        Number of traces of the XES log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT, parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR, parameters, True)

    date_parser = dt_parser.get()
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces, desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None
    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [x for x in classifier_value.split("'")
                                                                             if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)
                if progress is not None:
                    progress.update()
                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
def import_log_from_file_object(f, encoding, file_size=sys.maxsize, parameters=None):
    """
    Import a log object from a (XML) file object

    Parameters
    -----------
    f
        File object
    encoding
        Encoding
    file_size
        Size of the file (measured on disk)
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log
            (read in order in the XML file)
            Parameters.MAX_BYTES -> Maximum number of bytes to read
            Parameters.SKIP_BYTES -> Number of bytes to skip
            Parameters.SET_ATTRIBUTES_TO_READ -> Names of the attributes that should be parsed.
            If not specified, then all the attributes are parsed.

    Returns
    -----------
    log
        Log file
    """
    values_dict = {}
    date_parser = dt_parser.get()

    set_attributes_to_read = exec_utils.get_param_value(Parameters.SET_ATTRIBUTES_TO_READ, parameters, None)
    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT, parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)

    skip_bytes = exec_utils.get_param_value(Parameters.SKIP_BYTES, parameters, False)
    max_bytes_to_read = exec_utils.get_param_value(Parameters.MAX_BYTES, parameters, sys.maxsize)

    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = EventLog()
    tracecount = 0
    trace = None
    event = None

    f.seek(skip_bytes)

    for line in f:
        content = line.decode(encoding).split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        key, value = read_attribute_key_value(tag, content, date_parser, values_dict,
                                                              set_attributes_to_read)
                        if value is not None:
                            event[key] = value
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = Event()
                elif len(content) == 5:
                    key, value = read_attribute_key_value(tag, content, date_parser, values_dict,
                                                          set_attributes_to_read)
                    if value is not None:
                        trace.attributes[key] = value
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount > max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = Trace()

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
def to_interval(log, parameters=None):
    """
    Converts a log to interval format (e.g. an event has two timestamps) from lifecycle format
    (an event has only a timestamp, and a transition lifecycle)

    Parameters
    -------------
    log
        Log (expressed in the lifecycle format)
    parameters
        Possible parameters of the method (activity, timestamp key, start timestamp key, transition ...)

    Returns
    -------------
    log
        Interval event log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    start_timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY in parameters else xes.DEFAULT_START_TIMESTAMP_KEY
    transition_key = parameters[
        constants.PARAMETER_CONSTANT_TRANSITION_KEY] if constants.PARAMETER_CONSTANT_TRANSITION_KEY in parameters else xes.DEFAULT_TRANSITION_KEY
    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    business_hours = parameters["business_hours"] if "business_hours" in parameters else False
    worktiming = parameters["worktiming"] if "worktiming" in parameters else [7, 17]
    weekends = parameters["weekends"] if "weekends" in parameters else [6, 7]

    if log is not None and len(log) > 0:
        if "PM4PY_TYPE" in log.attributes and log.attributes["PM4PY_TYPE"] == "interval":
            return log
        if log[0] is not None and len(log[0]) > 0:
            first_event = log[0][0]
            if start_timestamp_key in first_event:
                return log

        new_log = EventLog()
        new_log.attributes["PM4PY_TYPE"] = "interval"

        for trace in log:
            new_trace = Trace()
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            activities_start = {}
            for event in trace:
                activity = event[activity_key]
                transition = event[transition_key] if transition_key in event else "complete"
                timestamp = event[timestamp_key]
                if transition.lower() == "start":
                    if activity not in activities_start:
                        activities_start[activity] = list()
                    activities_start[activity].append(event)
                elif transition.lower() == "complete":
                    start_event = None
                    start_timestamp = event[timestamp_key]
                    if activity in activities_start and len(activities_start[activity]) > 0:
                        start_event = activities_start[activity].pop(0)
                        start_timestamp = start_event[timestamp_key]
                    new_event = Event()
                    for attr in event:
                        if not attr == timestamp_key and not attr == transition_key:
                            new_event[attr] = event[attr]
                    if start_event is not None:
                        for attr in start_event:
                            if not attr == timestamp_key and not attr == transition_key:
                                new_event["@@startevent_" + attr] = start_event[attr]
                    new_event[start_timestamp_key] = start_timestamp
                    new_event[timestamp_key] = timestamp
                    new_event["@@duration"] = (timestamp - start_timestamp).total_seconds()

                    if business_hours:
                        bh = BusinessHours(start_timestamp.replace(tzinfo=None), timestamp.replace(tzinfo=None),
                                           worktiming=worktiming, weekends=weekends)
                        new_event["@@approx_bh_duration"] = bh.getseconds()

                    new_trace.append(new_event)
            new_trace = sorting.sort_timestamp_trace(new_trace, start_timestamp_key)
            new_log.append(new_trace)

        return new_log

    return log
def from_dict_to_event(event_dict):
    timestamp_field_name = "time:timestamp"
    if timestamp_field_name in event_dict.keys():
        event_dict[timestamp_field_name] = dt_parser.get().apply(event_dict[timestamp_field_name])
    return Event(event_dict)
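
# Usage sketch (illustrative): string timestamps under "time:timestamp" are
# parsed into datetime objects before the Event is built (assuming pm4py's
# date parser accepts ISO 8601 strings).
def _demo_from_dict_to_event():
    import datetime as _dt
    ev = from_dict_to_event({"concept:name": "A", "time:timestamp": "2024-01-01T12:00:00"})
    assert isinstance(ev["time:timestamp"], _dt.datetime)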
def apply(dfg: Dict[Tuple[str, str], int], start_activities: Dict[str, int], end_activities: Dict[str, int],
          parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Union[
              EventLog, Dict[Tuple[str, str], int]]:
    """
    Applies the playout algorithm on a DFG, extracting the most likely traces according to the DFG

    Parameters
    ---------------
    dfg
        *Complete* DFG
    start_activities
        Start activities
    end_activities
        End activities
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY => the activity key of the simulated log
        - Parameters.TIMESTAMP_KEY => the timestamp key of the simulated log
        - Parameters.MAX_NO_VARIANTS => the maximum number of variants generated by the method (default: 3000)
        - Parameters.MIN_WEIGHTED_PROBABILITY => the minimum overall weighted probability that makes the
          method stop (default: 1)
        - Parameters.MAX_NO_OCC_PER_ACTIVITY => the maximum number of occurrences per activity in the traces
          of the log (default: 2)
        - Parameters.INTERRUPT_SIMULATION_WHEN_DFG_COMPLETE => interrupts the simulation when the DFG of the
          simulated log has the same keys as the DFG of the original log (all behavior is contained)
          (default: False)
        - Parameters.ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG => adds a simulated trace to the simulated log only if
          it adds elements to the simulated DFG, i.e., it adds behavior; skips the insertion otherwise
          (default: False)
        - Parameters.RETURN_VARIANTS => returns the traces as variants with a likely number of occurrences

    Returns
    ---------------
    simulated_log
        Simulated log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    max_no_variants = exec_utils.get_param_value(Parameters.MAX_NO_VARIANTS, parameters, 3000)
    min_weighted_probability = exec_utils.get_param_value(Parameters.MIN_WEIGHTED_PROBABILITY, parameters, 1.0)
    interrupt_simulation_when_dfg_complete = exec_utils.get_param_value(
        Parameters.INTERRUPT_SIMULATION_WHEN_DFG_COMPLETE, parameters, False)
    add_trace_if_takes_new_els_to_dfg = exec_utils.get_param_value(Parameters.ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG,
                                                                   parameters, False)
    return_variants = exec_utils.get_param_value(Parameters.RETURN_VARIANTS, parameters, False)
    max_execution_time = exec_utils.get_param_value(Parameters.MAX_EXECUTION_TIME, parameters, sys.maxsize)

    # keep track of the DFG, start activities and end activities of the (ongoing) simulation
    simulated_traces_dfg = set()
    simulated_traces_sa = set()
    simulated_traces_ea = set()
    interrupt_break_condition = False
    overall_probability = 0.0

    final_traces = []
    start_time = time.time()

    for tr, p in get_traces(dfg, start_activities, end_activities, parameters=parameters):
        if (interrupt_simulation_when_dfg_complete and interrupt_break_condition) or not (
                len(final_traces) < max_no_variants and overall_probability <= min_weighted_probability):
            break
        current_time = time.time()
        if (current_time - start_time) > max_execution_time:
            break
        overall_probability += p
        diff_sa = {tr[0]}.difference(simulated_traces_sa)
        diff_ea = {tr[-1]}.difference(simulated_traces_ea)
        diff_dfg = {(tr[i], tr[i + 1]) for i in range(len(tr) - 1)}.difference(simulated_traces_dfg)
        adds_something = len(diff_sa) > 0 or len(diff_ea) > 0 or len(diff_dfg) > 0
        if add_trace_if_takes_new_els_to_dfg and not adds_something:
            # skip the insertion when ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG is set to True
            # and the trace does not really change the information on the DFG,
            # start activities, end activities
            continue
        # update the start activities, end activities, DFG of the original log
        simulated_traces_sa = simulated_traces_sa.union(diff_sa)
        simulated_traces_ea = simulated_traces_ea.union(diff_ea)
        simulated_traces_dfg = simulated_traces_dfg.union(diff_dfg)
        # memorize the difference between the original DFG and the DFG of the simulated log
        diff_original_sa = set(start_activities).difference(simulated_traces_sa)
        diff_original_ea = set(end_activities).difference(simulated_traces_ea)
        diff_original_dfg = set(dfg).difference(simulated_traces_dfg)
        interrupt_break_condition = len(diff_original_sa) == 0 and len(diff_original_ea) == 0 and len(
            diff_original_dfg) == 0

        final_traces.append((-p, tr))

        if interrupt_simulation_when_dfg_complete and interrupt_break_condition:
            break

    # make sure that the traces are strictly ordered by their probability
    # (generally, the order is already pretty good, since the states are visited in the queue based on their order,
    # but not always 100% consistent)
    final_traces = sorted(final_traces)

    if return_variants:
        # returns the variants instead of the log
        variants = []
        for p, tr in final_traces:
            variants.append({"variant": constants.DEFAULT_VARIANT_SEP.join(tr),
                             "count": math.ceil(-p * max_no_variants)})
        return variants
    else:
        event_log = EventLog()
        # assigns to each event an increasing timestamp starting from 1970
        curr_timestamp = 10000000
        for index, tr in enumerate(final_traces):
            log_trace = Trace(attributes={xes_constants.DEFAULT_TRACEID_KEY: str(index),
                                          "probability": -tr[0]})
            for act in tr[1]:
                log_trace.append(Event({activity_key: act,
                                        timestamp_key: datetime.datetime.fromtimestamp(curr_timestamp)}))
                # increases by 1 second
                curr_timestamp += 1
            event_log.append(log_trace)
        return event_log
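
# Usage sketch (illustrative): playing out a tiny linear DFG with the apply
# function above yields the single most likely trace <A, B, C>.
def _demo_dfg_playout():
    dfg = {("A", "B"): 10, ("B", "C"): 10}
    sa = {"A": 10}
    ea = {"C": 10}
    simulated_log = apply(dfg, sa, ea)
    assert [ev["concept:name"] for ev in simulated_log[0]] == ["A", "B", "C"]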
def run(self):
    """
    Runs the thread
    """
    if self.enable_diagnostics:
        diagnostics = SimulationDiagnostics(self)
        diagnostics.start()

    from intervaltree import IntervalTree, Interval

    logging.basicConfig()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    net, im, fm, smap, source, sink, start_time = self.net, self.im, self.fm, self.map, self.source, self.sink, self.start_time
    places_interval_trees = self.places_interval_trees
    transitions_interval_trees = self.transitions_interval_trees
    cases_ex_time = self.cases_ex_time

    current_time = start_time

    self.internal_thread_start_time = time()
    rem_time = self.get_rem_time()

    acquired_places = set()
    acquired = source.semaphore.acquire(timeout=rem_time)
    if acquired:
        acquired_places.add(source)
    source.assigned_time.append(current_time)

    current_marking = im
    et = enabled_transitions(net, current_marking)

    first_event = None
    last_event = None

    while not fm <= current_marking or len(et) == 0:
        et = list(enabled_transitions(net, current_marking))
        ct = stochastic_utils.pick_transition(et, smap)

        simulated_execution_plus_waiting_time = -1
        while simulated_execution_plus_waiting_time < 0:
            simulated_execution_plus_waiting_time = smap[ct].get_value() if ct in smap else 0.0

        # establish how much time we need to wait before firing the transition
        # (it depends on the input places tokens)
        waiting_time = 0
        for arc in ct.out_arcs:
            place = arc.target
            sem_value = int(place.semaphore._value)
            rem_time = self.get_rem_time()
            acquired = place.semaphore.acquire(timeout=rem_time)
            if acquired:
                acquired_places.add(place)
            rem_time = self.get_rem_time()
            if rem_time == 0:
                break
            if sem_value == 0:
                waiting_time = max(
                    waiting_time,
                    place.assigned_time.pop(0) - current_time) if place.assigned_time else waiting_time

        if rem_time == 0:
            for place in acquired_places:
                place.semaphore.release()
            break

        # if the waiting time is greater than 0, add an interval to the interval tree denoting
        # the waiting times for the given transition
        if waiting_time > 0:
            transitions_interval_trees[ct].add(Interval(current_time, current_time + waiting_time))

        # get the actual execution time of the transition as the difference between
        # simulated_execution_plus_waiting_time and the waiting time
        execution_time = max(simulated_execution_plus_waiting_time - waiting_time, 0)

        # increase the timing based on the waiting time and the execution time of the transition
        current_time = current_time + waiting_time + execution_time

        for arc in ct.out_arcs:
            place = arc.target
            place.assigned_time.append(current_time)
            place.assigned_time = sorted(place.assigned_time)

        current_marking = weak_execute(ct, current_marking)

        if ct.label is not None:
            eve = Event({xes_constants.DEFAULT_NAME_KEY: ct.label,
                         xes_constants.DEFAULT_TIMESTAMP_KEY: datetime.datetime.fromtimestamp(current_time)})
            last_event = eve
            if first_event is None:
                first_event = last_event
            self.list_cases[self.id].append(eve)

        for arc in ct.in_arcs:
            place = arc.source
            p_ex_time = place.assigned_time.pop(0)
            if current_time - p_ex_time > 0:
                places_interval_trees[place].add(Interval(p_ex_time, current_time))
            place.assigned_time.append(current_time)
            place.assigned_time = sorted(place.assigned_time)
            place.semaphore.release()

        # sleep before starting the next iteration
        sleep((waiting_time + execution_time) / self.small_scale_factor)

    if first_event is not None and last_event is not None:
        cases_ex_time.append(last_event[xes_constants.DEFAULT_TIMESTAMP_KEY].timestamp() -
                             first_event[xes_constants.DEFAULT_TIMESTAMP_KEY].timestamp())
    else:
        cases_ex_time.append(0)

    places_to_free = set(current_marking).union(acquired_places)

    for place in places_to_free:
        place.semaphore.release()

    rem_time = self.get_rem_time()
    if rem_time > 0:
        self.terminated_correctly = True
        if self.enable_diagnostics:
            logger.info(str(time()) + " terminated successfully thread ID " + str(self.id))

    if self.enable_diagnostics:
        if rem_time == 0:
            logger.info(str(time()) + " terminated for timeout thread ID " + str(self.id))
        diagnostics.diagn_open = False
def to_interval(log, parameters=None):
    """
    Converts a log to interval format (e.g. an event has two timestamps) from lifecycle format
    (an event has only a timestamp, and a transition lifecycle)

    Parameters
    -------------
    log
        Log (expressed in the lifecycle format)
    parameters
        Possible parameters of the method (activity, timestamp key, start timestamp key, transition ...)

    Returns
    -------------
    log
        Interval event log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                     xes.DEFAULT_START_TIMESTAMP_KEY)
    transition_key = exec_utils.get_param_value(Parameters.TRANSITION_KEY, parameters, xes.DEFAULT_TRANSITION_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    lifecycle_instance_key = exec_utils.get_param_value(Parameters.LIFECYCLE_INSTANCE_KEY, parameters,
                                                        xes.DEFAULT_INSTANCE_KEY)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])

    if log is not None and len(log) > 0:
        if "PM4PY_TYPE" in log.attributes and log.attributes["PM4PY_TYPE"] == "interval":
            return log
        if log[0] is not None and len(log[0]) > 0:
            first_event = log[0][0]
            if start_timestamp_key in first_event:
                return log

        new_log = EventLog(attributes=copy(log.attributes), extensions=copy(log.extensions),
                           classifiers=copy(log.classifiers), omni_present=copy(log.omni_present),
                           properties=copy(log.properties))
        new_log.attributes["PM4PY_TYPE"] = "interval"
        new_log.properties[constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = xes.DEFAULT_START_TIMESTAMP_KEY

        for trace in log:
            new_trace = Trace()
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            activities_start = {}
            for event in trace:
                activity = event[activity_key]
                instance = event[lifecycle_instance_key] if lifecycle_instance_key in event else None
                activity = (activity, instance)
                transition = event[transition_key] if transition_key in event else "complete"
                timestamp = event[timestamp_key]
                if transition.lower() == "start":
                    if activity not in activities_start:
                        activities_start[activity] = list()
                    activities_start[activity].append(event)
                elif transition.lower() == "complete":
                    start_event = None
                    start_timestamp = event[timestamp_key]
                    if activity in activities_start and len(activities_start[activity]) > 0:
                        start_event = activities_start[activity].pop(0)
                        start_timestamp = start_event[timestamp_key]
                    new_event = Event()
                    for attr in event:
                        if not attr == timestamp_key and not attr == transition_key:
                            new_event[attr] = event[attr]
                    if start_event is not None:
                        for attr in start_event:
                            if not attr == timestamp_key and not attr == transition_key:
                                new_event["@@startevent_" + attr] = start_event[attr]
                    new_event[start_timestamp_key] = start_timestamp
                    new_event[timestamp_key] = timestamp
                    new_event["@@duration"] = (timestamp - start_timestamp).total_seconds()

                    if business_hours:
                        bh = BusinessHours(start_timestamp.replace(tzinfo=None), timestamp.replace(tzinfo=None),
                                           worktiming=worktiming, weekends=weekends)
                        new_event["@@approx_bh_duration"] = bh.getseconds()

                    new_trace.append(new_event)
            new_trace = sorting.sort_timestamp_trace(new_trace, start_timestamp_key)
            new_log.append(new_trace)

        return new_log

    return log
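
# Usage sketch (illustrative): a start/complete event pair is folded into a
# single interval event carrying both timestamps and a "@@duration" attribute
# (assuming the default keys, with "start_timestamp" as start timestamp key).
def _demo_to_interval():
    import datetime as _dt
    trace = Trace()
    trace.append(Event({"concept:name": "A", "lifecycle:transition": "start",
                        "time:timestamp": _dt.datetime(2024, 1, 1, 12, 0, 0)}))
    trace.append(Event({"concept:name": "A", "lifecycle:transition": "complete",
                        "time:timestamp": _dt.datetime(2024, 1, 1, 12, 0, 30)}))
    interval_log = to_interval(EventLog([trace]))
    assert len(interval_log[0]) == 1
    assert interval_log[0][0]["@@duration"] == 30.0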
def to_lifecycle(log, parameters=None):
    """
    Converts a log from interval format (e.g. an event has two timestamps) to lifecycle format
    (an event has only a timestamp, and a transition lifecycle)

    Parameters
    -------------
    log
        Log (expressed in the interval format)
    parameters
        Possible parameters of the method (activity, timestamp key, start timestamp key, transition ...)

    Returns
    -------------
    log
        Lifecycle event log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    start_timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY in parameters else xes.DEFAULT_START_TIMESTAMP_KEY
    transition_key = parameters[
        constants.PARAMETER_CONSTANT_TRANSITION_KEY] if constants.PARAMETER_CONSTANT_TRANSITION_KEY in parameters else xes.DEFAULT_TRANSITION_KEY

    if log is not None and len(log) > 0:
        if "PM4PY_TYPE" in log.attributes and log.attributes["PM4PY_TYPE"] == "lifecycle":
            return log
        if log[0] is not None and len(log[0]) > 0:
            first_event = log[0][0]
            if transition_key in first_event:
                return log

        new_log = EventLog()
        new_log.attributes["PM4PY_TYPE"] = "lifecycle"

        for trace in log:
            new_trace = Trace()
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            list_events = []
            for index, event in enumerate(trace):
                new_event_start = Event()
                new_event_complete = Event()
                for attr in event:
                    if not attr == timestamp_key and not attr == start_timestamp_key:
                        new_event_start[attr] = event[attr]
                        new_event_complete[attr] = event[attr]
                new_event_start[timestamp_key] = event[start_timestamp_key]
                new_event_start[transition_key] = "start"
                new_event_start["@@custom_lif_id"] = 0
                new_event_start["@@origin_ev_idx"] = index
                new_event_complete[timestamp_key] = event[timestamp_key]
                new_event_complete[transition_key] = "complete"
                new_event_complete["@@custom_lif_id"] = 1
                new_event_complete["@@origin_ev_idx"] = index
                list_events.append(new_event_start)
                list_events.append(new_event_complete)
            list_events = sorted(list_events,
                                 key=lambda x: (x[timestamp_key], x["@@origin_ev_idx"], x["@@custom_lif_id"]))
            for ev in list_events:
                new_trace.append(ev)
            new_log.append(new_trace)

        return new_log

    return log
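
# Usage sketch (illustrative): an interval event with both timestamps is split
# back into a "start" and a "complete" lifecycle event (assuming the default
# "start_timestamp" key for the start timestamp).
def _demo_to_lifecycle():
    import datetime as _dt
    trace = Trace()
    trace.append(Event({"concept:name": "A",
                        "start_timestamp": _dt.datetime(2024, 1, 1, 12, 0, 0),
                        "time:timestamp": _dt.datetime(2024, 1, 1, 12, 0, 30)}))
    lc_log = to_lifecycle(EventLog([trace]))
    assert [ev["lifecycle:transition"] for ev in lc_log[0]] == ["start", "complete"]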
def read_event(self):
    """
    Gets the next event from the iterator

    Returns
    ------------
    event
        Event
    """
    tree = self.tree
    while True:
        tree_event, elem = next(self.context)
        if tree_event == _EVENT_START:
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_TRACE):
                self.trace = Trace()
                tree[elem] = self.trace.attributes
                self.reading_trace = True
                continue

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                self.event = Event()
                tree[elem] = self.event
                self.reading_event = True
                continue

            if self.reading_event or self.reading_trace:
                if elem.tag.endswith(xes_constants.TAG_STRING):
                    if parent is not None:
                        tree = parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                               elem.get(xes_constants.KEY_VALUE), tree)
                    continue

                elif elem.tag.endswith(xes_constants.TAG_DATE):
                    try:
                        dt = self.date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                        tree = parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), dt, tree)
                    except TypeError:
                        logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                    except ValueError:
                        logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                    continue

                elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                    if parent is not None:
                        try:
                            val = float(elem.get(xes_constants.KEY_VALUE))
                            tree = parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                        except ValueError:
                            logging.info("failed to parse float: " + str(elem.get(xes_constants.KEY_VALUE)))
                    continue

                elif elem.tag.endswith(xes_constants.TAG_INT):
                    if parent is not None:
                        try:
                            val = int(elem.get(xes_constants.KEY_VALUE))
                            tree = parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                        except ValueError:
                            logging.info("failed to parse int: " + str(elem.get(xes_constants.KEY_VALUE)))
                    continue

                elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                    if parent is not None:
                        try:
                            val0 = elem.get(xes_constants.KEY_VALUE)
                            val = False
                            if str(val0).lower() == "true":
                                val = True
                            tree = parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                        except ValueError:
                            logging.info("failed to parse boolean: " + str(elem.get(xes_constants.KEY_VALUE)))
                    continue

                elif elem.tag.endswith(xes_constants.TAG_LIST):
                    if parent is not None:
                        # lists have no value, hence we put None as a value
                        tree = parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), None, tree)
                    continue

                elif elem.tag.endswith(xes_constants.TAG_ID):
                    if parent is not None:
                        tree = parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                               elem.get(xes_constants.KEY_VALUE), tree)
                    continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                self.reading_event = False
                if self.acceptance_condition(self.event):
                    for attr in self.trace.attributes:
                        self.event[constants.CASE_ATTRIBUTE_PREFIX + attr] = self.trace.attributes[attr]
                    return self.event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                self.reading_trace = False
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                self.reading_log = False
                break
def preprocess_log(log, activities=None, parameters=None):
    """
    Preprocess a log to enable correlation mining

    Parameters
    --------------
    log
        Log object
    activities
        (if provided) list of activities of the log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    transf_stream
        Transformed stream
    activities_grouped
        Grouped activities
    activities
        List of activities of the log
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                     xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, DEFAULT_INDEX_KEY)

    if type(log) is pd.DataFrame:
        # keep only the needed columns before the conversion
        log = log[list(set([activity_key, timestamp_key, start_timestamp_key]))]
        parameters["deepcopy"] = False
        parameters["include_case_attributes"] = False

    log = converter.apply(log, variant=converter.TO_EVENT_STREAM, parameters=parameters)

    transf_stream = EventStream()
    for idx, ev in enumerate(log):
        transf_stream.append(
            Event({activity_key: ev[activity_key], timestamp_key: ev[timestamp_key].timestamp(),
                   start_timestamp_key: ev[start_timestamp_key].timestamp(), index_key: idx}))
    transf_stream = sorted(transf_stream,
                           key=lambda x: (x[start_timestamp_key], x[timestamp_key], x[index_key]))

    if activities is None:
        activities = sorted(list(set(x[activity_key] for x in transf_stream)))

    activities_grouped = {x: [y for y in transf_stream if y[activity_key] == x] for x in activities}

    return transf_stream, activities_grouped, activities
def apply(frequency_dfg: Dict[Tuple[str, str], int], start_activities: Dict[str, int],
          end_activities: Dict[str, int], parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
    """
    Simulates a log out of the transition probabilities provided by the frequency DFG,
    and the time deltas provided by the performance DFG

    Parameters
    ---------------
    frequency_dfg
        Frequency DFG
    start_activities
        Start activities
    end_activities
        End activities
    parameters
        Parameters of the algorithm, including:
        - Parameters.NUM_TRACES: the number of traces of the simulated log
        - Parameters.ACTIVITY_KEY: the activity key to be used in the simulated log
        - Parameters.TIMESTAMP_KEY: the timestamp key to be used in the simulated log
        - Parameters.CASE_ID_KEY: the case identifier key to be used in the simulated log
        - Parameters.CASE_ARRIVAL_RATE: the average distance (in seconds) between the start of two cases
          (default: 1)
        - Parameters.PERFORMANCE_DFG: (mandatory) the performance DFG that is used for the time deltas.

    Returns
    ---------------
    simulated_log
        Simulated log
    """
    if parameters is None:
        parameters = {}

    num_traces = exec_utils.get_param_value(Parameters.NUM_TRACES, parameters, 1000)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, xes_constants.DEFAULT_TRACEID_KEY)
    case_arrival_rate = exec_utils.get_param_value(Parameters.CASE_ARRIVAL_RATE, parameters, 1)
    performance_dfg = copy(exec_utils.get_param_value(Parameters.PERFORMANCE_DFG, parameters, None))
    frequency_dfg = copy(frequency_dfg)

    if performance_dfg is None:
        # check this before enriching the DFGs with the artificial activities,
        # as the enrichment would otherwise fail on a None performance DFG
        raise Exception(
            "performance DFG simulation requires the Parameters.PERFORMANCE_DFG ('performance_dfg') parameter specification.")

    artificial_start_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_START_ACTIVITY, parameters,
                                                           constants.DEFAULT_ARTIFICIAL_START_ACTIVITY)
    artificial_end_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_END_ACTIVITY, parameters,
                                                         constants.DEFAULT_ARTIFICIAL_END_ACTIVITY)

    for sa in start_activities:
        frequency_dfg[(artificial_start_activity, sa)] = start_activities[sa]
        performance_dfg[(artificial_start_activity, sa)] = 0
    for ea in end_activities:
        frequency_dfg[(ea, artificial_end_activity)] = end_activities[ea]
        performance_dfg[(ea, artificial_end_activity)] = 0

    choices = {}
    for el in frequency_dfg:
        if not el[0] in choices:
            choices[el[0]] = {}
        choices[el[0]][el[1]] = frequency_dfg[el]

    log = EventLog()
    curr_st = 10000000

    for i in range(num_traces):
        curr_st += case_arrival_rate
        curr_t = curr_st
        trace = Trace(attributes={case_id_key: str(i)})
        log.append(trace)
        curr_act = artificial_start_activity
        while True:
            next_act = dict_based_choice(choices[curr_act])
            if next_act == artificial_end_activity or next_act is None:
                break
            perf = performance_dfg[(curr_act, next_act)]
            if type(perf) is dict:
                perf = perf["mean"]
            perf = 0 if perf == 0 else exponential(perf)
            curr_t += perf
            curr_act = next_act
            eve = Event({activity_key: curr_act, timestamp_key: datetime.fromtimestamp(curr_t)})
            trace.append(eve)

    return log
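
# Usage sketch (illustrative): simulates 10 traces of the linear process A -> B
# with the apply function above, with an average of 5 seconds between the two
# activities (the performance DFG is mandatory).
def _demo_performance_dfg_playout():
    simulated_log = apply(
        {("A", "B"): 100}, {"A": 100}, {"B": 100},
        parameters={Parameters.PERFORMANCE_DFG: {("A", "B"): 5.0},
                    Parameters.NUM_TRACES: 10})
    assert len(simulated_log) == 10
    assert [ev["concept:name"] for ev in simulated_log[0]] == ["A", "B"]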