def sort_timestamp_stream(event_log, timestamp_key=xes.DEFAULT_TIMESTAMP_KEY, reverse_sort=False):
    """
    Sort an event stream based on its timestamp key

    Parameters
    -----------
    event_log
        Event stream
    timestamp_key
        Timestamp key
    reverse_sort
        If True, sorts in descending instead of ascending order

    Returns
    -----------
    event_log
        Sorted event stream
    """
    events = sorted(event_log._list, key=lambda x: x[timestamp_key], reverse=reverse_sort)
    new_stream = EventStream(events, attributes=event_log.attributes, extensions=event_log.extensions,
                             omni_present=event_log.omni_present, classifiers=event_log.classifiers,
                             properties=event_log.properties)
    return new_stream
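
# Usage sketch (not from the source): sorting a small stream ascending by the
# default XES timestamp key. The variable names and event contents are made up.
import datetime
from pm4py.objects.log.obj import Event, EventStream

stream = EventStream([
    Event({"concept:name": "B", "time:timestamp": datetime.datetime(2024, 1, 2)}),
    Event({"concept:name": "A", "time:timestamp": datetime.datetime(2024, 1, 1)}),
])
sorted_stream = sort_timestamp_stream(stream)  # event "A" now comes first
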
def table_to_stream(table, parameters=None):
    """
    Converts a Pyarrow table to an event stream

    Parameters
    ------------
    table
        Pyarrow table
    parameters
        Possible parameters of the algorithm
    """
    if parameters is None:
        parameters = {}

    dict0 = table.to_pydict()
    keys = list(dict0.keys())
    # for legacy format support
    if LEGACY_PARQUET_CASECONCEPTNAME in keys:
        for key in keys:
            dict0[key.replace(LEGACY_PARQUET_TP_REPLACER, ":")] = dict0.pop(key)

    stream = EventStream([dict(zip(dict0, i)) for i in zip(*dict0.values())])

    return stream
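
# Illustrative sketch (assumed data, not from the source): a tiny pyarrow table
# with XES-style column names converted to an EventStream of row dictionaries.
import pyarrow as pa

table = pa.table({
    "case:concept:name": ["1", "1", "2"],
    "concept:name": ["register", "pay", "register"],
})
stream_from_table = table_to_stream(table)  # 3 events, one per table row
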
def new_window(self, begin, end, activity=''):
    # increment the id of the window
    if activity:  # when using a detector for an attribute of the activity
        print(f'Generating model for sub-log [{begin} - {end - 1}] - window '
              f'[{self.window_count[activity]}] - activity [{activity}]')
        self.window_count[activity] += 1
    else:
        print(f'Generating model for sub-log [{begin} - {end - 1}] - window [{self.window_count}]')
        self.window_count += 1

    if self.current_parameters.read_log_as == ReadLogAs.EVENT.name:
        # generate the sub-log for the window
        window = EventStream(self.event_data[begin:end])
        sub_log = log_converter.apply(window, variant=log_converter.Variants.TO_EVENT_LOG)
    elif self.current_parameters.read_log_as == ReadLogAs.TRACE.name:
        sub_log = EventLog(self.event_data[begin:end])
    else:
        print(f'Incorrect window type: {self.current_parameters.read_log_as}.')
        return  # avoid exporting an undefined sub-log below

    # save the sub-log
    output_path = os.path.join(self.logs_path, self.current_parameters.logname, activity)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    if activity:
        output_filename = os.path.join(output_path,
                                       f'sublog_w{self.window_count[activity]}_{begin}_{end - 1}.xes')
    else:
        output_filename = os.path.join(output_path, f'sublog_w{self.window_count}_{begin}_{end - 1}.xes')
    xes_exporter.apply(sub_log, output_filename)
    self.execute_processes_for_window(sub_log, begin, activity)
def sort_lambda_stream(event_log, sort_function, reverse=False):
    """
    Sort a stream based on a lambda expression

    Parameters
    ------------
    event_log
        Stream
    sort_function
        Sort function
    reverse
        Boolean (sort in reverse order)

    Returns
    ------------
    stream
        Sorted stream
    """
    events = sorted(event_log._list, key=sort_function, reverse=reverse)
    new_stream = EventStream(events, attributes=event_log.attributes, extensions=event_log.extensions,
                             omni_present=event_log.omni_present, classifiers=event_log.classifiers,
                             properties=event_log.properties)
    return new_stream
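
# Sketch: the sort key can combine several attributes. Here events are ordered
# by activity name, then timestamp (both standard XES keys; `stream` is the
# illustrative stream built in the earlier sketch).
by_activity = sort_lambda_stream(
    stream, lambda ev: (ev["concept:name"], ev["time:timestamp"]))
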
def apply_numeric_events(log, int1, int2, parameters=None):
    """
    Apply a filter on events (numerical filter)

    Parameters
    --------------
    log
        Log
    int1
        Lower bound of the interval
    int2
        Upper bound of the interval
    parameters
        Possible parameters of the algorithm:
            Parameters.ATTRIBUTE_KEY => indicates which attribute to filter
            Parameters.POSITIVE => keep or remove the events inside the interval?

    Returns
    --------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM)
    if exec_utils.get_param_value(Parameters.POSITIVE, parameters, True):
        stream = EventStream(
            list(filter(lambda x: attribute_key in x and int1 <= x[attribute_key] <= int2, stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)
    else:
        stream = EventStream(
            list(filter(lambda x: attribute_key in x and (x[attribute_key] < int1 or x[attribute_key] > int2),
                        stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)
    filtered_log = log_converter.apply(stream)
    return filtered_log
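
# Sketch (assumed attribute, not from the source): keep only events whose
# made-up "amount" attribute falls inside [100, 500]; `log` is assumed to be
# an EventLog loaded elsewhere, and Parameters is the enum used above.
filtered_by_amount = apply_numeric_events(log, 100, 500,
                                          parameters={Parameters.ATTRIBUTE_KEY: "amount"})
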
def sample_stream(event_log, no_events=100):
    """
    Randomly sample a fixed number of events from the original event stream

    Parameters
    -----------
    event_log
        Event stream
    no_events
        Number of events that the sample should have

    Returns
    -----------
    new_log
        Sampled stream
    """
    new_log = EventStream(attributes=event_log.attributes, extensions=event_log.extensions,
                          globals=event_log.omni_present, classifiers=event_log.classifiers)
    new_log._list = random.sample(event_log, min(no_events, len(event_log)))
    return new_log
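
# Sketch: draw at most 50 random events from a stream; the min(...) guard
# above returns the whole stream when it holds fewer than 50 events.
sample = sample_stream(stream, no_events=50)
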
def process_two_fixed_sliding_windows(self, event_data, initial_index_w1, initial_index_w2, winsize):
    print(f'process_two_windows w1: {initial_index_w1} w2: {initial_index_w2} winsize: {winsize}')
    if self.current_parameters.read_log_as == ReadLogAs.EVENT.name:
        # generate the sub-logs for the two windows
        window1 = EventStream(event_data[initial_index_w1:initial_index_w1 + winsize])
        window2 = EventStream(event_data[initial_index_w2:initial_index_w2 + winsize])
        sub_log1 = log_converter.apply(window1, variant=log_converter.Variants.TO_EVENT_LOG)
        sub_log2 = log_converter.apply(window2, variant=log_converter.Variants.TO_EVENT_LOG)
    elif self.current_parameters.read_log_as == ReadLogAs.TRACE.name:
        sub_log1 = EventLog(event_data[initial_index_w1:initial_index_w1 + winsize])
        sub_log2 = EventLog(event_data[initial_index_w2:initial_index_w2 + winsize])
        print(f'Sub-log1: {len(sub_log1)} - Sub-log2: {len(sub_log2)}')
    else:
        print(f'Incorrect window type: {self.current_parameters.read_log_as}.')
        return  # avoid comparing undefined sub-logs below

    # TODO remove after debugging
    # for debug purposes
    # dataframe = log_converter.apply(sub_log1, variant=log_converter.Variants.TO_DATA_FRAME)
    # dataframe.to_csv(f'data/debug/{self.current_parameters.logname}_{self.window_count}_log1.csv')
    # dataframe = log_converter.apply(sub_log2, variant=log_converter.Variants.TO_DATA_FRAME)
    # dataframe.to_csv(f'data/debug/{self.current_parameters.logname}_{self.window_count}_log2.csv')

    self.compare_sliding_fixed_windows(sub_log1, sub_log2)
def apply_events(log, values, parameters=None):
    """
    Filter log by keeping only events with an attribute value that belongs to the provided values list

    Parameters
    -----------
    log
        Log
    values
        Allowed attribute values
    parameters
        Parameters of the algorithm, including:
            Parameters.ATTRIBUTE_KEY -> Attribute to filter on (defaults to the activity name)
            Parameters.POSITIVE -> Indicates if events should be kept/removed

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM)
    if positive:
        stream = EventStream(list(filter(lambda x: x[attribute_key] in values, stream)),
                             attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                             omni_present=log.omni_present, properties=log.properties)
    else:
        stream = EventStream(list(filter(lambda x: x[attribute_key] not in values, stream)),
                             attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                             omni_present=log.omni_present, properties=log.properties)
    filtered_log = log_converter.apply(stream)
    return filtered_log
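
# Sketch: the default ATTRIBUTE_KEY is the activity name ("concept:name"), so
# this keeps only events for the two (made-up) activities; passing
# Parameters.POSITIVE = False would remove them instead.
kept = apply_events(log, ["register request", "pay invoice"])
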
def filter_log_events_attr(log, values, parameters=None):
    """
    Filter log by keeping only events with an attribute value that belongs to the provided values list

    Parameters
    -----------
    log
        Log
    values
        Allowed attribute values
    parameters
        Parameters of the algorithm, including:
            Parameters.ATTRIBUTE_KEY -> Attribute to filter on
            Parameters.POSITIVE -> Indicates if events should be kept/removed

    Returns
    -----------
    filtered_log
        Filtered log
    """
    # CODE SAVING FROM FILTERS
    if parameters is None:
        parameters = {}
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM,
                                 parameters={"deepcopy": False})
    if positive:
        stream = EventStream(list(filter(lambda x: x[attribute_key] in values, stream)))
    else:
        stream = EventStream(list(filter(lambda x: x[attribute_key] not in values, stream)))
    filtered_log = log_converter.apply(stream, variant=log_converter.Variants.TO_EVENT_LOG)
    return filtered_log
def apply_events(
        log: EventLog,
        dt1: Union[str, datetime.datetime],
        dt2: Union[str, datetime.datetime],
        parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
    """
    Get a new log containing all the events contained in the given interval

    Parameters
    -----------
    log
        Log
    dt1
        Lower bound of the interval
    dt2
        Upper bound of the interval
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp

    Returns
    ------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM,
                                 parameters={"deepcopy": False})
    filtered_stream = EventStream(
        [x for x in stream if dt1 <= x[timestamp_key].replace(tzinfo=None) <= dt2],
        attributes=log.attributes, extensions=log.extensions, omni_present=log.omni_present,
        classifiers=log.classifiers, properties=log.properties)
    filtered_log = log_converter.apply(filtered_stream, variant=log_converter.Variants.TO_EVENT_LOG)
    return filtered_log
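
# Sketch: keep only events whose timestamp falls in the given interval. The
# string format is one accepted by get_dt_from_string; the dates are made up.
in_window = apply_events(log, "2011-03-09 00:00:00", "2012-01-18 23:59:59")
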
def preprocess_log(log, activities=None, parameters=None):
    """
    Preprocess a log to enable correlation mining

    Parameters
    --------------
    log
        Log object
    activities
        (if provided) list of activities of the log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    transf_stream
        Transformed stream
    activities_grouped
        Grouped activities
    activities
        List of activities of the log
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                     xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, DEFAULT_INDEX_KEY)

    if type(log) is pd.DataFrame:
        # keep only the needed columns before conversion
        log = log[list(set([activity_key, timestamp_key, start_timestamp_key]))]

    parameters["deepcopy"] = False
    parameters["include_case_attributes"] = False

    log = converter.apply(log, variant=converter.TO_EVENT_STREAM, parameters=parameters)

    transf_stream = EventStream()
    for idx, ev in enumerate(log):
        transf_stream.append(
            Event({activity_key: ev[activity_key], timestamp_key: ev[timestamp_key].timestamp(),
                   start_timestamp_key: ev[start_timestamp_key].timestamp(), index_key: idx}))
    transf_stream = sorted(transf_stream,
                           key=lambda x: (x[start_timestamp_key], x[timestamp_key], x[index_key]))

    if activities is None:
        activities = sorted(list(set(x[activity_key] for x in transf_stream)))

    activities_grouped = {x: [y for y in transf_stream if y[activity_key] == x] for x in activities}

    return transf_stream, activities_grouped, activities
def apply_numeric(log, int1, int2, parameters=None):
    """
    Apply a filter on cases (numerical filter)

    Parameters
    --------------
    log
        Log
    int1
        Lower bound of the interval
    int2
        Upper bound of the interval
    parameters
        Possible parameters of the algorithm

    Returns
    --------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    case_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, xes.DEFAULT_TRACEID_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    # the stream_filter_key parameters are helpful to filter on cases containing an event with
    # an attribute in the specified interval, where such events are additionally required to
    # have particular values for other attributes (e.g. a given activity)
    stream_filter_key1 = exec_utils.get_param_value(Parameters.STREAM_FILTER_KEY1, parameters, None)
    stream_filter_value1 = exec_utils.get_param_value(Parameters.STREAM_FILTER_VALUE1, parameters, None)
    stream_filter_key2 = exec_utils.get_param_value(Parameters.STREAM_FILTER_KEY2, parameters, None)
    stream_filter_value2 = exec_utils.get_param_value(Parameters.STREAM_FILTER_VALUE2, parameters, None)

    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM)
    if stream_filter_key1 is not None:
        stream = EventStream(
            list(filter(lambda x: stream_filter_key1 in x and x[stream_filter_key1] == stream_filter_value1,
                        stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)
    if stream_filter_key2 is not None:
        stream = EventStream(
            list(filter(lambda x: stream_filter_key2 in x and x[stream_filter_key2] == stream_filter_value2,
                        stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)

    if positive:
        stream = EventStream(
            list(filter(lambda x: attribute_key in x and int1 <= x[attribute_key] <= int2, stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)
    else:
        stream = EventStream(
            list(filter(lambda x: attribute_key in x and (x[attribute_key] < int1 or x[attribute_key] > int2),
                        stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)

    all_cases_ids = set(x["case:" + case_key] for x in stream)

    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions,
                            classifiers=log.classifiers, omni_present=log.omni_present,
                            properties=log.properties)
    for case in log:
        if case.attributes[case_key] in all_cases_ids:
            filtered_log.append(case)

    return filtered_log
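
# Sketch (assumed attribute names, not from the source): keep cases containing
# an event whose made-up "amount" attribute is in [100, 500], where that event
# additionally has activity "pay invoice" (restricted through the
# STREAM_FILTER_KEY1/VALUE1 pair described above).
filtered_cases = apply_numeric(log, 100, 500, parameters={
    Parameters.ATTRIBUTE_KEY: "amount",
    Parameters.STREAM_FILTER_KEY1: "concept:name",
    Parameters.STREAM_FILTER_VALUE1: "pay invoice",
})
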
import os
from datetime import datetime

from pyspark.sql import functions as f

import config
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.obj import EventStream, EventLog

# create_default_spark_session and log_rdd are project-local helpers whose
# imports were not included in the original snippet
spark_session = create_default_spark_session()


def timestamp_to_iso_str(ts):
    # the source timestamps are in milliseconds since the epoch
    return datetime.fromtimestamp(ts / 1000).isoformat()


udf = f.udf(lambda x: timestamp_to_iso_str(x))

df = spark_session.read \
    .json(os.path.join(config.ROOT_DIR, "data/logs_leche.json"), multiLine="true") \
    .filter(f.col("stage") == "Fabricacion") \
    .filter(f.col("timestamp").isNotNull()) \
    .withColumn("timestamp", udf(f.col("timestamp")))

logs_formated = log_rdd.format_df(df, case_id="id", task_id="task",
                                  event_timestamp="timestamp").collect()

xes_exporter.apply(EventLog(EventStream(logs_formated)),
                   os.path.join(config.ROOT_DIR, "data/logs_leche.xes"))

# x = xes_importer.apply(os.path.join(config.ROOT_DIR, "data/M2.xes"))
# print(type(x))