Example #1
def sort_timestamp_stream(event_log,
                          timestamp_key=xes.DEFAULT_TIMESTAMP_KEY,
                          reverse_sort=False):
    """
    Sort an event log based on timestamp key

    Parameters
    -----------
    event_log
        Event log
    timestamp_key
        Timestamp key
    reverse_sort
        If True, sort in descending order (by default the sort is ascending)

    Returns
    -----------
    event_log
        Sorted event log
    """
    events = sorted(event_log._list,
                    key=lambda x: x[timestamp_key],
                    reverse=reverse_sort)
    new_stream = EventStream(events,
                             attributes=event_log.attributes,
                             extensions=event_log.extensions,
                             omni_present=event_log.omni_present,
                             classifiers=event_log.classifiers,
                             properties=event_log.properties)
    return new_stream
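A minimal usage sketch (assuming pm4py is installed and sort_timestamp_stream is defined as above; the events below are invented for illustration):

import datetime
from pm4py.objects.log.obj import Event, EventStream

stream = EventStream([
    Event({"concept:name": "B", "time:timestamp": datetime.datetime(2021, 1, 2)}),
    Event({"concept:name": "A", "time:timestamp": datetime.datetime(2021, 1, 1)}),
])
# sorts ascending on the default XES timestamp key ("time:timestamp")
sorted_stream = sort_timestamp_stream(stream)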
Example #2
def table_to_stream(table, parameters=None):
    """
    Converts a Pyarrow table to an event stream

    Parameters
    ------------
    table
        Pyarrow table
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    stream
        Event stream
    """
    if parameters is None:
        parameters = {}

    dict0 = table.to_pydict()
    keys = list(dict0.keys())
    # for legacy format support
    if LEGACY_PARQUET_CASECONCEPTNAME in keys:
        for key in keys:
            dict0[key.replace(LEGACY_PARQUET_TP_REPLACER,
                              ":")] = dict0.pop(key)

    stream = EventStream([dict(zip(dict0, i)) for i in zip(*dict0.values())])

    return stream
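A usage sketch (assuming pyarrow is installed and that table_to_stream, together with the LEGACY_* constants it references, is defined as above):

import pyarrow as pa

table = pa.Table.from_pydict({
    "case:concept:name": ["1", "1"],
    "concept:name": ["register", "approve"],
})
stream = table_to_stream(table)  # one event dict per table row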
Example #3
    def new_window(self, begin, end, activity=''):
        # increment the id of the window
        if activity:  # when using a detector for an attribute of the activity
            print(
                f'Generating model for sub-log [{begin} - {end - 1}] - window [{self.window_count[activity]}] - activity [{activity}]')
            self.window_count[activity] += 1
        else:
            print(f'Generating model for sub-log [{begin} - {end - 1}] - window [{self.window_count}]')
            self.window_count += 1

        if self.current_parameters.read_log_as == ReadLogAs.EVENT.name:
            # generate the sub-log for the window
            window = EventStream(self.event_data[begin:end])
            sub_log = log_converter.apply(window, variant=log_converter.Variants.TO_EVENT_LOG)
        elif self.current_parameters.read_log_as == ReadLogAs.TRACE.name:
            sub_log = EventLog(self.event_data[begin:end])
        else:
            print(f'Incorrect window type: {self.current_parameters.read_log_as}.')
            return  # sub_log is undefined in this case; bail out

        # save the sub-log
        output_path = os.path.join(self.logs_path, self.current_parameters.logname, activity)
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        if activity:
            output_filename = os.path.join(output_path, f'sublog_w{self.window_count[activity]}_{begin}_{end - 1}.xes')
        else:
            output_filename = os.path.join(output_path, f'sublog_w{self.window_count}_{begin}_{end - 1}.xes')
        xes_exporter.apply(sub_log, output_filename)

        self.execute_processes_for_window(sub_log, begin, activity)
Example #4
def sort_lambda_stream(event_log, sort_function, reverse=False):
    """
    Sort a stream based on a lambda expression

    Parameters
    ------------
    event_log
        Stream
    sort_function
        Sort function
    reverse
        If True, sort in reverse (descending) order

    Returns
    ------------
    stream
        Sorted stream
    """
    events = sorted(event_log._list, key=sort_function, reverse=reverse)
    new_stream = EventStream(events,
                             attributes=event_log.attributes,
                             extensions=event_log.extensions,
                             omni_present=event_log.omni_present,
                             classifiers=event_log.classifiers,
                             properties=event_log.properties)

    return new_stream
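A usage sketch (assuming sort_lambda_stream is defined as above; any callable that works as a sorted() key can serve as the sort function):

from pm4py.objects.log.obj import Event, EventStream

stream = EventStream([Event({"concept:name": "B"}), Event({"concept:name": "A"})])
# sort events alphabetically by activity name
by_activity = sort_lambda_stream(stream, lambda ev: ev["concept:name"])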
Example #5
def apply_numeric_events(log, int1, int2, parameters=None):
    """
    Apply a filter on events (numerical filter)

    Parameters
    --------------
    log
        Log
    int1
        Lower bound of the interval
    int2
        Upper bound of the interval
    parameters
        Possible parameters of the algorithm:
            Parameters.ATTRIBUTE_KEY => indicates which attribute to filter
            Parameters.POSITIVE => if True, keep events in the interval; if False, remove them

    Returns
    --------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)

    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM)
    if exec_utils.get_param_value(Parameters.POSITIVE, parameters, True):
        stream = EventStream(list(filter(lambda x: attribute_key in x and int1 <= x[attribute_key] <= int2, stream)),
                             attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                             omni_present=log.omni_present, properties=log.properties)
    else:
        stream = EventStream(
            list(filter(lambda x: attribute_key in x and (x[attribute_key] < int1 or x[attribute_key] > int2), stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)
    filtered_log = log_converter.apply(stream)

    return filtered_log
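A usage sketch (assuming apply_numeric_events and its Parameters enum are defined as above, and given some pm4py log object log; the attribute name "amount" is invented):

# keep only events whose "amount" lies in the interval [50, 100]
filtered = apply_numeric_events(log, 50, 100,
                                parameters={Parameters.ATTRIBUTE_KEY: "amount"})
# with Parameters.POSITIVE: False, those events would be removed instead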
Example #6
def sample_stream(event_log, no_events=100):
    """
    Randomly sample a fixed number of events from the original event log

    Parameters
    -----------
    event_log
        Event log
    no_events
        Number of events that the sample should have

    Returns
    -----------
    new_log
        Sampled event stream
    """
    new_log = EventStream(attributes=event_log.attributes,
                          extensions=event_log.extensions,
                          omni_present=event_log.omni_present,
                          classifiers=event_log.classifiers)
    new_log._list = random.sample(event_log, min(no_events, len(event_log)))
    return new_log
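A usage sketch (assuming sample_stream is defined as above and the random module is imported in its module):

from pm4py.objects.log.obj import Event, EventStream

stream = EventStream([Event({"concept:name": str(i)}) for i in range(1000)])
sample = sample_stream(stream, no_events=100)  # 100 events drawn without replacement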
Example #7
    def process_two_fixed_sliding_windows(self, event_data, initial_index_w1, initial_index_w2, winsize):
        print(f'process_two_windows w1: {initial_index_w1} w2: {initial_index_w2} winsize: {winsize}')
        if self.current_parameters.read_log_as == ReadLogAs.EVENT.name:
            # generate the sub-log for the window
            window1 = EventStream(event_data[initial_index_w1:initial_index_w1 + winsize])
            window2 = EventStream(event_data[initial_index_w2:initial_index_w2 + winsize])
            sub_log1 = log_converter.apply(window1, variant=log_converter.Variants.TO_EVENT_LOG)
            sub_log2 = log_converter.apply(window2, variant=log_converter.Variants.TO_EVENT_LOG)
        elif self.current_parameters.read_log_as == ReadLogAs.TRACE.name:
            sub_log1 = EventLog(event_data[initial_index_w1:initial_index_w1 + winsize])
            sub_log2 = EventLog(event_data[initial_index_w2:(initial_index_w2 + winsize)])
            print(f'Sub-log1: {len(sub_log1)} - Sub-log2: {len(sub_log2)}')
        else:
            print(f'Incorrect window type: {self.current_parameters.read_log_as}.')
            return  # sub_log1/sub_log2 are undefined in this case; bail out

        # TODO remove after debugging
        # dataframe = log_converter.apply(sub_log1, variant=log_converter.Variants.TO_DATA_FRAME)
        # dataframe.to_csv(f'data/debug/{self.current_parameters.logname}_{self.window_count}_log1.csv')
        # dataframe = log_converter.apply(sub_log2, variant=log_converter.Variants.TO_DATA_FRAME)
        # dataframe.to_csv(f'data/debug/{self.current_parameters.logname}_{self.window_count}_log2.csv')
        self.compare_sliding_fixed_windows(sub_log1, sub_log2)
def apply_events(log, values, parameters=None):
    """
    Filter log by keeping only events with an attribute value that belongs to the provided values list

    Parameters
    -----------
    log
        log
    values
        Allowed attribute values
    parameters
        Parameters of the algorithm, including:
            Parameters.ATTRIBUTE_KEY -> Attribute to filter on
            Parameters.POSITIVE -> If True, keep matching events; if False, remove them

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM)
    if positive:
        stream = EventStream(list(filter(lambda x: x[attribute_key] in values, stream)), attributes=log.attributes,
                             extensions=log.extensions, classifiers=log.classifiers,
                             omni_present=log.omni_present, properties=log.properties)
    else:
        stream = EventStream(list(filter(lambda x: x[attribute_key] not in values, stream)), attributes=log.attributes,
                             extensions=log.extensions, classifiers=log.classifiers,
                             omni_present=log.omni_present, properties=log.properties)

    filtered_log = log_converter.apply(stream)

    return filtered_log
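A usage sketch (assuming apply_events and Parameters are defined as above, and given some pm4py log object log; the activity names are invented):

# keep only events whose activity name is in the list
kept = apply_events(log, ["register request", "decide"],
                    parameters={Parameters.ATTRIBUTE_KEY: "concept:name"})
# with Parameters.POSITIVE: False, those events would be removed instead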
Example #9
def filter_log_events_attr(log, values, parameters=None):
    """
    Filter log by keeping only events with an attribute value that belongs to the provided values list

    Parameters
    -----------
    log
        log
    values
        Allowed attribute values
    parameters
        Parameters of the algorithm, including:
            Parameters.ATTRIBUTE_KEY -> Attribute to filter on
            Parameters.POSITIVE -> If True, keep matching events; if False, remove them

    Returns
    -----------
    filtered_log
        Filtered log
    """

    if parameters is None:
        parameters = {}

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM, parameters={"deepcopy": False})
    if positive:
        stream = EventStream(list(filter(lambda x: x[attribute_key] in values, stream)))
    else:
        stream = EventStream(list(filter(lambda x: x[attribute_key] not in values, stream)))

    filtered_log = log_converter.apply(stream, variant=log_converter.Variants.TO_EVENT_LOG)

    return filtered_log
Example #10
def apply_events(
    log: EventLog,
    dt1: Union[str, datetime.datetime],
    dt2: Union[str, datetime.datetime],
    parameters: Optional[Dict[Union[str, Parameters],
                              Any]] = None) -> EventLog:
    """
    Get a new log containing all the events contained in the given interval

    Parameters
    -----------
    log
        Log
    dt1
        Lower bound of the interval
    dt2
        Upper bound of the interval
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp

    Returns
    ------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               DEFAULT_TIMESTAMP_KEY)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)

    stream = log_converter.apply(log,
                                 variant=log_converter.TO_EVENT_STREAM,
                                 parameters={"deepcopy": False})
    filtered_stream = EventStream([
        x
        for x in stream if dt1 <= x[timestamp_key].replace(tzinfo=None) <= dt2
    ],
                                  attributes=log.attributes,
                                  extensions=log.extensions,
                                  omni_present=log.omni_present,
                                  classifiers=log.classifiers,
                                  properties=log.properties)
    filtered_log = log_converter.apply(
        filtered_stream, variant=log_converter.Variants.TO_EVENT_LOG)

    return filtered_log
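A usage sketch (assuming this apply_events and its imports are defined as above, and given a pm4py EventLog log; the dates are invented):

# keep events whose timestamp falls inside the interval;
# both strings and datetime.datetime objects are accepted as bounds
filtered = apply_events(log, "2011-03-09 00:00:00", "2012-01-18 23:59:59")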
Example #11
def preprocess_log(log, activities=None, parameters=None):
    """
    Preprocess a log to enable correlation mining

    Parameters
    --------------
    log
        Log object
    activities
        (if provided) list of activities of the log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    transf_stream
        Transformed stream
    activities_grouped
        Grouped activities
    activities
        List of activities of the log
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters,
                                           DEFAULT_INDEX_KEY)

    if type(log) is pd.DataFrame:
        # keep only the needed columns before conversion
        log = log[list(set([activity_key, timestamp_key,
                            start_timestamp_key]))]

    parameters["deepcopy"] = False
    parameters["include_case_attributes"] = False
    log = converter.apply(log,
                          variant=converter.TO_EVENT_STREAM,
                          parameters=parameters)
    transf_stream = EventStream()
    for idx, ev in enumerate(log):
        transf_stream.append(
            Event({
                activity_key: ev[activity_key],
                timestamp_key: ev[timestamp_key].timestamp(),
                start_timestamp_key: ev[start_timestamp_key].timestamp(),
                index_key: idx
            }))
    transf_stream = sorted(
        transf_stream,
        key=lambda x: (x[start_timestamp_key], x[timestamp_key], x[index_key]))

    if activities is None:
        activities = sorted(list(set(x[activity_key] for x in transf_stream)))

    activities_grouped = {
        x: [y for y in transf_stream if y[activity_key] == x]
        for x in activities
    }

    return transf_stream, activities_grouped, activities
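A usage sketch (assuming preprocess_log is defined as above, and given a pm4py log or pandas dataframe log):

transf_stream, activities_grouped, activities = preprocess_log(log)
# activities_grouped maps each activity to the list of its events, ordered by
# (start timestamp, completion timestamp, original index)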
Example #12
def apply_numeric(log, int1, int2, parameters=None):
    """
    Apply a filter on cases (numerical filter)

    Parameters
    --------------
    log
        Log
    int1
        Lower bound of the interval
    int2
        Upper bound of the interval
    parameters
        Possible parameters of the algorithm

    Returns
    --------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    case_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, xes.DEFAULT_TRACEID_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    # the stream filter keys allow restricting the numeric filter to events that
    # also carry specific values for other attributes (e.g. a particular activity)

    stream_filter_key1 = exec_utils.get_param_value(Parameters.STREAM_FILTER_KEY1, parameters, None)
    stream_filter_value1 = exec_utils.get_param_value(Parameters.STREAM_FILTER_VALUE1, parameters, None)
    stream_filter_key2 = exec_utils.get_param_value(Parameters.STREAM_FILTER_KEY2, parameters, None)
    stream_filter_value2 = exec_utils.get_param_value(Parameters.STREAM_FILTER_VALUE2, parameters, None)

    stream = log_converter.apply(log, variant=log_converter.TO_EVENT_STREAM)
    if stream_filter_key1 is not None:
        stream = EventStream(
            list(filter(lambda x: stream_filter_key1 in x and x[stream_filter_key1] == stream_filter_value1, stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)
    if stream_filter_key2 is not None:
        stream = EventStream(
            list(filter(lambda x: stream_filter_key2 in x and x[stream_filter_key2] == stream_filter_value2, stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)

    if positive:
        stream = EventStream(list(filter(lambda x: attribute_key in x and int1 <= x[attribute_key] <= int2, stream)),
                             attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                             omni_present=log.omni_present, properties=log.properties)
    else:
        stream = EventStream(
            list(filter(lambda x: attribute_key in x and (x[attribute_key] < int1 or x[attribute_key] > int2), stream)),
            attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
            omni_present=log.omni_present, properties=log.properties)

    all_cases_ids = set(x["case:" + case_key] for x in stream)

    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                            omni_present=log.omni_present, properties=log.properties)

    for case in log:
        if case.attributes[case_key] in all_cases_ids:
            filtered_log.append(case)

    return filtered_log
Example #13
import os
import config  # project-local configuration module
from datetime import datetime
from pyspark.sql import functions as f  # needed for f.udf / f.col below
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.obj import EventStream, EventLog

# create_default_spark_session is a project-local helper (not shown here)
spark_session = create_default_spark_session()


def timestamp_to_iso_str(ts):
    return datetime.fromtimestamp(ts / 1000).isoformat()


udf = f.udf(timestamp_to_iso_str)

df = spark_session.read\
    .json(os.path.join(config.ROOT_DIR, "data/logs_leche.json"), multiLine="true")\
    .filter(f.col("stage") == "Fabricacion")\
    .filter(f.col("timestamp").isNotNull())\
    .withColumn("timestamp", udf(f.col("timestamp")))

# log_rdd is a project-local helper that maps dataframe columns onto the
# standard case id / activity / timestamp fields
logs_formated = log_rdd.format_df(df,
                                  case_id="id",
                                  task_id="task",
                                  event_timestamp="timestamp").collect()

xes_exporter.apply(EventLog(EventStream(logs_formated)),
                   os.path.join(config.ROOT_DIR, "data/logs_leche.xes"))

# x = xes_importer.apply(os.path.join(config.ROOT_DIR, "data/M2.xes"))
# print(type(x))