Example #1
def apply_heu(log: EventLog,
              parameters: Optional[Dict[Any, Any]] = None) -> HeuristicsNet:
    """
    Discovers a heuristics net using the Heuristics Miner++ algorithm

    Implements the approach described in
    Burattin, Andrea, and Alessandro Sperduti. "Heuristics Miner for Time Intervals." ESANN. 2010.

    https://andrea.burattin.net/public-files/publications/2010-esann-slides.pdf

    Parameters
    --------------
    log
        Event log
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.START_TIMESTAMP_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.DEPENDENCY_THRESH
        - Parameters.AND_MEASURE_THRESH
        - Parameters.MIN_ACT_COUNT
        - Parameters.MIN_DFG_OCCURRENCES
        - Parameters.HEU_NET_DECORATION

    Returns
    --------------
    heu_net
        Heuristics net
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log, parameters=parameters)
    log = interval_lifecycle.to_interval(log, parameters=parameters)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)
    if start_timestamp_key is None:
        start_timestamp_key = xes.DEFAULT_START_TIMESTAMP_KEY
        parameters = copy(parameters)
        parameters[Parameters.START_TIMESTAMP_KEY] = start_timestamp_key
    start_activities, end_activities, activities_occurrences, dfg, performance_dfg, sojourn_time, concurrent_activities = discover_abstraction_log(
        log, parameters=parameters)
    return discover_heu_net_plus_plus(start_activities,
                                      end_activities,
                                      activities_occurrences,
                                      dfg,
                                      performance_dfg,
                                      sojourn_time,
                                      concurrent_activities,
                                      parameters=parameters)
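A minimal usage sketch for the function above (the XES file name is a placeholder; the parameter keys are the ones listed in the docstring and the values are only illustrative):

# Usage sketch: placeholder log file, illustrative parameter values.
from pm4py.objects.log.importer.xes import importer as xes_importer

log = xes_importer.apply("example.xes")
heu_net = apply_heu(log, parameters={
    Parameters.DEPENDENCY_THRESH: 0.6,  # prune weak dependencies
    Parameters.MIN_ACT_COUNT: 1,        # keep all activities
})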
Example #2
    def __init__(self, model_type, current_parameters, control, input_path,
                 models_path, metrics_path, logs_path, current_log, discovery, user,
                 drifts_output_path):

        self.current_parameters = current_parameters
        self.user = user
        self.control = control
        self.input_path = input_path
        self.models_path = models_path
        self.metrics_path = metrics_path
        self.logs_path = logs_path
        self.drifts_output_path = drifts_output_path
        self.model_type = model_type

        # instance of the MetricsManager
        if current_parameters.approach == Approach.FIXED.name:
            self.metrics = None
        elif current_parameters.approach == Approach.ADAPTIVE.name:
            self.metrics = {}

        # current loaded event log information
        self.current_log = current_log
        # convert to interval time log if needed
        self.converted_log = interval_lifecycle.to_interval(self.current_log.log)
        # set the event_data as requested by the user (read event by event or trace by trace)
        if self.current_parameters.read_log_as == ReadLogAs.TRACE.name:
            self.event_data = self.converted_log
        elif self.current_parameters.read_log_as == ReadLogAs.EVENT.name:
            # convert the log into an event stream
            self.event_data = log_converter.apply(self.converted_log, variant=log_converter.Variants.TO_EVENT_STREAM)
        else:
            self.event_data = self.converted_log
            print(
                f'The read mode {self.current_parameters.read_log_as} is not defined for IPDD, assuming STREAM OF TRACES')
        # class that implements the discovery method for the current model
        self.discovery = discovery
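The essential step in this constructor is choosing between a trace-by-trace and an event-by-event view of the interval log; a standalone sketch of that choice (prepare_event_data is a hypothetical helper, the pm4py calls are the same ones used above):

# Standalone sketch of the read-mode selection; read_log_as mimics the ReadLogAs values in the snippet.
from pm4py.objects.log.util import interval_lifecycle
from pm4py.objects.conversion.log import converter as log_converter

def prepare_event_data(log, read_log_as="TRACE"):
    converted_log = interval_lifecycle.to_interval(log)
    if read_log_as == "EVENT":
        # read the log event by event
        return log_converter.apply(converted_log,
                                   variant=log_converter.Variants.TO_EVENT_STREAM)
    # default: read the log trace by trace
    return converted_log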
Example #3
def average_duration_activity(
        log: EventLog,
        t1: Union[datetime, str],
        t2: Union[datetime, str],
        r: str,
        a: str,
        parameters: Optional[Dict[str, Any]] = None) -> float:
    """
    The average duration of instances of a given activity completed during a given time slot by a given resource.

    Metric RBI 4.3 in Pika, Anastasiia, et al.
    "Mining resource profiles from event logs." ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    log
        Event log
    t1
        Left endpoint of the time interval
    t2
        Right endpoint of the time interval
    r
        Resource
    a
        Activity

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    t1 = get_dt_from_string(t1)
    t2 = get_dt_from_string(t2)

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters,
        xes_constants.DEFAULT_RESOURCE_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)

    from pm4py.objects.log.util import sorting
    log = sorting.sort_timestamp(log, timestamp_key)
    from pm4py.objects.log.util import interval_lifecycle
    log = interval_lifecycle.to_interval(log, parameters=parameters)
    if start_timestamp_key is None:
        log = __insert_start_from_previous_event(log, parameters=parameters)
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY

    log = converter.apply(log, variant=converter.Variants.TO_EVENT_STREAM)
    log = [
        x for x in log if x[resource_key] == r and x[activity_key] == a
        and x[timestamp_key] >= t1 and x[timestamp_key] < t2
    ]

    return float(
        mean(x[timestamp_key].timestamp() - x[start_timestamp_key].timestamp()
             for x in log))
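A hedged usage sketch for the metric above (the log path, resource, activity, and time bounds are all placeholders; the timestamp strings assume a format accepted by get_dt_from_string):

# Usage sketch for the RBI 4.3 metric above; every concrete value is a placeholder.
from pm4py.objects.log.importer.xes import importer as xes_importer

log = xes_importer.apply("example.xes")
avg_duration = average_duration_activity(
    log,
    "2020-01-01 00:00:00", "2020-06-30 23:59:59",  # time slot [t1, t2)
    r="Sara",           # resource
    a="check ticket",   # activity
)
print(f"average duration: {avg_duration:.2f} seconds")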
Example #4
def __compute_workload(
        log: EventLog,
        resource: Optional[str] = None,
        activity: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None) -> Dict[Tuple, int]:
    """
    Computes the workload of resources/activities, associating to each event the number
    of concurrent events

    Parameters
    ---------------
    log
        event log
    resource
        (if provided) Resource on which we want to compute the workload
    activity
        (if provided) Activity on which we want to compute the workload

    Returns
    ---------------
    workload_dict
        Dictionary associating to each event the number of concurrent events
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters,
        xes_constants.DEFAULT_RESOURCE_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)

    from pm4py.objects.log.util import sorting
    log = sorting.sort_timestamp(log, timestamp_key)
    from pm4py.objects.log.util import interval_lifecycle
    log = interval_lifecycle.to_interval(log, parameters=parameters)
    if start_timestamp_key is None:
        log = __insert_start_from_previous_event(log, parameters=parameters)
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
    events = converter.apply(log, variant=converter.Variants.TO_EVENT_STREAM)
    if resource is not None:
        events = [x for x in events if x[resource_key] == resource]
    if activity is not None:
        events = [x for x in events if x[activity_key] == activity]
    events = [(x[start_timestamp_key].timestamp(),
               x[timestamp_key].timestamp(), x[resource_key], x[activity_key])
              for x in events]
    events = sorted(events)
    from intervaltree import IntervalTree, Interval
    tree = IntervalTree()
    ev_map = {}
    k = 0.000001
    for ev in events:
        tree.add(Interval(ev[0], ev[1] + k))
    for ev in events:
        ev_map[ev] = len(tree[ev[0]:ev[1] + k])
    return ev_map
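The counting step of this helper can be illustrated in isolation (the event intervals below are made up; intervaltree is the same package imported above):

# Isolated sketch of the overlap counting above, with made-up (start, end) intervals.
from intervaltree import IntervalTree, Interval

events = [(0.0, 5.0), (1.0, 3.0), (4.0, 6.0)]  # (start, end) timestamps in seconds
k = 0.000001  # epsilon so that zero-length events still form valid intervals
tree = IntervalTree(Interval(s, e + k) for s, e in events)
workload = {ev: len(tree[ev[0]:ev[1] + k]) for ev in events}
# workload[(0.0, 5.0)] == 3: the first event overlaps the other two (and itself)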
Example #5
    def calculate_waiting_time_similarity(sublog1, sublog2, window):
        # convert to interval log
        new_log1 = EventLog(sublog1)
        new_log2 = EventLog(sublog2)
        interval_log1 = interval_lifecycle.to_interval(new_log1)
        interval_log2 = interval_lifecycle.to_interval(new_log2)

        # get the samples, containing a list of values for each activity
        sample1 = WaitingTime.get_waiting_time(interval_log1)
        sample2 = WaitingTime.get_waiting_time(interval_log2)

        # remove activities that are not present in both samples
        keys_to_remove = []
        for a1 in sample1.keys():
            if a1 not in sample2:
                keys_to_remove.append(a1)
        for key in keys_to_remove:
            sample1.pop(key)

        keys_to_remove = []
        for a2 in sample2.keys():
            if a2 not in sample1:
                keys_to_remove.append(a2)
        for key in keys_to_remove:
            sample2.pop(key)

        # create a dict with all common activities; each entry is later filled with the
        # p-value of the waiting time comparison against the previous window
        activities = {}
        for k in sample1.keys():
            activities[k] = 0

        # TODO Remove after finishing the debug
        # Save the samples
        # Create target directory & all intermediate directories if they don't exist
        folder_name = os.path.join('data', 'debug', 'samples_waiting_time')
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)

        activities_with_difference = []
        for activity in sample1.keys():
            t = None
            p_value = None
            error = None
            try:
                t, p_value = stats.ttest_ind(sample1[activity],
                                             sample2[activity])
            except ValueError as e:
                error = f'T-test cannot be calculated for activity {activity}: [{e}]'
                print(error)

            if p_value and p_value < 0.05:
                # assume alternative hypothesis - evidence of significant difference between the samples
                activities_with_difference.append(activity)

            # to avoid errors when serializing the metric's information
            if p_value is None:
                p_value = f'Not calculated [{error}]'
            elif math.isnan(p_value):
                p_value = 'NaN'
            activities[activity] = p_value  # save the calculated p-value

        total_of_activities = len(sample1)
        percentual_of_difference = len(
            activities_with_difference) / total_of_activities

        return 1 - percentual_of_difference, activities_with_difference, activities
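The per-activity comparison boils down to an independent two-sample t-test on the waiting times observed in the two sublogs; an isolated sketch with made-up samples:

# Isolated sketch of the per-activity test above, with made-up waiting times (seconds).
from scipy import stats

sample1 = {"register request": [30.0, 42.0, 35.0, 40.0]}
sample2 = {"register request": [95.0, 110.0, 102.0, 98.0]}

_, p_value = stats.ttest_ind(sample1["register request"], sample2["register request"])
significant_change = p_value < 0.05  # same threshold as in the snippet above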
Example #6
    def calculate_sojourn_time_similarity(sublog1, sublog2, window,
                                          parameters):
        # convert to interval log
        new_log1 = EventLog(sublog1)
        new_log2 = EventLog(sublog2)
        interval_log1 = interval_lifecycle.to_interval(new_log1)
        interval_log2 = interval_lifecycle.to_interval(new_log2)

        # TODO remove after debugging
        # for debug purpose
        # from pm4py.objects.conversion.log import converter as log_converter
        # dataframe = log_converter.apply(log1, variant=log_converter.Variants.TO_DATA_FRAME)
        # dataframe.to_csv(f'data/debug/{parameters.logname}_{window}_log1.csv')
        # dataframe = log_converter.apply(log2, variant=log_converter.Variants.TO_DATA_FRAME)
        # dataframe.to_csv(f'data/debug/{parameters.logname}_{window}_log2.csv')

        # get the samples, containing a list of values for each activity
        sample1 = SojournTime.get_durations(interval_log1)
        sample2 = SojournTime.get_durations(interval_log2)

        # remove activities that are not present in both samples
        keys_to_remove = []
        for a1 in sample1.keys():
            if a1 not in sample2:
                keys_to_remove.append(a1)
        for key in keys_to_remove:
            sample1.pop(key)

        keys_to_remove = []
        for a2 in sample2.keys():
            if a2 not in sample1:
                keys_to_remove.append(a2)
        for key in keys_to_remove:
            sample2.pop(key)

        # create a dict with all common activities; each entry is later filled with the
        # p-value of the sojourn time comparison against the previous window
        activities = {}
        for k in sample1.keys():
            activities[k] = 0

        # TODO Remove after finishing the debug
        # Save the samples
        # Create target directory & all intermediate directories if they don't exist
        experiment_name = f'{parameters.logname}_winsize{parameters.winsize}'
        folder_name = os.path.join('data', 'debug', experiment_name,
                                   'samples_waiting_time')
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)

        activities_with_difference = []
        for activity in sample1.keys():
            t = None
            p_value = None
            error = None
            try:
                t, p_value = stats.ttest_rel(sample1[activity],
                                             sample2[activity])
            except ValueError as e:
                error = f'Paired t-test cannot be calculated for activity {activity} windows {window - 1}-{window}: [{e}]'
                print(error)

            # TODO Remove after finishing the debug
            # create a file for each activity, containing the samples
            samples = {
                f'w{window - 1}': sample1[activity],
                f'w{window}': sample2[activity]
            }
            df = pd.DataFrame(
                {key: pd.Series(value)
                 for key, value in samples.items()})
            filename = f'test{window - 1}-{window}_{activity}.csv'
            # print(f'Saving CSV file {filename}')
            df.to_csv(os.path.join(folder_name, filename))
            # filename = f'test{window - 1}-{window}_{activity}.xlsx'
            # print(f'Saving excel file {filename}')
            # df.to_excel(os.path.join(folder_name, filename))

            if p_value and p_value < 0.01:
                # assume alternative hypothesis - evidence of significant difference between the samples
                activities_with_difference.append(activity)

            # to avoid errors when serializing the metric's information
            if p_value is None:
                p_value = f'Not calculated [{error}]'
            elif math.isnan(p_value):
                p_value = 'NaN'
            activities[activity] = p_value  # save the calculated p-value

        total_of_activities = len(sample1)
        percentual_of_difference = len(
            activities_with_difference) / total_of_activities

        return 1 - percentual_of_difference, activities_with_difference, activities
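The returned similarity is one minus the fraction of common activities whose paired t-test rejects equal sojourn times at the 0.01 level; a compact sketch with made-up p-values:

# Compact sketch of the final similarity computation above, with made-up p-values.
p_values = {"register request": 0.002, "check ticket": 0.40, "decide": 0.73}
alpha = 0.01  # threshold used for the paired t-test in the snippet
changed = [activity for activity, p in p_values.items() if p < alpha]
similarity = 1 - len(changed) / len(p_values)  # 1 - 1/3, roughly 0.67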