Example #1
def test_case_statistics(self):
    from pm4py.statistics.traces.log import case_statistics
    log = self.get_log()
    case_statistics.get_kde_caseduration(log)
    case_statistics.get_events(log, "N77802")
    case_statistics.get_variant_statistics(log)
    case_statistics.get_cases_description(log)
    case_statistics.get_all_casedurations(log)
    case_statistics.get_first_quartile_caseduration(log)
    case_statistics.get_median_caseduration(log)
Example #2
def get_variants_list(log, parameters=None):
    """
    Gets the list of variants (along with their count) from the given log

    Parameters
    ------------
    log
        Log
    parameters
        Parameters of the algorithm

    Returns
    -------------
    variants_list
        List of variants of the log (along with their count)
    """
    from pm4py.statistics.traces.pandas import case_statistics as pd_case_statistics
    from pm4py.statistics.traces.log import case_statistics as log_case_statistics

    variants_list = []
    if type(log) is pd.DataFrame:
        pd_variants = pd_case_statistics.get_variant_statistics(
            log, parameters=parameters)
        for var in pd_variants:
            varkeys = list(var.keys())
            del varkeys[varkeys.index("variant")]
            variants_list.append((var["variant"], var[varkeys[0]]))
    else:
        log_variants = log_case_statistics.get_variant_statistics(
            log, parameters=parameters)
        for var in log_variants:
            varkeys = list(var.keys())
            del varkeys[varkeys.index("variant")]
            variants_list.append((var["variant"], var[varkeys[0]]))
    return variants_list
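A minimal usage sketch for the function above (the XES path is a placeholder; the old factory importer used in the later examples is assumed):

from pm4py.objects.log.importer.xes import factory as xes_import_factory

log = xes_import_factory.apply("example.xes")  # placeholder path
variants = get_variants_list(log)
print(variants[:3])  # list of (variant, count) tuples, e.g. [('A,B,C', 12), ...]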
Example #3
def W_creater(log, R, w, output=False):

    W = []
    log = variants_filter.apply(log, R)
    target_size = len(log) * w  # it determines the size of W
    variant = case_statistics.get_variant_statistics(log)
    variant = sorted(variant, key=lambda x: x['count'], reverse=True)
    if output:
        print(
            "=" * 100,
            "\nW creater called with w : {} and target size {}\n".format(
                w, target_size))
    W_size = 0
    for v in variant:
        W_size += v['count']
        W.append(v['variant'])
        if output:
            print(
                "\t\t{}___added with size {} // {} out of {}  // total size : {}"
                .format(v['variant'][:60], v['count'], W_size, target_size,
                        len(log)))

        if W_size > target_size:
            break

    if output:
        print("W creater END with its size: {}".format(len(W)))
        print("=" * 100)
    return W
Example #4
def variant_filter(log):
    new_log = EventLog()
    result = []
    variant_list = get.get_variants(log)
    variant_list_count = case_statistics.get_variant_statistics(log)
    sampled = random.sample(variant_list_count, 1000)
    vlist = [v['variant'] for v in variant_list_count]
    vlist_s = [v['variant'] for v in sampled]
    for v in vlist:
        if v in vlist_s:
            for trace in variant_list[v]:
                new_log.append(trace)

    new_len = len(case_statistics.get_variant_statistics(new_log))
    result.extend([new_len, len(new_log), len(unique_activities(new_log))])
    return new_log
Example #5
def sublog_percent(log, upper_percent, parameters=None):
    '''
    Convert the variant dictionary obtained from the sublog into a DataFrame, so that the frequency of each variant can be extracted
    :param log: same as sublog2varlist()
    :param upper_percent: upper bound on the cumulative variant frequency (as a fraction of the traces) to keep
    :param parameters: optional parameters; Parameters.LOWER_PERCENT sets the lower cumulative bound (default 0)
    :return: DataFrame of variants with their counts, together with the corresponding var_list (up to the given percentage)
    '''

    if parameters is None:
        parameters = {}
    lower_percent = exec_utils.get_param_value(Parameters.LOWER_PERCENT,
                                               parameters, 0)

    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count,
                            key=lambda x: x['count'],
                            reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    # calculate the cumulative sum
    csum = np.array(df['count']).cumsum()
    csum = csum / csum[-1]
    num_list = csum[csum <= upper_percent]
    num_list_lower = csum[csum <= lower_percent]
    # keep variants until the cumulative percentage is reached
    df_w_count = df.iloc[len(num_list_lower):len(num_list), :]
    # get the corresponding var_list
    filtered_var_list = df_w_count['variant'].values.tolist()
    str_var_list = [
        variants_util.get_activities_from_variant(v) for v in filtered_var_list
    ]

    return df_w_count, str_var_list
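A hedged usage sketch for sublog_percent: keep the variants that together cover the first 80% of the traces (the path and the 0.8 value are illustrative):

from pm4py.objects.log.importer.xes import factory as xes_import_factory

log = xes_import_factory.apply("example.xes")  # placeholder path
df_w_count, str_var_list = sublog_percent(log, 0.8)
print(df_w_count[['variant', 'count']].head())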
Example #6
def sublog_percent2varlist(log, upper_percent, parameters=None):
    '''
    Same as sublog_percent(), but only the variant list is needed
    :param log: same as sublog2varlist()
    :param upper_percent: same as sublog_percent()
    :param parameters: optional parameters; Parameters.LOWER_PERCENT sets the lower cumulative bound (default 0)
    :return: DataFrame of variants with their counts, together with the corresponding var_list (up to the given percentage)
    '''

    if parameters is None:
        parameters = {}
    lower_percent = exec_utils.get_param_value(Parameters.LOWER_PERCENT,
                                               parameters, 0)

    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count,
                            key=lambda x: x['count'],
                            reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    # calculate the cumulative sum
    csum = np.array(df['count']).cumsum()
    csum = csum / csum[-1]
    num_list = csum[csum <= upper_percent]
    num_list_lower = csum[csum <= lower_percent]
    # keep variants until the cumulative percentage is reached
    df_w_count = df.iloc[len(num_list_lower):len(num_list), :]
    # get the corresponding var_list
    filtered_var_list = df_w_count['variant'].values.tolist()
    return df_w_count, filtered_var_list
Example #7
def trace_variant(log):
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count,
                            key=lambda x: x["count"],
                            reverse=True)
    occurrences = [x["count"] for x in variants_count]

    len_occurr, len_log = len(occurrences), len(log)

    ratio_most_common_variant = sum(occurrences[:1]) / len(log)
    ratio_top_1_variants = sum(occurrences[:int(len_occurr * 0.01)]) / len_log
    ratio_top_5_variants = sum(occurrences[:int(len_occurr * 0.05)]) / len_log
    ratio_top_10_variants = sum(occurrences[:int(len_occurr * 0.1)]) / len_log
    ratio_top_20_variants = sum(occurrences[:int(len_occurr * 0.2)]) / len_log
    ratio_top_50_variants = sum(occurrences[:int(len_occurr * 0.5)]) / len_log
    ratio_top_75_variants = sum(occurrences[:int(len_occurr * 0.75)]) / len_log
    mean_variant_occurrence = np.mean(occurrences)
    std_variant_occurrence = np.std(occurrences)
    skewness_variant_occurrence = stats.skew(occurrences)
    kurtosis_variant_occurrence = stats.kurtosis(occurrences)

    return [
        ratio_most_common_variant,
        ratio_top_1_variants,
        ratio_top_5_variants,
        ratio_top_10_variants,
        ratio_top_20_variants,
        ratio_top_50_variants,
        ratio_top_75_variants,
        mean_variant_occurrence,
        std_variant_occurrence,
        skewness_variant_occurrence,
        kurtosis_variant_occurrence,
    ]
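A short usage sketch (placeholder path; the importer mirrors the other examples):

from pm4py.objects.log.importer.xes import factory as xes_import_factory

log = xes_import_factory.apply("example.xes")  # placeholder path
features = trace_variant(log)
# eleven values: coverage ratios of the most common / top 1%-75% variants,
# followed by mean, std, skewness and kurtosis of the variant counts
print(features)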
Example #8
def sublog2varlist(log, freq_thres, num):
    '''
    extract the list of variants from the selected sublog, applying a frequency threshold to filter out infrequent variants
    :param log: sublog containing the selected case attribute value
    :param freq_thres: (int) frequency threshold to filter out infrequent variants
    :param num: (int) number of top-ranked variants to keep even if they fall below the threshold
    :return: list of variants, each as a list of activities
    '''
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count,
                            key=lambda x: x['count'],
                            reverse=True)
    filtered_var_list = []
    filtered_var_list_1 = []
    filtered_var_list_2 = []
    for i in range(len(variants_count)):
        if variants_count[i]['count'] >= freq_thres:
            filtered_var_list_1.append(
                variants_count[i]['variant'])  # variant string
        elif i < num:
            filtered_var_list_2.append(variants_count[i]['variant'])

    # concatenating the two lists preserves the order of the union
    filtered_var_list = filtered_var_list_1 + filtered_var_list_2
    str_var_list = [
        variants_util.get_activities_from_variant(v) for v in filtered_var_list
    ]

    return str_var_list
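An illustrative call of sublog2varlist: keep every variant occurring at least 5 times, plus any of the 10 top-ranked variants that fall below that threshold (both numbers and the path are arbitrary):

from pm4py.objects.log.importer.xes import factory as xes_import_factory

log = xes_import_factory.apply("example.xes")  # placeholder path
str_var_list = sublog2varlist(log, freq_thres=5, num=10)
print(len(str_var_list), str_var_list[:2])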
Example #9
def test_obtaining_variants(self):
    # dummy assignment to avoid static-method warnings; by construction of
    # the unittest package, tests have to be expressed as instance methods
    self.dummy_variable = "dummy_value"
    input_log = os.path.join(INPUT_DATA_DIR, "running-example.xes")
    log = xes_importer.import_log(input_log)
    stats = case_statistics.get_variant_statistics(log)
    del stats
Example #10
def get_statistics(period_1_log, period_2_log):
    variants_count1 = case_statistics.get_variant_statistics(period_1_log)
    variants_count1 = sorted(variants_count1,
                             key=lambda x: x['count'],
                             reverse=True)

    variants_count2 = case_statistics.get_variant_statistics(period_2_log)
    variants_count2 = sorted(variants_count2,
                             key=lambda x: x['count'],
                             reverse=True)

    trace_count1 = 0
    trace_count2 = 0

    for i in variants_count1:
        trace_count1 += i["count"]

    for i in variants_count2:
        trace_count2 += i["count"]
Example #11
def sublog2df_num(log, num):
    '''
    Convert the variant dictionary obtained from the sublog into a DataFrame, so that the frequency of each variant can be extracted
    :param log: same as sublog2varlist()
    :param num: (int) number of most frequent variants to keep
    :return: DataFrame of variants with their counts
    '''
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    df_w_count = df.iloc[0:num, :]
    return df_w_count
Example #12
    def filter_variants(self, filter_level):
        variants_count = case_statistics.get_variant_statistics(self.log)
        variants_count = \
            sorted(variants_count,
                   key=lambda x: x['count'],
                   reverse=True)
        total_traces = len(self.log)
        total_variants = len(variants_count)
        filter_threshold = (1 / total_variants) * filter_level

        desired_variants = \
            [v['variant'] for v in variants_count \
                  if v['count']/total_traces >= filter_threshold]
        self.log = variants_filter.apply(self.log, desired_variants)
Example #13
def dpi_distribution(log):
    '''
    input : event log object - log
    output : numpy array d
    '''

    import matplotlib.pyplot as plt
    from pm4py.statistics.traces.log import case_statistics
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(
        variants_count, key=lambda x: x['count'], reverse=True)
    d = np.zeros(len(variants_count))
    for i, v in enumerate(variants_count):
        d[i] = (v['count'])
    d = np.array(d)
    return d
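A sketch of how the returned distribution might be inspected (the path and the plotting calls are illustrative):

from pm4py.objects.log.importer.xes import factory as xes_import_factory
import matplotlib.pyplot as plt

log = xes_import_factory.apply("example.xes")  # placeholder path
d = dpi_distribution(log)  # variant counts, most frequent first
plt.bar(range(len(d)), d)
plt.xlabel('variant rank')
plt.ylabel('count')
plt.show()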
Example #14
def sublog2df(log, freq_thres, num):
    '''
    Convert the variant dictionary obtained from the sublog into a DataFrame, so that the frequency of each variant can be extracted
    :param log: same as sublog2varlist()
    :param freq_thres: same as sublog2varlist()
    :param num: (int) number of most frequent variants to keep regardless of the threshold
    :return: DataFrame of variants with their counts
    '''
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    df_w_count_1 = df[df['count'] >= freq_thres]
    df_w_count_2 = df.iloc[0:num, :]
    # take union of two dataframes
    df_w_count = pd.merge(df_w_count_1, df_w_count_2, how='outer', on=['variant', 'count'])
    # display(df_w_count['variant'])
    return df_w_count
Example #15
def stat(log):
    '''
    Show statistical information about the log
    :param log: input log
    :return: stat_dict (# of events, # of variants, # of cases)
    '''
    a = case_statistics.get_variant_statistics(log)
    num_event = 0
    for trace in log:
        num_event += len(trace)

    stat_dict = {}
    stat_dict['events'] = num_event
    stat_dict['variants'] = len(a)
    stat_dict['cases'] = len(log)

    return stat_dict
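A quick usage sketch (placeholder path):

from pm4py.objects.log.importer.xes import factory as xes_import_factory

log = xes_import_factory.apply("example.xes")  # placeholder path
print(stat(log))  # {'events': <int>, 'variants': <int>, 'cases': <int>}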
Example #16
def execFreqCase(clusters, EF_df):
    activityL = EF_df['activity'].unique().tolist()
    variant_EF_A = []
    for clusteri in range(len(clusters)):
        # per cluster get the variants along with their count
        variants_count = case_statistics.get_variant_statistics(
            clusters[clusteri])
        for variant in range(len(variants_count)):
            # per variant, count the number of occurrences of each activity
            for key, value in variants_count[variant].items():
                if key == "variant":
                    activityVariant = []
                    for i in range(len(activityL)):
                        EF = len(re.findall(activityL[i], value))
                        if EF > 0:
                            activityVariant.append({
                                'cluster': clusteri,
                                'variant': variant,
                                'activity': activityL[i],
                                'EF': EF
                            })
                else:
                    #also include the count of this variant
                    for item in activityVariant:
                        item.update({"count": value})
            variant_EF_A.extend(activityVariant)
    variant_EF_A_df = pd.DataFrame.from_dict(variant_EF_A,
                                             orient='columns',
                                             dtype=None)
    variant_EF_A_df['EFsum'] = variant_EF_A_df.apply(
        lambda x: x['EF'] * x['count'], axis=1)
    EFc_df = variant_EF_A_df.groupby(by=['cluster', 'activity']).agg({
        'EFsum':
        "sum",
        'count':
        "sum"
    }).reset_index()
    EFc_df['EFc'] = EFc_df.apply(lambda x: x['EFsum'] / x['count'], axis=1)
    EF_EFc_df = pd.merge(left=EF_df,
                         right=EFc_df.drop(['EFsum', 'count'], axis=1),
                         right_on=['cluster', 'activity'],
                         left_on=['cluster', 'activity'],
                         how='left')
    EF_EFc_df = EF_EFc_df.rename(columns={'activityCount': 'EF'})
    EF_EFc_df = EF_EFc_df.fillna(0)
    return (EF_EFc_df)
Example #17
def compute_variant_variability(logpath):
    """
       python function for computing variants in log.

        Args:
        logpath (path): The path of events log to parse
        

        Returns: Number of distinct  variants in log and 
        a dataframe listing variants and thier frequencies
    """
    log = xes_import_factory.apply(logpath)
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count,
                            key=lambda x: x['count'],
                            reverse=True)
    variants_count = pd.DataFrame(variants_count)
    return len(variants_count), variants_count
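A hedged usage sketch (the path is a placeholder; the function imports the log itself):

n_variants, variants_df = compute_variant_variability("example.xes")  # placeholder path
print(n_variants)
print(variants_df.head())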
Example #18
def get_statistics(log, parameters=None):
    """
    Gets the variants (along with their count) from the log

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    variants
        Variants of the event log
    """
    if parameters is None:
        parameters = {}

    variants_statistics = case_statistics.get_variant_statistics(
        log, parameters=parameters)

    return variants_statistics
Example #19
def compute_my_variability(logpath):
    """
    Python function for computing TRACE ENTROPY probabilities exactly the VARIANT frequencies observed in the log.
    This entropy is restricted to the simple likelihood (frequency-based) estimator, as other more complex estimators exist.
  
    Args:
        logpath (path): The path of events log to parse
        
    Returns:
       Trace entropy of all variants in the log (base10, and base2)
    """

    log = xes_import_factory.apply(logpath)
    variants_count = case_statistics.get_variant_statistics(log)
    variant_trace_df = pd.DataFrame(variants_count)

    #calculate probability using the frequencies of each variant
    variant_trace_df['probability'] = variant_trace_df['count'] / sum(
        variant_trace_df['count'])

    print('Entropy (base2) and  Entropy (base10)')

    return entropy(variant_trace_df['probability'],
                   base=2), entropy(variant_trace_df['probability'], base=10)
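A hedged usage sketch (placeholder path; the function imports the log itself):

entropy_base2, entropy_base10 = compute_my_variability("example.xes")  # placeholder path
print(entropy_base2, entropy_base10)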
Example #20
from pm4py.algo.filtering.log.start_activities import start_activities_filter
log_af_sa = start_activities_filter.apply_auto_filter(
    log, parameters={"decreasingFactor": 0.6})
print(start_activities_filter.get_start_activities(log_af_sa))

from pm4py.algo.filtering.log.end_activities import end_activities_filter
log_af_ea = end_activities_filter.apply_auto_filter(
    log, parameters={"decreasingFactor": 0.6})
print(end_activities_filter.get_end_activities(log_af_ea))

#traces
from pm4py.algo.filtering.log.variants import variants_filter
variants = variants_filter.get_variants(log)
variants

from pm4py.statistics.traces.log import case_statistics
variants_count = case_statistics.get_variant_statistics(log)
variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
print(variants_count)
print(len(variants_count))

#most common
filtered_log1 = variants_filter.apply(log, [
    "Confirmation of receipt,T02 Check confirmation of receipt,T04 Determine confirmation of receipt,T05 Print and send confirmation of receipt,T06 Determine necessity of stop advice,T10 Determine necessity to stop indication"
])
filtered_log1
variants_count_filtered_log1 = case_statistics.get_variant_statistics(
    filtered_log1)
print(variants_count_filtered_log1)

#---
from pm4py.algo.filtering.log.attributes import attributes_filter
Example #21
def new_cluster(log, neighbourhood_size, minimum_cluster_size,
                distance_technique, discovery_technique, max_distance):

    print('***********New cluster initialization starts!*********\n')
    iteration = 0
    f1_score = 0
    if f1_score == 0:
        cluster = EventLog()
        # if iteration == 0:
        #    variants_count_list = case_statistics.get_variant_statistics(log)
        # else:
        #    variants_count_list = case_statistics.get_variant_statistics(log)
        #    random.shuffle(variants_count_list)
        variants_count_list = case_statistics.get_variant_statistics(log)
        variant_list = get.get_variants(log)
        frequent = variants_count_list[0]['variant']
        frequent_flag = variant_list[frequent][0].flag
        print(
            f'The most frequent variant is: {frequent} with flag: {frequent_flag}'
        )
        """ Building a cluster using KNN (optional)

        neighbour_variants = find_nearest_neighbours(log, frequent, variants_count_list,
                                                     variant_list, neighbourhood_size)
        for neighbour in neighbour_variants:
            trace_list = variant_list[neighbour]
            print(len(trace_list))
            for index, variant_trace in enumerate(trace_list):
                cluster.append(variant_trace)
                log.remove(variant_trace)

        """

        # """ Building a cluster using the most frequent variants
        for trace in variant_list[frequent]:
            cluster.append(trace)
            log.remove(trace)
        # log = EventLog(filter(lambda x: x not in cluster, log))

        for neighbourhood, variant in enumerate(variants_count_list):
            if neighbourhood == 0:
                continue
            if neighbourhood < neighbourhood_size:
                variant_flow = variant['variant']
                neighbour_trace = variant_list[variant_flow][0]
                print("********** Flags! ************ ")
                print(neighbour_trace.flag)
                print(frequent_flag)
                if neighbour_trace.flag != frequent_flag:
                    if distance_technique == 'BOA':
                        frequent_trace = variant_list[frequent][0]
                        # neighbour_trace = variant_list[variant_flow][0]
                        similarity_distance = distance.euclidean(
                            bag_of_activities(frequent_trace, log),
                            bag_of_activities(neighbour_trace, log))

                    if distance_technique == 'levenshtein':
                        similarity_distance = levenshtein(
                            frequent, variant_flow)

                    print(
                        f'Distance with {variant_flow} is: {similarity_distance}'
                    )
                    if similarity_distance <= max_distance:
                        for trace in variant_list[variant_flow]:
                            cluster.append(trace)
                            log.remove(trace)
                    # log = EventLog(filter(lambda x: x not in cluster, log))
            else:
                break
                # """
        print(f'length of cluster: {len(cluster)}, log: {len(log)}')
        # net, im, fm = heuristics_miner.apply(cluster, parameters={"dependency_thresh": 0.99})

        # fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
        #                                     variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
        # precision = precision_evaluator.apply(cluster, net, im, fm,
        #                                  variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
        # f1_score = 2 * (fitness["log_fitness"] * precision) / (fitness["log_fitness"] + precision)

        eval = cluster_evaluation(cluster, discovery_technique)
        fitness = eval[0]
        precision = eval[1]
        f1_score = eval[2]
        print(f'f1-score is: {f1_score}')
        iteration += 1

    trace_distribution(cluster, log, minimum_cluster_size, discovery_technique,
                       f1_score)
    return cluster, log
Example #22
def trace_distribution(cluster, log, minimum_cluster_size, discovery_technique,
                       score):
    print('***********Trace Distribution Starts!*********\n')
    print(f'length of cluster: {len(cluster)}, log: {len(log)}')

    variants_count_list = case_statistics.get_variant_statistics(log)
    # variants_count_list_sampled = sample(variants_count_list, int(len(variants_count_list) / 4))
    # variants_count_list = variants_count_list_sampled
    variant_trace_list = get.get_variants(log)

    # if discovery_technique == 'heuristic miner':
    #    net, im, fm = heuristics_miner.apply(cluster, parameters={"dependency_thresh": 0.99})

    # if discovery_technique == 'inductive miner':
    #    net, im, fm = inductive_miner.apply(cluster)

    # initial_fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
    #                                                 variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
    # initial_precision = precision_evaluator.apply(cluster, net, im, fm,
    #                                              variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)

    # current_f1_score_initial = 2 * (initial_fitness["log_fitness"] * initial_precision) / (
    #        initial_fitness["log_fitness"] + initial_precision)

    current_f1_score = score
    print(f'initial f1 is: {current_f1_score}')
    for variant in variants_count_list:
        variant_flow = variant['variant']
        trace = variant_trace_list[variant_flow][0]

        cluster.append(trace)
        # net, im, fm = inductive_miner.apply(cluster)
        # new_fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
        #                                             variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
        # new_precision = precision_evaluator.apply(cluster, net, im, fm,
        #                                          variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
        # new_f1_score = 2 * (new_fitness["log_fitness"] * new_precision) / (new_fitness["log_fitness"] + new_precision)
        eval = cluster_evaluation(cluster, discovery_technique)
        new_f1_score = eval[2]
        # print(f'new fitness is: {initial_fitness}')
        print(f'current f1-score is: {current_f1_score}')
        print(f'new f1 is: {new_f1_score}')
        if current_f1_score <= new_f1_score:
            print(f'*****Improved the model!****: {trace}')
            cluster.remove(trace)
            """ Optional Use of KNN to find neighbours of qualified variant 

            temp_variant_list = case_statistics.get_variant_statistics(log)
            temp_variant_trace_list = get.get_variants(log)

            neighbour_variants = find_nearest_neighbours(log, variant_flow, temp_variant_list,
                                                         temp_variant_trace_list, 5)

            for v in neighbour_variants:
                print(f'v is {v}')
                cc = temp_variant_trace_list[v]
                print(len(cc))
            # for index, variant_instance in enumerate(variant_trace_list[variant_flow]):
                for index, variant_instance in enumerate(cc):
                    cluster.append(variant_instance)
                    log.remove(variant_instance)
            print(f'length of cluster: {len(cluster)}, log: {len(log)}')  # , sample: {len(sampled_log)}')
            net, im, fm = inductive_miner.apply(cluster)
            new_fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
                                                         variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
            new_precision = precision_evaluator.apply(cluster, net, im, fm,
                                                      variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
            new_f1_score = 2 * (new_fitness["log_fitness"] * new_precision) / (
                        new_fitness["log_fitness"] + new_precision)
            current_f1_score = new_f1_score
            continue
            """

            # """ Without KNN
            current_f1_score = new_f1_score
            for variant_instance in variant_trace_list[variant_flow]:
                cluster.append(variant_instance)
                log.remove(variant_instance)
            print(f'length of cluster: {len(cluster)}, log: {len(log)}')
            continue
            # """

        if current_f1_score > new_f1_score:
            if new_f1_score >= 0.9 and abs(current_f1_score -
                                           new_f1_score) <= 0.05:
                print(
                    f'$$$$Did not improve the model but a close trace!$$$$: {trace}'
                )
                current_f1_score = new_f1_score
                for index, variant_instance in enumerate(
                        variant_trace_list[variant_flow]):
                    if index > 0:
                        cluster.append(variant_instance)
                        log.remove(variant_instance)
                print(f'length of cluster: {len(cluster)}, log: {len(log)}')
                # continue
            else:
                # variants_count_list.remove(variant)
                cluster.remove(trace)
                if len(cluster) >= minimum_cluster_size:
                    print(
                        f'did not improve the model and enough traces!: {trace}'
                    )
                    final_cluster = cluster
                    break
                else:
                    print(
                        f'did not improve the model and not enough traces!: {trace}'
                    )
                    print(
                        f'length of cluster: {len(cluster)}, log: {len(log)}')
                    # continue

    print(f'length of cluster: {len(cluster)}, log: {len(log)}')
Example #23
def apply_filter(req):
	sessions[req.session["id"]] = datetime.now()
	filters = {
		"time": True,
		"variants": True,
		"performance": True,
		"activities": True,
		"attribute": True
	}
	req.session.set_expiry(7200)
	#print(str(req.body))
	o = json.loads(req.body)
	print(str(o))
	custom_time_range = []
	for pair in o["filter1"]:
		#custom_time_range.append((dateutil.parser.parse(pair[0]),dateutil.parser.parse(pair[1])))
		custom_time_range.append((pair[0],pair[1]))
	if o["filter1"] == []:
		filters["time"] = False
	#print(o["filter1"][0])
	#print(custom_time_range[0][0])
	#print(custom_time_range)
	custom_path_range = []
	for pair in o["filter2"]:
		custom_path_range.append((float(pair[0]),float(pair[1])))
	if o["filter2"] == []:
		filters["variants"] = False
		#custom_path_range = [(0,1)] #filter2
	custom_performance_range = []
	for pair in o["filter3"]:
		custom_performance_range.append((float(pair[0]),float(pair[1])))
	if o["filter3"] == []:
		filters["performance"] = False
	custom_activitiy_range = []
	for pair in o["filter4"]:
		custom_activitiy_range.append((float(pair[0]),float(pair[1])))
	if o["filter4"] == []:
		filters["activities"] = False
		#custom_activitiy_range = [(0,1)] #filter3
	custom_attribute_range = []
	for pair in o["filter5"]:
		custom_attribute_range.append((float(pair[0]),float(pair[1])))
	if o["filter5"] == [] or o["filter5attribute"] == "Empty":
		filters["attribute"] = False
	additional_attribute = o["filter5attribute"]

	selected_viz = o["visualization"]
	calc_lev = o["distance"]
	#input_file = os.path.join("webapp","static", req.session["id"] + "_l0.xes")
	input_file = os.path.join("webapp","static", "sepsis.xes")
	input_log = xes_importer.apply(input_file)
	not_filtered_logs = {}
	flatten = lambda l: [item for sublist in l for item in sublist]

	time_timestamp_started = datetime.now()
	if filters["time"]:
		#TODO check overlapping for filter
		custom_time_range = sorted(custom_time_range, reverse=False)
		for i in range(0,len(custom_time_range)-1):
			if(custom_time_range[i][1] > custom_time_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for time filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping time ranges")

		logs = []
		for (x,y) in custom_time_range:
			logs.append(timestamp_filter.filter_traces_contained(input_log, x, y))

		#log = timestamp_filter.filter_traces_contained(input_log, custom_time_range[0][0], custom_time_range[0][1])
		log = pm4py.objects.log.log.EventLog()
		for timeslice in logs:
			for trace in timeslice:
				log.append(trace)
		print(len(input_log))
		print(len(log))
		#l2
		not_filtered_logs["timestamp_filter"] = pm4py.objects.log.log.EventLog()
		for trace in input_log:
			if trace not in log:
				not_filtered_logs["timestamp_filter"].append(trace)
		print(len(not_filtered_logs["timestamp_filter"]))
	else:
		log = input_log

	time_variants_started = datetime.now() # where should I start?

	if filters["variants"]:
		variants = variants_filter.get_variants(log)
		variants_count = case_statistics.get_variant_statistics(log)
		variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=False)

		custom_path_range = sorted(custom_path_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_path_range)-1):
			if(custom_path_range[i][1] > custom_path_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for variants filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping variants ranges")

		nr_variants = len(variants_count)
		idx = [(math.floor(x*nr_variants), math.ceil(y*nr_variants)) for (x,y) in custom_path_range]
		variants_subset = [variants_count[x:y+1] for (x,y) in idx]
		variants_subset = flatten(variants_subset)
		filtered_variants = {k:v for k,v in variants.items() if k in [x["variant"] for x in variants_subset]}
		#l2
		not_filtered_variants = {k:v for k,v in variants.items() if k not in [x["variant"] for x in variants_subset]}

		filtered_log = variants_filter.apply(log, filtered_variants)
		#l2
		not_filtered_logs["variant_filter"] = variants_filter.apply(log, not_filtered_variants)
	else:
		filtered_log = log

	time_variants_finished = datetime.now() # note: incl log2 generation

	if filters["performance"]:
		custom_performance_range = sorted(custom_performance_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_performance_range)-1):
			if(custom_performance_range[i][1] > custom_performance_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for performance filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping performance ranges")

		#all_case_durations = case_statistics.get_all_casedurations(log, parameters={case_statistics.Parameters.TIMESTAMP_KEY: "time:timestamp"})
		#case_filter.filter_case_performance(log, 86400, 864000)
		performances = []
		for i in range(len(filtered_log)):
			filtered_log[i].attributes["throughput"] = (max([event["time:timestamp"]for event in filtered_log[i]])-min([event["time:timestamp"] for event in filtered_log[i]])).total_seconds()
			performances.append(filtered_log[i].attributes["throughput"])

		nr_cases = len(filtered_log)
		performances = sorted(performances, reverse=False)
		idx = [(math.floor(x*nr_cases), math.ceil(y*nr_cases)) for (x,y) in custom_performance_range]
		perf_subset = [performances[x:y+1] for (x,y) in idx]
		perf_subset = flatten(perf_subset)

		performance_log = pm4py.objects.log.log.EventLog([trace for trace in filtered_log if trace.attributes["throughput"] in perf_subset])
		#l2
		not_filtered_logs["performance_filter"] = pm4py.objects.log.log.EventLog([trace for trace in filtered_log if trace.attributes["throughput"] not in perf_subset])
		#print(str(len(not_filtered_logs["performance_filter"])))

	else:
		performance_log = filtered_log

	time_performance_finished = datetime.now()

	if filters["activities"]:
		variants = variants_filter.get_variants(performance_log)
		variants_count = case_statistics.get_variant_statistics(performance_log)
		variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=False)

		activities = dict()
		for variant in variants_count:
			for activity in variant["variant"].split(","):
				if (activity not in activities.keys()):
					activities[activity] = variant["count"]
				else:
					activities[activity] += variant["count"]

		sorted_activities = {k: v for k, v in sorted(activities.items(), key=lambda item: item[1])}
		activities_sorted_list = list(sorted_activities)
		custom_activitiy_range = sorted(custom_activitiy_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_activitiy_range)-1):
			if(custom_activitiy_range[i][1] > custom_activitiy_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for activities filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping activities ranges")
		nr_activities = len(activities_sorted_list)
		idx = [(math.floor(x*nr_activities), math.ceil(y*nr_activities)) for (x,y) in custom_activitiy_range]
		activities_to_keep = [activities_sorted_list[x:y+1] for (x,y) in idx]
		activities_to_keep = flatten(activities_to_keep)
		variants_idx = []
		for i in range(len(variants_count)):
			for activity in activities_to_keep:
				if (activity in variants_count[i]["variant"].split(",") and (i not in variants_idx)):
					variants_idx.append(i)
		variants_subset = [variants_count[i] for i in variants_idx]
		filtered_variants = {k:v for k,v in variants.items() if k in [x["variant"] for x in variants_subset]}
		#l2
		not_filtered_variants = {k:v for k,v in variants.items() if k not in [x["variant"] for x in variants_subset]}

		filtered_log = variants_filter.apply(performance_log, filtered_variants)

		#l2
		not_filtered_logs["activities_filter"] = variants_filter.apply(performance_log, not_filtered_variants)

		new_log = pm4py.objects.log.log.EventLog()
		#not_filtered_logs["activities_filter_traces"] = pm4py.objects.log.log.EventLog()
		for trace in filtered_log:
			new_trace = pm4py.objects.log.log.Trace()
			not_new_trace = pm4py.objects.log.log.Trace()
			for event in trace:
				if(event['concept:name'] in activities_to_keep):
					new_trace.append(event)
				else:
					not_new_trace.append(event)
			if(len(new_trace)>0):
				new_log.append(new_trace)
			if(len(not_new_trace)>0):
				not_filtered_logs["activities_filter"].append(not_new_trace)
	else:
		new_log = performance_log

	time_activities_finished = datetime.now()

	if filters["attribute"]:
		custom_attribute_range = sorted(custom_attribute_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_attribute_range)-1):
			if(custom_attribute_range[i][1] > custom_attribute_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for additional attribute filter"}))
				response.status_code = 200
				return response

		newest_log = pm4py.objects.log.log.EventLog()
		not_filtered_logs["additional_filter"] = pm4py.objects.log.log.EventLog()

		traces_with_attr = []
		not_traces_with_attr = []
		for trace in new_log:
			if additional_attribute in trace.attributes.keys():
				traces_with_attr.append(trace)
			else:
				not_traces_with_attr.append(trace)
		#check if trace attribute
		if len(traces_with_attr)>0:
			#check if numeric
			if type(traces_with_attr[0].attributes[additional_attribute]) in [int, float]:
				for trace in traces_with_attr:
					if any([trace.attributes[additional_attribute] >= x and trace.attributes[additional_attribute] <= y for (x,y) in custom_attribute_range]):
						newest_log.append(trace)
					else:
						not_filtered_logs["additional_filter"].append(trace)
				for trace in not_traces_with_attr:
					not_filtered_logs["additional_filter"].append(trace)
			else: #string
				attribute_frequencies = dict()
				for trace in traces_with_attr:
					if trace.attributes[additional_attribute] not in attribute_frequencies.keys():
						attribute_frequencies[trace.attributes[additional_attribute]] = 0
					attribute_frequencies[trace.attributes[additional_attribute]] += 1

				sorted_frequencies = {k: v for k, v in sorted(attribute_frequencies.items(), key=lambda item: item[1])}
				frequencies_sorted_list = list(sorted_frequencies)

				nr_values = len(frequencies_sorted_list)
				idx = [(math.floor(x*nr_values), math.ceil(y*nr_values)) for (x,y) in custom_attribute_range]
				values_to_keep = [frequencies_sorted_list[x:y+1] for (x,y) in idx]
				values_to_keep = flatten(values_to_keep)

				for trace in traces_with_attr:
					if trace.attributes[additional_attribute] in values_to_keep:
						newest_log.append(trace)
					else:
						not_filtered_logs["additional_filter"].append(trace)
				for trace in not_traces_with_attr:
					not_filtered_logs["additional_filter"].append(trace)

		else: #event attribute
			if [type(event[additional_attribute]) for trace in new_log for event in trace if additional_attribute in event.keys()][0] in [int, float]:
				for trace in new_log:
					new_trace = pm4py.objects.log.log.Trace()
					not_new_trace = pm4py.objects.log.log.Trace()
					for event in trace:
						if(additional_attribute in event.keys() and any([event[additional_attribute] >= x and event[additional_attribute] <= y for (x,y) in custom_attribute_range ])):
							new_trace.append(event)
						else:
							not_new_trace.append(event)
					if(len(new_trace)>0):
						newest_log.append(new_trace)
					if(len(not_new_trace)>0):
						not_filtered_logs["additional_filter"].append(not_new_trace)
			else: #string
				attribute_frequencies = dict()
				for trace in new_log:
					for event in trace:
						if additional_attribute in event.keys():
							if event[additional_attribute] not in attribute_frequencies.keys():
								attribute_frequencies[event[additional_attribute]] = 0
							attribute_frequencies[event[additional_attribute]] += 1

				sorted_frequencies = {k: v for k, v in sorted(attribute_frequencies.items(), key=lambda item: item[1])}
				frequencies_sorted_list = list(sorted_frequencies)

				nr_values = len(frequencies_sorted_list)
				idx = [(math.floor(x*nr_values), math.ceil(y*nr_values)) for (x,y) in custom_attribute_range]
				values_to_keep = [frequencies_sorted_list[x:y+1] for (x,y) in idx]
				values_to_keep = flatten(values_to_keep)

				for trace in new_log:
					new_trace = pm4py.objects.log.log.Trace()
					not_new_trace = pm4py.objects.log.log.Trace()
					for event in trace:
						if(additional_attribute in event.keys() and event[additional_attribute] in values_to_keep):
							new_trace.append(event)
						else:
							not_new_trace.append(event)
					if(len(new_trace)>0):
						newest_log.append(new_trace)
					if(len(not_new_trace)>0):
						not_filtered_logs["additional_filter"].append(not_new_trace)


	else:
		newest_log = new_log

	time_attribute_finished = datetime.now()

	if(selected_viz=="dfgf"):
		dfg = dfg_discovery.apply(newest_log)
		gviz = dfg_visualization.apply(dfg, log=newest_log, variant=dfg_visualization.Variants.FREQUENCY)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png"))
	elif(selected_viz=="dfgp"):
		dfg = dfg_discovery.apply(newest_log)
		gviz = dfg_visualization.apply(dfg, log=newest_log, variant=dfg_visualization.Variants.PERFORMANCE)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png"))
	else:
		heu_net = heuristics_miner.apply_heu(newest_log, parameters={"dependency_thresh": 0.99})
		gviz = hn_vis_factory.apply(heu_net)
		hn_vis_factory.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png"))

	xes_exporter.apply(newest_log, os.path.join("webapp","static", req.session["id"] + "_l1.xes"))


	#l2
	not_filtered_log = pm4py.objects.log.log.EventLog()
	for part in not_filtered_logs.keys():
		for trace in not_filtered_logs[part]:
			not_filtered_log.append(trace)

	if(selected_viz=="dfgf"):
		dfg = dfg_discovery.apply(not_filtered_log)
		gviz = dfg_visualization.apply(dfg, log=not_filtered_log, variant=dfg_visualization.Variants.FREQUENCY)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png"))
	elif(selected_viz=="dfgp"):
		dfg = dfg_discovery.apply(not_filtered_log)
		gviz = dfg_visualization.apply(dfg, log=not_filtered_log, variant=dfg_visualization.Variants.PERFORMANCE)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png"))
	else:
		heu_net = heuristics_miner.apply_heu(not_filtered_log, parameters={"dependency_thresh": 0.99})
		gviz = hn_vis_factory.apply(heu_net)
		hn_vis_factory.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png"))
	xes_exporter.apply(not_filtered_log, os.path.join("webapp","static", req.session["id"] + "_l2.xes"))

	if(calc_lev):
		lev_new = [0]*len(newest_log)
		for i in range(len(newest_log)):
			lev_new[i] = [hash(event['concept:name']) for event in newest_log[i]]

		lev_not = [0]*len(not_filtered_log)
		for i in range(len(not_filtered_log)):
			lev_not[i] = [hash(event['concept:name']) for event in not_filtered_log[i]]

		distances = []
		for i in range(len(lev_new)):
			for j in range(len(lev_not)):
				distances.append(lev_dist(lev_new[i], lev_not[j]))
		lev_d = sum(distances)/len(distances)
		print("Levenshtein's distance: "+str(lev_d))
	else:
		lev_d = "null"

	used_paths = 0
	for lower, higher in custom_path_range:
		used_paths += round((higher-lower)*100)
	print(f"Using {used_paths}% of paths. {100-used_paths}% of paths are discarded.")

	print("Timestamp filter: {} seconds. \nVariants filter: {} seconds. \nPerformance filter: {} seconds. \nActivities filter: {} seconds. \nAttribute filter: {} seconds.".format((time_variants_started - time_timestamp_started).total_seconds(), (time_variants_finished - time_variants_started).total_seconds(), (time_performance_finished - time_variants_finished).total_seconds(), (time_activities_finished - time_performance_finished).total_seconds(), (time_attribute_finished - time_activities_finished).total_seconds()))
	response = HttpResponse(json.dumps({'time':(time_variants_started - time_timestamp_started).total_seconds(), 'variants':(time_variants_finished - time_variants_started).total_seconds(),'performance':(time_performance_finished - time_variants_finished).total_seconds(), 'activities':(time_activities_finished - time_performance_finished).total_seconds(), 'attribute':(time_attribute_finished - time_activities_finished).total_seconds(), 'traces':[len(newest_log), len(not_filtered_log)], 'distance':lev_d}))
	response.status_code = 200
	return response
Example #24
from pm4py.objects.log.importer.xes import factory as xes_import_factory
from pm4py.objects.log.exporter.xes import factory as xes_exporter
from pm4py.statistics.traces.log import case_statistics
from pm4py.algo.filtering.log.variants import variants_filter

K = [20]
for k in K:
    event_log = "Sepsis Cases - Event Log.xes"
    log = xes_import_factory.apply(event_log)
    var_with_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(var_with_count,
                            key=lambda x: x['count'],
                            reverse=True)
    to_filter = []
    count = 0
    for j in range(0, len(variants_count)):
        dict = variants_count[j]
        if dict["count"] < k:
            to_filter.append([dict["variant"]])
        else:
            count += dict["count"]
    for delete in to_filter:
        log = variants_filter.apply(log,
                                    delete,
                                    parameters={"positive": False})
    xes_exporter.export_log(
        log, "baseline" + "_" + str(k) + "-" + "Annonymity" + ".xes")
    print("baseline" + "_" + str(k) + "-" + "Annonymity" + ".xes" +
          " has been exported!")
diff_absolute = diffTwoMatrix(excellent_average, weak_average)

dfg_miner_time_diff_absolute = diffTwoMatrix(dfg_miner_excellent_dfg,
                                             dfg_miner_weak_dfg)

#Inductive Miner
from pm4py.algo.discovery.inductive import factory as inductive_miner

tree = inductive_miner.apply_tree(ex1_personal_log_1_converted)

from pm4py.visualization.process_tree import factory as pt_vis_factory

gviz = pt_vis_factory.apply(tree)
pt_vis_factory.view(gviz)

from pm4py.algo.discovery.inductive import factory as inductive_miner

net, initial_marking, final_marking = inductive_miner.apply(
    ex1_personal_log_1_converted)
from pm4py.visualization.petrinet import factory as pn_vis_factory

gviz = pn_vis_factory.apply(net, initial_marking, final_marking)
pn_vis_factory.view(gviz)

#variant

from pm4py.statistics.traces.log import case_statistics

var_with_count = case_statistics.get_variant_statistics(
    ex1_personal_log_1_converted, parameters={"max_variants_to_return": 5})
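As a hedged check of the call above, the max_variants_to_return parameter caps the number of variants returned:

print(len(var_with_count))  # at most 5 entries, given max_variants_to_return=5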