Example #1
def apply_auto_filter(log, parameters=None):
    """
    Apply several filters in sequence to the log in order to obtain a simplified log
    
    Parameters
    ----------
    log
        Log
    parameters
        Eventual parameters applied to the algorithms:
            Parameters.DECREASING_FACTOR -> Decreasing factor (provided to all algorithms)
            Parameters.ACTIVITY_KEY -> Activity key (must be specified if different from concept:name)
    
    Returns
    ---------
    filtered_log
        Filtered log
    """

    # the following filters are applied:
    # - activity filter (keeps only activities with a reasonable number of occurrences) (if enabled)
    # - variant filter (keeps only variants with a reasonable number of occurrences) (if enabled)
    # - start activities filter (keeps only variants that start with a plausible start activity) (if enabled)
    # - end activities filter (keeps only variants that end with a plausible end activity) (if enabled)

    if parameters is None:
        parameters = {}

    enable_activities_filter = exec_utils.get_param_value(Parameters.ENABLE_ACTIVITES_FILTER, parameters, True)
    enable_variants_filter = exec_utils.get_param_value(Parameters.ENABLE_VARIANTS_FILTER, parameters, False)
    enable_start_activities_filter = exec_utils.get_param_value(Parameters.ENABLE_START_ACTIVITIES_FILTER, parameters,
                                                                False)
    enable_end_activities_filter = exec_utils.get_param_value(Parameters.ENABLE_END_ACTIVITIES_FILTER, parameters, True)

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, xes.DEFAULT_NAME_KEY)

    parameters[Parameters.ATTRIBUTE_KEY] = attribute_key
    parameters[Parameters.ACTIVITY_KEY] = attribute_key

    variants = variants_module.get_variants(log, parameters=parameters)

    filtered_log = log
    if enable_activities_filter:
        filtered_log = attributes_filter.apply_auto_filter(log, variants=variants, parameters=parameters)
        variants = variants_module.get_variants(filtered_log, parameters=parameters)
    if enable_variants_filter:
        filtered_log = variants_module.apply_auto_filter(filtered_log, variants=variants, parameters=parameters)
        variants = variants_module.get_variants(filtered_log, parameters=parameters)
    if enable_start_activities_filter:
        filtered_log = start_activities_filter.apply_auto_filter(filtered_log, variants=variants,
                                                                 parameters=parameters)
    if enable_end_activities_filter:
        filtered_log = end_activities_filter.apply_auto_filter(filtered_log, variants=variants,
                                                               parameters=parameters)

    return filtered_log
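A minimal usage sketch for this auto-filter (assuming the pm4py module layout used throughout these snippets; "running-example.xes" is a placeholder path, and Parameters is the enum referenced inside the function):

from pm4py.objects.log.importer.xes import importer as xes_importer

log = xes_importer.apply("running-example.xes")  # placeholder path
# keep the default filters, but additionally enable the variants filter
simplified = apply_auto_filter(log, parameters={Parameters.ENABLE_VARIANTS_FILTER: True})
print(len(log), "traces before,", len(simplified), "after")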
Example #2
def get_variants_from_log(log, activity_key, disable_variants=False):
    """
    Gets the variants from the log (variant grouping can be disabled by giving each trace its own variant)

    Parameters
    -------------
    log
        Trace log
    activity_key
        Attribute that is the activity
    disable_variants
        Boolean value that disables variant grouping

    Returns
    -------------
    variants
        Variants contained in the log
    """
    if disable_variants:
        variants = {}
        for trace in log:
            variants[str(hash(trace))] = [trace]
        return variants
    parameters_variants = {
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key
    }
    variants = variants_module.get_variants(log,
                                            parameters=parameters_variants)
    return variants
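To see the effect of disable_variants, compare the grouped and per-trace results (sketch; assumes log is an already imported event log):

variants = get_variants_from_log(log, "concept:name")
per_trace = get_variants_from_log(log, "concept:name", disable_variants=True)
print(len(variants))   # number of distinct variants
print(len(per_trace))  # roughly one single-trace entry per trace, keyed by trace hash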
Example #3
def get_variant_statistics(log, parameters=None):
    """
    Gets a list of variants, each with the number of traces
    that share the variant, sorted by decreasing count

    Parameters
    ----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            activity_key -> Attribute identifying the activity in the log
            max_variants_to_return -> Maximum number of variants to return
            variants -> If provided, avoid recalculation of the variants

    Returns
    ----------
    variants_list
        List of variants along with their statistics
    """

    if parameters is None:
        parameters = {}
    max_variants_to_return = parameters["max_variants_to_return"] if "max_variants_to_return" in parameters else None
    varnt = parameters["variants"] if "variants" in parameters else variants_filter.get_variants(log,
                                                                                                 parameters=parameters)
    variants_list = []
    for var in varnt:
        variants_list.append({"variant": var, "count": len(varnt[var])})
    variants_list = sorted(variants_list, key=lambda x: x["count"], reverse=True)
    if max_variants_to_return:
        variants_list = variants_list[:min(len(variants_list), max_variants_to_return)]
    return variants_list
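For example, to retrieve only the five most frequent variants (sketch; assumes log is already loaded):

top5 = get_variant_statistics(log, parameters={"max_variants_to_return": 5})
for entry in top5:
    print(entry["variant"], entry["count"])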
Example #4
def getaverageduration2(log, logname, logtime):
    activities = attributes_filter.get_attribute_values(log, logname)
    time = attributes_filter.get_attribute_values(log, logtime)
    variants = variants_filter.get_variants(log)
    timeList = []
    tracelist = []
    variantsList = []
    activitiesList = []
    for trace in activities:
        activitiesList.append(trace)
    for trace in log:
        for event in trace:
            timeList.append(str(event[logtime]))
    for trace in log:
        variantsList = []
        for event in trace:
            variantsList.append(event[logname])
        tracelist.append(variantsList)
    duration = []
    #start position in timestamp now
    #526000 must be replaced
    fmt = '%Y-%m-%d %H:%M:%S'
    for val in activitiesList:
        count = 0
        timeSum = 0
        header = 0

        for i in range(len(tracelist)):
            for j in range(len(tracelist[i])):
                if tracelist[i][j] == val and j != len(tracelist[i]) - 1:
                    end = timeList[header + j + 1][0:19]
                    start = timeList[header + j][0:19]
                    ts = dt.datetime.strptime(end, fmt) - dt.datetime.strptime(
                        start, fmt)

                    timeSum += int(ts.total_seconds())
                    count += 1
            header = header + len(tracelist[i])

        if timeSum == 0:
            duration.append(0)
        else:
            duration.append(timeSum / count)

    return duration
Example #5
def compute_variant_variability(log):
    '''
    compute and return the number of variants
    input:
        log = xes log
    output:
        number of variants in the log
    '''
    return (len(variants_filter.get_variants(log)))
Example #6
def simple_stats(log):
    n_traces = len(log)

    variants = variants_filter.get_variants(log)
    n_unique_traces = len(variants)

    ratio_unique_traces_per_trace = n_unique_traces / n_traces

    return [n_traces, n_unique_traces, ratio_unique_traces_per_trace]
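Usage sketch (placeholder path; the import follows the pm4py layout used in the other snippets):

from pm4py.objects.log.importer.xes import importer as xes_importer

log = xes_importer.apply("running-example.xes")  # placeholder path
n_traces, n_unique, ratio = simple_stats(log)
print(f"{n_unique} distinct variants over {n_traces} traces (ratio {ratio:.2f})")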
Example #7
def apply(log, parameters=None):
    """
    Calculates the Working Together metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list. The final boolean indicates that the
        metric is not directed.
    """

    if parameters is None:
        parameters = {}

    resource_key = parameters.get(constants.PARAMETER_CONSTANT_RESOURCE_KEY, xes.DEFAULT_RESOURCE_KEY)

    parameters_variants = {
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: resource_key,
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {
        x: len(y)
        for x, y in variants_filter.get_variants(
            log, parameters=parameters_variants).items()
    }
    variants_resources = list(variants_occ.keys())
    resources = [x.split(",") for x in variants_resources]
    flat_list = sorted(
        list(set([item for sublist in resources for item in sublist])))

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    for rv in resources:
        ord_res_list = sorted(list(set(rv)))

        for i in range(len(ord_res_list) - 1):
            res_i = flat_list.index(ord_res_list[i])
            for j in range(i + 1, len(ord_res_list)):
                res_j = flat_list.index(ord_res_list[j])
                metric_matrix[res_i, res_j] += float(
                    variants_occ[",".join(rv)]) / float(len(log))
                metric_matrix[res_j, res_i] += float(
                    variants_occ[",".join(rv)]) / float(len(log))

    return (metric_matrix, flat_list, False)
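Each co-occurring resource pair is incremented by the fraction of cases sharing that resource sequence, so the matrix is symmetric. A toy check (sketch; builds a three-case log directly with pm4py's log objects, as also done in Example #23 below):

from pm4py.objects.log.log import EventLog, Trace, Event

def make_trace(resources):
    # one event per resource; "org:resource" is the default resource key
    t = Trace()
    for r in resources:
        t.append(Event({"concept:name": "act", "org:resource": r}))
    return t

# two cases handled by (A, B), one by (A, C)
toy_log = EventLog([make_trace(["A", "B"]), make_trace(["A", "B"]), make_trace(["A", "C"])])
matrix, resources, directed = apply(toy_log)
print(resources)  # ['A', 'B', 'C']
print(matrix)     # A-B cells = 2/3, A-C cells = 1/3, symmetric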
Example #8
def apply_auto_filter(log, variants=None, parameters=None):
    """
    Apply an end activities filter detecting automatically a percentage
    
    Parameters
    ----------
    log
        Log
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    parameters
        Parameters of the algorithm, including:
            Parameters.DECREASING_FACTOR -> Decreasing factor (stops the algorithm when the next activity by occurrence is below
            this factor in comparison to previous)
            Parameters.ACTIVITY_KEY -> Attribute key (must be specified if different from concept:name)
    
    Returns
    ---------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    attribute_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                               parameters, DEFAULT_NAME_KEY)
    decreasing_factor = exec_utils.get_param_value(
        Parameters.DECREASING_FACTOR, parameters,
        filtering_constants.DECREASING_FACTOR)

    if len(log) > 0:
        parameters_variants = {PARAMETER_CONSTANT_ACTIVITY_KEY: attribute_key}
        if variants is None:
            variants = variants_filter.get_variants(
                log, parameters=parameters_variants)
        vc = variants_filter.get_variants_sorted_by_count(variants)
        end_activities = get_end_activities(log,
                                            parameters=parameters_variants)
        ealist = end_activities_common.get_sorted_end_activities_list(
            end_activities)
        eathreshold = end_activities_common.get_end_activities_threshold(
            ealist, decreasing_factor)
        filtered_log = filter_log_by_end_activities(end_activities, variants,
                                                    vc, eathreshold,
                                                    attribute_key)

        return filtered_log

    return log
Example #9
def apply_auto_filter(log, variants=None, parameters=None):
    """
    Apply a start activities filter detecting automatically a percentage
    
    Parameters
    ----------
    log
        Log
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    parameters
        Parameters of the algorithm, including:
            decreasingFactor -> Decreasing factor (stops the algorithm when the next activity by occurrence is below
            this factor in comparison to previous)
            attribute_key -> Attribute key (must be specified if different from concept:name)
    
    Returns
    ---------
    filtered_log
        Filtered log    
    """
    if parameters is None:
        parameters = {}

    attribute_key = parameters.get(PARAMETER_CONSTANT_ACTIVITY_KEY, DEFAULT_NAME_KEY)
    decreasing_factor = parameters.get("decreasingFactor", DECREASING_FACTOR)

    parameters_variants = {
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: attribute_key
    }

    if variants is None:
        variants = variants_filter.get_variants(log,
                                                parameters=parameters_variants)
    vc = variants_filter.get_variants_sorted_by_count(variants)
    start_activities = get_start_activities(log,
                                            parameters=parameters_variants)
    salist = start_activities_common.get_sorted_start_activities_list(
        start_activities)
    sathreshold = start_activities_common.get_start_activities_threshold(
        salist, decreasing_factor)
    filtered_log = filter_log_by_start_activities(start_activities, variants,
                                                  vc, sathreshold,
                                                  attribute_key)
    return filtered_log
Example #10
def log_statistics(logpath):
    """
    Extracts log statistics such as #Events, #Cases, #Activities and #Variants, given the path of event log.

    Parameters:
        logpath (str): Path of event log

    Returns:
        events (int): Number of events
        cases (int): Number of traces
        activities (int): Number of activities
        variants (int): Number of variants
    """
    log = importer.apply(logpath)
    log_df = log_converter.apply(log,
                                 variant=log_converter.Variants.TO_DATA_FRAME)
    return (len(log_df), log_df["case:concept:name"].nunique(),
            log_df["concept:name"].nunique(), len(variants_filter.get_variants(log)))
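Usage sketch (placeholder path):

events, cases, activities, variants = log_statistics("running-example.xes")  # placeholder path
print(f"{events} events, {cases} cases, {activities} activities, {variants} variants")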
Example #11
def apply_auto_filter(log, variants=None, parameters=None):
    """
    Apply a paths filter detecting automatically a percentage

    Parameters
    ----------
    log
        Log
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    parameters
        Parameters of the algorithm, including:
            Parameters.DECREASING_FACTOR -> Decreasing factor (stops the algorithm when the next activity by occurrence is below
            this factor in comparison to previous)
            Parameters.ATTRIBUTE_KEY -> Attribute key (must be specified if different from concept:name)

    Returns
    ---------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY,
                                               parameters,
                                               xes.DEFAULT_NAME_KEY)
    decreasing_factor = exec_utils.get_param_value(
        Parameters.DECREASING_FACTOR, parameters,
        filtering_constants.DECREASING_FACTOR)

    parameters_variants = {
        variants_filter.Parameters.ACTIVITY_KEY: attribute_key
    }
    if variants is None:
        variants = variants_filter.get_variants(log,
                                                parameters=parameters_variants)
    vc = variants_filter.get_variants_sorted_by_count(variants)
    pths = get_paths_from_log(log, attribute_key=attribute_key)
    plist = get_sorted_paths_list(pths)
    thresh = get_paths_threshold(plist, decreasing_factor)
    filtered_log = filter_log_by_paths(log, pths, variants, vc, thresh,
                                       attribute_key)
    return filtered_log
Example #12
def start_experiments_for_ptml_files(path, file_names_pt, file_name_log, sample_size=None):
    logging.disable(logging.CRITICAL)

    input_data = []
    print("load log")
    log = import_log(path + file_name_log)
    print("finish loading log")
    variants = variants_filter.get_variants(log)
    log_variants = EventLog()
    for v in variants:
        log_variants.append(variants[v][0])
    if sample_size:
        log_variants = random.sample(log_variants, sample_size)
    for ptml_file_name in file_names_pt:
        with open(path + ptml_file_name, "rb") as input_file:
            pt = pickle.load(input_file)
            pt_vis.view(pt_vis.apply(pt, parameters={"format": "svg"}))
            pt = process_tree_to_binary_process_tree(pt)
            pt_vis.view(pt_vis.apply(pt, parameters={"format": "svg"}))
            input_data.append((pt, log_variants))
    start_experiments(input_data=input_data)
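The pattern used above to shrink the experiment input, keeping one representative trace per variant, also works standalone (sketch; assumes log is already loaded):

from pm4py.algo.filtering.log.variants import variants_filter
from pm4py.objects.log.log import EventLog

variants = variants_filter.get_variants(log)
log_variants = EventLog([traces[0] for traces in variants.values()])
print(len(log), "traces reduced to", len(log_variants), "variant representatives")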
Example #13
                        event.case_id]:
                    time_granularities[event.case_id] = d
                    if (args.verbose):
                        print("Updating time granularity for trace " +
                              str(event.case_id) + ": " + str(d) +
                              " seconds. Event " + str(event.activity) + " (" +
                              str(event.event_id) + ")")
        m_time_granularity = statistics.mean(
            [time_granularities[case_id] for case_id in time_granularities])
        measure_times['Time granularity'] = time.perf_counter() - m
        print("Time granularity: " + str(m_time_granularity) + " (seconds)")

    m = time.perf_counter()
    if check_measure("affinity", "structure", "distinct_traces", "all"):
        from pm4py.algo.filtering.log.variants import variants_filter
        var = variants_filter.get_variants(pm4py_log)
        measure_times['var'] = time.perf_counter() - m

    m = time.perf_counter()
    if check_measure("affinity", "structure", "all"):
        hashmap = {}
        evts = list(
            set.union(*[event_classes[case_id] for case_id in event_classes]))
        num_act = len(evts)
        i = 0
        for event in evts:
            for event_follows in evts:
                hashmap[(event, event_follows)] = i
                i += 1
        aff = {}
        for variant in var.keys():
Example #14

from pm4py.algo.filtering.log.start_activities import start_activities_filter

log_af_sa = start_activities_filter.apply_auto_filter(
    log, parameters={"decreasingFactor": 0.6})
print(start_activities_filter.get_start_activities(log_af_sa))

from pm4py.algo.filtering.log.end_activities import end_activities_filter
log_af_ea = end_activities_filter.apply_auto_filter(
    log, parameters={"decreasingFactor": 0.6})
print(end_activities_filter.get_end_activities(log_af_ea))

#traces
from pm4py.algo.filtering.log.variants import variants_filter
variants = variants_filter.get_variants(log)
variants

from pm4py.statistics.traces.log import case_statistics
variants_count = case_statistics.get_variant_statistics(log)
variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
print(variants_count)
print(len(variants_count))

#most common
filtered_log1 = variants_filter.apply(log, [
    "Confirmation of receipt,T02 Check confirmation of receipt,T04 Determine confirmation of receipt,T05 Print and send confirmation of receipt,T06 Determine necessity of stop advice,T10 Determine necessity to stop indication"
])
filtered_log1
variants_count_filtered_log1 = case_statistics.get_variant_statistics(
    filtered_log1)
Example #15
def getaverageduration(log, logname, logtime, logtransi, logid):
    activities = attributes_filter.get_attribute_values(log, logname)
    time = attributes_filter.get_attribute_values(log, logtime)
    variants = variants_filter.get_variants(log)
    timeList = []
    variantsList = []
    variantsList1 = []
    transitionList = []
    activitieslist = []
    eventidlist = []
    for trace in activities:
        activitieslist.append(trace)
    for trace in log:
        for event in trace:
            timeList.append(str(event[logtime]))
    for trace in log:
        for event in trace:
            transitionList.append(event[logtransi])
            eventidlist.append(event[logid])
    for trace in log:
        sublist = []
        for event in trace:
            variantsList.append(event[logname])
            sublist.append(event[logname])
        variantsList1.append(sublist)

    result = []
    durationlist = []
    durationlist1 = []
    durationlist2 = []
    indexlist = []
    #durationlist = []
    #start position in timestamp now
    #526000 must be replaced
    fmt = '%Y-%m-%d %H:%M:%S'
    for val in activitieslist:
        timeSum = 0
        count = 0
        for j in range(len(transitionList)):
            if (transitionList[j] == "START" or transitionList[j] == "start") \
                    and variantsList[j] == val:
                k = j + 1
                while True:
                    if (transitionList[k] == "COMPLETE"
                            or transitionList[k] == "complete"
                        ) and eventidlist[k] == eventidlist[j]:
                        end = timeList[k][0:19]
                        start = timeList[j][0:19]
                        duration = dt.datetime.strptime(
                            end, fmt) - dt.datetime.strptime(start, fmt)
                        count += 1
                        timeSum += int(duration.total_seconds())
                        indexlist.append(k)
                        break
                    k += 1
        durationlist1.append((timeSum, count))

    for val in activitieslist:
        timeSum = 0
        count = 0

        header = 0
        for j in range(len(variantsList1)):
            for h in range(len(variantsList1[j]) - 1):
                if (transitionList[header + h] == "COMPLETE"
                        or transitionList[header + h] == "complete") \
                        and variantsList1[j][h] == val:
                    x = h + header
                    if x not in indexlist:
                        end = timeList[header + h + 1][0:19]
                        start = timeList[header + h][0:19]
                        duration = dt.datetime.strptime(
                            end, fmt) - dt.datetime.strptime(start, fmt)
                        count += 1
                        timeSum += int(duration.total_seconds())
            header += len(variantsList1[j])
        durationlist2.append((timeSum, count))

    for i, val in enumerate(durationlist1):
        duration = val[0] + durationlist2[i][0]
        count = val[1] + durationlist2[i][1]
        if count == 0:
            averageduration = 0
            result.append(0)
        else:
            averageduration = duration / count
            result.append(averageduration)
    return result
Example #16
def samplingVariantsForAmstc(net,
                             m0,
                             mf,
                             log,
                             sample_size,
                             size_of_run,
                             max_d,
                             max_t,
                             m,
                             maxCounter=2,
                             editDistance=True,
                             silent_label="tau",
                             debug=None):
    '''
    This function computes an AMSTC with a sampling method. See the scientific paper: Model-based Trace Variants
    :param net (Petri) : process model
    :param m0 (Marking) : initial marking
    :param mf (Marking) : final marking
    :param log (Log) : log traces
    :param sample_size (int) : number of traces that will be used in the complete AMSTC
    :param size_of_run (int) : length of the run in the process model
    :param max_d (int) : maximal distance between centroids and traces
    :param max_t (int) : maximal number of transitions in a cluster
    :param m (int) : number of cluster
    :param maxCounter (int) : number of trials without results in the sampling method
    :param editDistance (bool) : use of edit distance between traces
    :param silent_label (string) : transitions whose label contains this substring are treated as silent (cost 0)
    :return:
    '''
    def logAlignToCluster(tuple_centroid, traces, variants, editDistance,
                          max_d, counter):
        '''
        Private function of the sampling method with variants. From a centroid, cluster all the traces that
        can be aligned for a cost <= max_d
        :param tuple_centroid: (net, m0, mf)
        :param traces: list of log traces
        :param variants: dictionary of variants
        :param editDistance (boolean) : use or not the edit distance heuristic
        :param max_d (int): maximal distance between the traces and centroids (or casual distance !!)
        :param counter (int): number of trials
        :return:
        '''
        centroid, c_m0, c_mf = tuple_centroid
        traces_of_clusters = []
        used_variants = []
        cleaned_clustered_traces = []
        for clustered in traces:
            # remove "w" and "ww" labels of SAT results
            cleaned_clustered_traces.append([
                x for x in clustered
                if x != WAIT_LABEL_TRACE and x != WAIT_LABEL_MODEL
            ])
        for l in variants:
            bool_clustered = False
            if editDistance:
                # format is not the same due to the SAT results
                transformed_l = list(
                    map(lambda e: e[xes_util.DEFAULT_NAME_KEY],
                        variants[l][0]))
                # align clustered traces and entire log traces with edit distance
                for clustered in cleaned_clustered_traces:
                    if editdistance.eval(clustered,
                                         transformed_l) < (max_d + 1):
                        counter = -1
                        traces_of_clusters += variants[l]
                        used_variants.append(l)
                        bool_clustered = True
                        break
            # align centroid and entire log with alignments
            if not bool_clustered:
                alignment = alignments.algorithm.apply(variants[l][0],
                                                       centroid, c_m0, c_mf)
                if alignment['cost'] < 10000 * ((max_d + 1)):
                    counter = -1
                    traces_of_clusters += variants[l]
                    used_variants.append(l)
        return traces_of_clusters, used_variants, cleaned_clustered_traces, counter

    # ---------------------------------------------------------------------------------------------------------------
    log = deepcopy(log)
    start, totalAlign = time.perf_counter(), 0  # time.clock() was removed in Python 3.8
    counter, nbOfIteration = 0, 0
    clusters = []
    variants = get_variants(log)
    while len(log._list) > 0 and counter < maxCounter:
        clustering = Amstc(net,
                           m0,
                           mf,
                           log,
                           size_of_run,
                           max_d,
                           max_t,
                           m,
                           nbTraces=sample_size,
                           silent_label=silent_label)
        nbOfIteration += 1
        result = clustering.getClustering()

        if debug is not None:
            print("> Found", len(result) - 1, "centroids")
            print(time.perf_counter() - start)

        # if there is at least a clustered trace :
        if len(result) - 1 > 0:
            for (tuple_centroid, traces) in result:
                if type(tuple_centroid) is tuple:

                    # launches logAlignToCluster function that uses trace variant alignments to cluster
                    startAlign = time.perf_counter()
                    traces_of_clusters, used_variants, cleaned_clustered_traces, counter = \
                        logAlignToCluster(tuple_centroid, traces, variants, editDistance, max_d, counter)
                    totalAlign += (time.perf_counter() - startAlign)
                    for v in used_variants:
                        del variants[v]
                    log._list = list(set(log._list) - set(traces_of_clusters))

                    # create the cluster
                    if len(traces_of_clusters) > 0:
                        clusters.append((tuple_centroid, traces_of_clusters))
            # if we found at least a good centroid
            if counter == -1:
                counter = 0
            else:
                counter += 1
        else:
            counter += 1

    if debug is not None:
        print("This clustering has been found in ", nbOfIteration,
              " iterations and " + str(time.clock() - start) + "secondes.")
        print(str(totalAlign) + " secondes have been used to align.")
        for (centroid, traces) in clusters:
            print(len(traces))
            if type(centroid) is tuple:
                net, m0, mf = centroid
                #vizu.apply(net, m0, mf).view()
                #input("enter..")
        print(len(log._list), "traces are unclustered.")
    clusters.append(("nc", log._list))
    return clusters
Example #17
def apply_auto_filter(log, parameters=None):
    """
    Apply several filters in sequence to the log in order to obtain a simplified log
    
    Parameters
    ----------
    log
        Log
    parameters
        Eventual parameters applied to the algorithms:
            decreasingFactor -> Decreasing factor (provided to all algorithms)
            activity_key -> Activity key (must be specified if different from concept:name)
    
    Returns
    ---------
    filtered_log
        Filtered log
    """

    # the following filters are applied:
    # - activity filter (keeps only activities with a reasonable number of occurrences) (if enabled)
    # - variant filter (keeps only variants with a reasonable number of occurrences) (if enabled)
    # - start activities filter (keeps only variants that start with a plausible start activity) (if enabled)
    # - end activities filter (keeps only variants that end with a plausible end activity) (if enabled)

    if parameters is None:
        parameters = {}

    attribute_key = parameters.get(PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    decreasing_factor = parameters.get("decreasingFactor", filtering_constants.DECREASING_FACTOR)

    parameters_child = {
        "decreasingFactor": decreasing_factor,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: attribute_key,
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: attribute_key
    }

    enable_activities_filter = parameters.get("enable_activities_filter", True)
    enable_variants_filter = parameters.get("enable_variants_filter", False)
    enable_start_activities_filter = parameters.get("enable_start_activities_filter", False)
    enable_end_activities_filter = parameters.get("enable_end_activities_filter", True)

    variants = variants_module.get_variants(log, parameters=parameters_child)

    filtered_log = log
    if enable_activities_filter:
        filtered_log = attributes_filter.apply_auto_filter(
            log, variants=variants, parameters=parameters_child)
        variants = variants_module.get_variants(filtered_log,
                                                parameters=parameters_child)
    if enable_variants_filter:
        filtered_log = variants_module.apply_auto_filter(
            filtered_log, variants=variants, parameters=parameters_child)
        variants = variants_module.get_variants(filtered_log,
                                                parameters=parameters_child)
    if enable_start_activities_filter:
        filtered_log = start_activities_filter.apply_auto_filter(
            filtered_log, variants=variants, parameters=parameters_child)
    if enable_end_activities_filter:
        filtered_log = end_activities_filter.apply_auto_filter(
            filtered_log, variants=variants, parameters=parameters_child)

    return filtered_log
Example #18
    log,
    parameters={
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name",
        "decreasingFactor": 0.6
    })

activities = attributes_filter.get_attribute_values(log, "concept:name")

auto_filtered_activities = attributes_filter.get_attribute_values(
    log, "concept:name")

# Entire variants

from pm4py.algo.filtering.log.variants import variants_filter

variants = variants_filter.get_variants(log)

auto_filtered_log = variants_filter.apply_auto_filter(log)

auto_variants = variants_filter.get_variants(auto_filtered_log)

# Export the log

from pm4py.objects.log.exporter.xes import factory as xes_exporter

xes_exporter.export_log(
    log,
    "C:/Users/vince_000/Documents/BPI Challenge 2019/Exports/exportedLog_2.xes"
)

log[0]
Example #19
def apply(log, parameters=None):
    """
    Calculates the HW metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            beta -> beta value as described in the Wil SNA paper

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list. The final boolean indicates that the
        metric is directed.
    """
    if parameters is None:
        parameters = {}

    resource_key = parameters.get(constants.PARAMETER_CONSTANT_RESOURCE_KEY, xes.DEFAULT_RESOURCE_KEY)
    beta = parameters[BETA] if BETA in parameters else 0

    parameters_variants = {
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: resource_key,
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {
        x: len(y)
        for x, y in variants_filter.get_variants(
            log, parameters=parameters_variants).items()
    }
    variants_resources = list(variants_occ.keys())
    resources = [x.split(",") for x in variants_resources]
    flat_list = sorted(
        list(set([item for sublist in resources for item in sublist])))

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}

    for rv in resources:
        for i in range(len(rv) - 1):
            res_i = flat_list.index(rv[i])
            if not res_i in sum_i_to_j:
                sum_i_to_j[res_i] = {}
            for j in range(i + 1, len(rv)):
                res_j = flat_list.index(rv[j])
                if not res_j in sum_i_to_j[res_i]:
                    sum_i_to_j[res_i][res_j] = 0
                if beta == 0:
                    sum_i_to_j[res_i][res_j] += variants_occ[",".join(rv)]
                    break
                else:
                    sum_i_to_j[res_i][res_j] += variants_occ[",".join(rv)] * (
                        beta**(j - i - 1))

    dividend = 0
    for rv in resources:
        dividend = dividend + variants_occ[",".join(rv)] * (len(rv) - 1)

    for key1 in sum_i_to_j:
        for key2 in sum_i_to_j[key1]:
            metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend

    return [metric_matrix, flat_list, True]
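Calling sketch with a non-zero beta (assumes log is loaded; BETA is the module-level parameter key referenced inside the function):

matrix, resources, directed = apply(log, parameters={BETA: 0.5})
# matrix[i][j] weighs handovers from resources[i] to resources[j],
# with indirect successions discounted by beta^(distance - 1)
print(resources)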
Example #20
    def test_22(self):
        from pm4py.algo.filtering.log.variants import variants_filter
        log = self.load_running_example_xes()
        variants = variants_filter.get_variants(log)
Example #21
def apply(log, parameters=None):
    """
    Returns a log from which a sound workflow net can be extracted, assuming a discovery algorithm
    that returns models with only visible transitions

    Parameters
    ------------
    log
        Trace log
    parameters
        Possible parameters of the algorithm, including:
            discovery_algorithm -> Discovery algorithm to consider, possible choices: alphaclassic
            max_no_variants -> Maximum number of variants to consider to return a Petri net

    Returns
    ------------
    filtered_log
        Filtered log
    """
    from pm4py.evaluation.replay_fitness import factory as replay_fitness_factory

    if parameters is None:
        parameters = {}
    discovery_algorithm = parameters.get("discovery_algorithm", "alphaclassic")
    max_no_variants = parameters.get("max_no_variants", 20)
    all_variants_dictio = variants_filter.get_variants(log, parameters=parameters)
    all_variants_list = []
    for var in all_variants_dictio:
        all_variants_list.append([var, len(all_variants_dictio[var])])
    all_variants_list = sorted(all_variants_list, key=lambda x: (x[1], x[0]), reverse=True)
    considered_variants = []
    considered_traces = []

    i = 0
    while i < min(len(all_variants_list), max_no_variants):
        variant = all_variants_list[i][0]

        considered_variants.append(variant)
        considered_traces.append(all_variants_dictio[variant][0])
        filtered_log = EventLog(considered_traces)
        net = None
        initial_marking = None
        final_marking = None
        if discovery_algorithm == "alphaclassic" or discovery_algorithm == "alpha":
            net, initial_marking, final_marking = alpha_miner.apply(filtered_log, parameters=parameters)
        is_sound = check_soundness.check_petri_wfnet_and_soundness(net)
        if not is_sound:
            del considered_variants[-1]
            del considered_traces[-1]
        else:
            try:
                fitness = replay_fitness_factory.apply(filtered_log, net, initial_marking, final_marking,
                                                       parameters=parameters)
                if fitness["log_fitness"] < 0.99999:
                    del considered_variants[-1]
                    del considered_traces[-1]
            except TypeError:
                del considered_variants[-1]
                del considered_traces[-1]
        i = i + 1

    sound_log = EventLog()
    if considered_variants:
        sound_log = variants_filter.apply(log, considered_variants, parameters=parameters)

    return sound_log
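Usage sketch (assumes log is loaded; "alphaclassic" is the only discovery algorithm the function above handles):

sound_log = apply(log, parameters={"max_no_variants": 10})
print(len(sound_log), "traces kept out of", len(log))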
Example #22
def apply(log, parameters=None):
    """
    Calculates the Subcontracting metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            n -> n of the algorithm proposed in the Wil SNA paper

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list
    """
    if parameters is None:
        parameters = {}

    resource_key = parameters.get(constants.PARAMETER_CONSTANT_RESOURCE_KEY, xes.DEFAULT_RESOURCE_KEY)
    n = parameters[N] if N in parameters else 2

    parameters_variants = {
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: resource_key,
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {
        x: len(y)
        for x, y in variants_filter.get_variants(
            log, parameters=parameters_variants).items()
    }
    variants_resources = list(variants_occ.keys())
    resources = [x.split(",") for x in variants_resources]
    flat_list = sorted(
        list(set([item for sublist in resources for item in sublist])))

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}

    for rv in resources:
        for i in range(len(rv) - n):
            res_i = flat_list.index(rv[i])
            res_i_n = flat_list.index(rv[i + n])
            if res_i == res_i_n:
                if res_i not in sum_i_to_j:
                    sum_i_to_j[res_i] = {}
                    for j in range(i + 1, i + n):
                        res_j = flat_list.index(rv[j])
                        if res_j not in sum_i_to_j[res_i]:
                            sum_i_to_j[res_i][res_j] = 0
                        sum_i_to_j[res_i][res_j] += variants_occ[",".join(rv)]

    dividend = 0
    for rv in resources:
        dividend = dividend + variants_occ[",".join(rv)] * (len(rv) - 1)

    for key1 in sum_i_to_j:
        for key2 in sum_i_to_j[key1]:
            metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend

    return [metric_matrix, flat_list, True]
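Usage sketch (assumes log is loaded; N is the module-level parameter key referenced inside the function):

matrix, resources, directed = apply(log, parameters={N: 2})
# matrix[i][j] weighs how often resources[j] works in between two
# activities executed by resources[i] at distance n
print(matrix)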
Example #23
def apply_filter(req):
	sessions[req.session["id"]] = datetime.now()
	filters = {
		"time": True,
		"variants": True,
		"performance": True,
		"activities": True,
		"attribute": True
	}
	req.session.set_expiry(7200)
	#print(str(req.body))
	o = json.loads(req.body)
	print(str(o))
	custom_time_range = []
	for pair in o["filter1"]:
		#custom_time_range.append((dateutil.parser.parse(pair[0]),dateutil.parser.parse(pair[1])))
		custom_time_range.append((pair[0],pair[1]))
	if o["filter1"] == []:
		filters["time"] = False
	#print(o["filter1"][0])
	#print(custom_time_range[0][0])
	#print(custom_time_range)
	custom_path_range = []
	for pair in o["filter2"]:
		custom_path_range.append((float(pair[0]),float(pair[1])))
	if o["filter2"] == []:
		filters["variants"] = False
		#custom_path_range = [(0,1)] #filter2
	custom_performance_range = []
	for pair in o["filter3"]:
		custom_performance_range.append((float(pair[0]),float(pair[1])))
	if o["filter3"] == []:
		filters["performance"] = False
	custom_activitiy_range = []
	for pair in o["filter4"]:
		custom_activitiy_range.append((float(pair[0]),float(pair[1])))
	if o["filter4"] == []:
		filters["activities"] = False
		#custom_activitiy_range = [(0,1)] #filter3
	custom_attribute_range = []
	for pair in o["filter5"]:
		custom_attribute_range.append((float(pair[0]),float(pair[1])))
	if o["filter5"] == [] or o["filter5attribute"] == "Empty":
		filters["attribute"] = False
	additional_attribute = o["filter5attribute"]

	selected_viz = o["visualization"]
	calc_lev = o["distance"]
	#input_file = os.path.join("webapp","static", req.session["id"] + "_l0.xes")
	input_file = os.path.join("webapp","static", "sepsis.xes")
	input_log = xes_importer.apply(input_file)
	not_filtered_logs = {}
	flatten = lambda l: [item for sublist in l for item in sublist]

	time_timestamp_started = datetime.now()
	if filters["time"]:
		#TODO check overlapping for filter
		custom_time_range = sorted(custom_time_range, reverse=False)
		for i in range(0,len(custom_time_range)-1):
			if(custom_time_range[i][1] > custom_time_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for time filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping time ranges")

		logs = []
		for (x,y) in custom_time_range:
			logs.append(timestamp_filter.filter_traces_contained(input_log, x, y))

		#log = timestamp_filter.filter_traces_contained(input_log, custom_time_range[0][0], custom_time_range[0][1])
		log = pm4py.objects.log.log.EventLog()
		for timeslice in logs:
			for trace in timeslice:
				log.append(trace)
		print(len(input_log))
		print(len(log))
		#l2
		not_filtered_logs["timestamp_filter"] = pm4py.objects.log.log.EventLog()
		for trace in input_log:
			if trace not in log:
				not_filtered_logs["timestamp_filter"].append(trace)
		print(len(not_filtered_logs["timestamp_filter"]))
	else:
		log = input_log

	time_variants_started = datetime.now() # where should I start?

	if filters["variants"]:
		variants = variants_filter.get_variants(log)
		variants_count = case_statistics.get_variant_statistics(log)
		variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=False)

		custom_path_range = sorted(custom_path_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_path_range)-1):
			if(custom_path_range[i][1] > custom_path_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for variants filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping variants ranges")

		nr_variants = len(variants_count)
		idx = [(math.floor(x*nr_variants), math.ceil(y*nr_variants)) for (x,y) in custom_path_range]
		variants_subset = [variants_count[x:y+1] for (x,y) in idx]
		variants_subset = flatten(variants_subset)
		filtered_variants = {k:v for k,v in variants.items() if k in [x["variant"] for x in variants_subset]}
		#l2
		not_filtered_variants = {k:v for k,v in variants.items() if k not in [x["variant"] for x in variants_subset]}

		filtered_log = variants_filter.apply(log, filtered_variants)
		#l2
		not_filtered_logs["variant_filter"] = variants_filter.apply(log, not_filtered_variants)
	else:
		filtered_log = log

	time_variants_finished = datetime.now() # note: incl log2 generation

	if filters["performance"]:
		custom_performance_range = sorted(custom_performance_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_performance_range)-1):
			if(custom_performance_range[i][1] > custom_performance_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for performance filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping performance ranges")

		#all_case_durations = case_statistics.get_all_casedurations(log, parameters={case_statistics.Parameters.TIMESTAMP_KEY: "time:timestamp"})
		#case_filter.filter_case_performance(log, 86400, 864000)
		performances = []
		for i in range(len(filtered_log)):
			filtered_log[i].attributes["throughput"] = (max([event["time:timestamp"]for event in filtered_log[i]])-min([event["time:timestamp"] for event in filtered_log[i]])).total_seconds()
			performances.append(filtered_log[i].attributes["throughput"])

		nr_cases = len(filtered_log)
		performances = sorted(performances, reverse=False)
		idx = [(math.floor(x*nr_cases), math.ceil(y*nr_cases)) for (x,y) in custom_performance_range]
		perf_subset = [performances[x:y+1] for (x,y) in idx]
		perf_subset = flatten(perf_subset)

		performance_log = pm4py.objects.log.log.EventLog([trace for trace in filtered_log if trace.attributes["throughput"] in perf_subset])
		#l2
		not_filtered_logs["performance_filter"] = pm4py.objects.log.log.EventLog([trace for trace in filtered_log if trace.attributes["throughput"] not in perf_subset])
		#print(str(len(not_filtered_logs["performance_filter"])))

	else:
		performance_log = filtered_log

	time_performance_finished = datetime.now()

	if filters["activities"]:
		variants = variants_filter.get_variants(performance_log)
		variants_count = case_statistics.get_variant_statistics(performance_log)
		variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=False)

		activities = dict()
		for variant in variants_count:
			for activity in variant["variant"].split(","):
				if (activity not in activities.keys()):
					activities[activity] = variant["count"]
				else:
					activities[activity] += variant["count"]

		sorted_activities = {k: v for k, v in sorted(activities.items(), key=lambda item: item[1])}
		activities_sorted_list = list(sorted_activities)
		custom_activitiy_range = sorted(custom_activitiy_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_activitiy_range)-1):
			if(custom_activitiy_range[i][1] > custom_activitiy_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for activities filter"}))
				response.status_code = 200
				return response
				#raise ValueError("Overlapping activities ranges")
		nr_activities = len(activities_sorted_list)
		idx = [(math.floor(x*nr_activities), math.ceil(y*nr_activities)) for (x,y) in custom_activitiy_range]
		activities_to_keep = [activities_sorted_list[x:y+1] for (x,y) in idx]
		activities_to_keep = flatten(activities_to_keep)
		variants_idx = []
		for i in range(len(variants_count)):
			for activity in activities_to_keep:
				if (activity in variants_count[i]["variant"].split(",") and (i not in variants_idx)):
					variants_idx.append(i)
		variants_subset = [variants_count[i] for i in variants_idx]
		filtered_variants = {k:v for k,v in variants.items() if k in [x["variant"] for x in variants_subset]}
		#l2
		not_filtered_variants = {k:v for k,v in variants.items() if k not in [x["variant"] for x in variants_subset]}

		filtered_log = variants_filter.apply(performance_log, filtered_variants)

		#l2
		not_filtered_logs["activities_filter"] = variants_filter.apply(performance_log, not_filtered_variants)

		new_log = pm4py.objects.log.log.EventLog()
		#not_filtered_logs["activities_filter_traces"] = pm4py.objects.log.log.EventLog()
		for trace in filtered_log:
			new_trace = pm4py.objects.log.log.Trace()
			not_new_trace = pm4py.objects.log.log.Trace()
			for event in trace:
				if(event['concept:name'] in activities_to_keep):
					new_trace.append(event)
				else:
					not_new_trace.append(event)
			if(len(new_trace)>0):
				new_log.append(new_trace)
			if(len(not_new_trace)>0):
				not_filtered_logs["activities_filter"].append(not_new_trace)
	else:
		new_log = performance_log

	time_activities_finished = datetime.now()

	if filters["attribute"]:
		custom_attribute_range = sorted(custom_attribute_range, reverse=False)
		# check overlapping
		for i in range(0,len(custom_attribute_range)-1):
			if(custom_attribute_range[i][1] > custom_attribute_range[i+1][0]):
				response = HttpResponse(json.dumps({'error': "Wrong intervals for additional attribute filter"}))
				response.status_code = 200
				return response

		newest_log = pm4py.objects.log.log.EventLog()
		not_filtered_logs["additional_filter"] = pm4py.objects.log.log.EventLog()

		traces_with_attr = []
		not_traces_with_attr = []
		for trace in new_log:
			if additional_attribute in trace.attributes.keys():
				traces_with_attr.append(trace)
			else:
				not_traces_with_attr.append(trace)
		#check if trace attribute
		if len(traces_with_attr)>0:
			#check if numeric
			if type(traces_with_attr[0].attributes[additional_attribute]) in [int, float]:
				for trace in traces_with_attr:
					if any([trace.attributes[additional_attribute] >= x and trace.attributes[additional_attribute] <= y for (x,y) in custom_attribute_range]):
						newest_log.append(trace)
					else:
						not_filtered_logs["additional_filter"].append(trace)
				for trace in not_traces_with_attr:
					not_filtered_logs["additional_filter"].append(trace)
			else: #string
				attribute_frequencies = dict()
				for trace in traces_with_attr:
					if trace.attributes[additional_attribute] not in attribute_frequencies.keys():
						attribute_frequencies[trace.attributes[additional_attribute]] = 0
					attribute_frequencies[trace.attributes[additional_attribute]] += 1

				sorted_frequencies = {k: v for k, v in sorted(attribute_frequencies.items(), key=lambda item: item[1])}
				frequencies_sorted_list = list(sorted_frequencies)

				nr_values = len(frequencies_sorted_list)
				idx = [(math.floor(x*nr_values), math.ceil(y*nr_values)) for (x,y) in custom_attribute_range]
				values_to_keep = [frequencies_sorted_list[x:y+1] for (x,y) in idx]
				values_to_keep = flatten(values_to_keep)

				for trace in traces_with_attr:
					if trace.attributes[additional_attribute] in values_to_keep:
						newest_log.append(trace)
					else:
						not_filtered_logs["additional_filter"].append(trace)
				for trace in not_traces_with_attr:
					not_filtered_logs["additional_filter"].append(trace)

		else: #event attribute
			if [type(event[additional_attribute]) for trace in new_log for event in trace if additional_attribute in event.keys()][0] in [int, float]:
				for trace in new_log:
					new_trace = pm4py.objects.log.log.Trace()
					not_new_trace = pm4py.objects.log.log.Trace()
					for event in trace:
						if(additional_attribute in event.keys() and any([event[additional_attribute] >= x and event[additional_attribute] <= y for (x,y) in custom_attribute_range ])):
							new_trace.append(event)
						else:
							not_new_trace.append(event)
					if(len(new_trace)>0):
						newest_log.append(new_trace)
					if(len(not_new_trace)>0):
						not_filtered_logs["additional_filter"].append(not_new_trace)
			else: #string
				attribute_frequencies = dict()
				for trace in new_log:
					for event in trace:
						if additional_attribute in event.keys():
							if event[additional_attribute] not in attribute_frequencies.keys():
								attribute_frequencies[event[additional_attribute]] = 0
							attribute_frequencies[event[additional_attribute]] += 1

				sorted_frequencies = {k: v for k, v in sorted(attribute_frequencies.items(), key=lambda item: item[1])}
				frequencies_sorted_list = list(sorted_frequencies)

				nr_values = len(frequencies_sorted_list)
				idx = [(math.floor(x*nr_values), math.ceil(y*nr_values)) for (x,y) in custom_attribute_range]
				values_to_keep = [frequencies_sorted_list[x:y+1] for (x,y) in idx]
				values_to_keep = flatten(values_to_keep)

				for trace in new_log:
					new_trace = pm4py.objects.log.log.Trace()
					not_new_trace = pm4py.objects.log.log.Trace()
					for event in trace:
						if(additional_attribute in event.keys() and event[additional_attribute] in values_to_keep):
							new_trace.append(event)
						else:
							not_new_trace.append(event)
					if(len(new_trace)>0):
						newest_log.append(new_trace)
					if(len(not_new_trace)>0):
						not_filtered_logs["additional_filter"].append(not_new_trace)


	else:
		newest_log = new_log

	time_attribute_finished = datetime.now()

	if(selected_viz=="dfgf"):
		dfg = dfg_discovery.apply(newest_log)
		gviz = dfg_visualization.apply(dfg, log=newest_log, variant=dfg_visualization.Variants.FREQUENCY)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png"))
	elif(selected_viz=="dfgp"):
		dfg = dfg_discovery.apply(newest_log)
		gviz = dfg_visualization.apply(dfg, log=newest_log, variant=dfg_visualization.Variants.PERFORMANCE)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png"))
	else:
		heu_net = heuristics_miner.apply_heu(newest_log, parameters={"dependency_thresh": 0.99})
		gviz = hn_vis_factory.apply(heu_net)
		hn_vis_factory.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png"))

	xes_exporter.apply(newest_log, os.path.join("webapp","static", req.session["id"] + "_l1.xes"))


	#l2
	not_filtered_log = pm4py.objects.log.log.EventLog()
	for part in not_filtered_logs.keys():
		for trace in not_filtered_logs[part]:
			not_filtered_log.append(trace)

	if(selected_viz=="dfgf"):
		dfg = dfg_discovery.apply(not_filtered_log)
		gviz = dfg_visualization.apply(dfg, log=not_filtered_log, variant=dfg_visualization.Variants.FREQUENCY)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png"))
	elif(selected_viz=="dfgp"):
		dfg = dfg_discovery.apply(not_filtered_log)
		gviz = dfg_visualization.apply(dfg, log=not_filtered_log, variant=dfg_visualization.Variants.PERFORMANCE)
		dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png"))
	else:
		heu_net = heuristics_miner.apply_heu(not_filtered_log, parameters={"dependency_thresh": 0.99})
		gviz = hn_vis_factory.apply(heu_net)
		hn_vis_factory.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png"))
	xes_exporter.apply(not_filtered_log, os.path.join("webapp","static", req.session["id"] + "_l2.xes"))

	if(calc_lev):
		lev_new = [0]*len(newest_log)
		for i in range(len(newest_log)):
			lev_new[i] = [hash(event['concept:name']) for event in newest_log[i]]

		lev_not = [0]*len(not_filtered_log)
		for i in range(len(not_filtered_log)):
			lev_not[i] = [hash(event['concept:name']) for event in not_filtered_log[i]]

		distances = []
		for i in range(len(lev_new)):
			for j in range(len(lev_not)):
				distances.append(lev_dist(lev_new[i], lev_not[j]))
		lev_d = sum(distances)/len(distances)
		print("Levenshtein's distance: "+str(lev_d))
	else:
		lev_d = "null"

	used_paths = 0
	for lower, higher in custom_path_range:
		used_paths += round((higher-lower)*100)
	print(f"Using {used_paths}% of paths. {100-used_paths}% of paths are discarded.")

	print("Timestamp filter: {} seconds. \nVariants filter: {} seconds. \nPerformance filter: {} seconds. \nActivities filter: {} seconds. \nAttribute filter: {} seconds.".format((time_variants_started - time_timestamp_started).total_seconds(), (time_variants_finished - time_variants_started).total_seconds(), (time_performance_finished - time_variants_finished).total_seconds(), (time_activities_finished - time_performance_finished).total_seconds(), (time_attribute_finished - time_activities_finished).total_seconds()))
	response = HttpResponse(json.dumps({'time':(time_variants_started - time_timestamp_started).total_seconds(), 'variants':(time_variants_finished - time_variants_started).total_seconds(),'performance':(time_performance_finished - time_variants_finished).total_seconds(), 'activities':(time_activities_finished - time_performance_finished).total_seconds(), 'attribute':(time_attribute_finished - time_activities_finished).total_seconds(), 'traces':[len(newest_log), len(not_filtered_log)], 'distance':lev_d}))
	response.status_code = 200
	return response
Example #24
# filter out activities representing work items
w_activities_2017 = [i for i in activities_2017.keys() if i.startswith('W_')]
fil_log_17 = attributes_filter.apply_events(
    log,
    w_activities_2017,
    parameters={
        attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name",
        "positive": True
    })

# instances
print("2017 instances: ", len(fil_log_17))

# variants
variants = variants_filter.get_variants(fil_log_17)
print("2017 variants: ", len(variants))

# instances per variant
sum_val = 0
for value in variants.values():
    sum_val += len(value)
ipv = sum_val / len(variants)
print("2017 instances per variant", ipv)

# events
events_2017 = 0
for trace in fil_log_17:
    events_2017 += len(trace)
print("2017 events", events_2017)