def test_case_statistics(self):
    from pm4py.statistics.traces.log import case_statistics
    log = self.get_log()
    case_statistics.get_kde_caseduration(log)
    case_statistics.get_events(log, "N77802")
    case_statistics.get_variant_statistics(log)
    case_statistics.get_cases_description(log)
    case_statistics.get_all_casedurations(log)
    case_statistics.get_first_quartile_caseduration(log)
    case_statistics.get_median_caseduration(log)
def get_variants_list(log, parameters=None):
    """
    Gets the list of variants (along with their count) from the given log

    Parameters
    ------------
    log
        Log
    parameters
        Parameters of the algorithm

    Returns
    -------------
    variants_list
        List of variants of the log (along with their count)
    """
    import pandas as pd
    from pm4py.statistics.traces.pandas import case_statistics as pd_case_statistics
    from pm4py.statistics.traces.log import case_statistics as log_case_statistics

    variants_list = []
    if type(log) is pd.DataFrame:
        pd_variants = pd_case_statistics.get_variant_statistics(log, parameters=parameters)
        for var in pd_variants:
            varkeys = list(var.keys())
            del varkeys[varkeys.index("variant")]
            variants_list.append((var["variant"], var[varkeys[0]]))
    else:
        log_variants = log_case_statistics.get_variant_statistics(log, parameters=parameters)
        for var in log_variants:
            varkeys = list(var.keys())
            del varkeys[varkeys.index("variant")]
            variants_list.append((var["variant"], var[varkeys[0]]))
    return variants_list
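# A minimal usage sketch for get_variants_list; "log.xes" is a hypothetical
# placeholder path, and the importer call follows the factory API used in the
# other examples here.
from pm4py.objects.log.importer.xes import factory as xes_import_factory

log = xes_import_factory.apply("log.xes")  # hypothetical file
for variant, count in get_variants_list(log):
    print(variant, count)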
def W_creater(log, R, w, output=False):
    # Builds W, the list of the most frequent variants of the log (restricted to
    # the variants in R) whose cumulative count exceeds a fraction w of the log.
    from pm4py.algo.filtering.log.variants import variants_filter
    from pm4py.statistics.traces.log import case_statistics

    W = []
    log = variants_filter.apply(log, R)
    target_size = len(log) * w  # determines the size of W
    variant = case_statistics.get_variant_statistics(log)
    variant = sorted(variant, key=lambda x: x['count'], reverse=True)
    if output:
        print("=" * 100,
              "\nW creater called with w : {} and target size {}\n".format(w, target_size))
    W_size = 0
    for v in variant:
        W_size += v['count']
        W.append(v['variant'])
        if output:
            print("\t\t{}___added with size {} // {} out of {} // total size : {}"
                  .format(v['variant'][:60], v['count'], W_size, target_size, len(log)))
        if W_size > target_size:
            break
    if output:
        print("W creater END with its size: {}".format(len(W)))
        print("=" * 100)
    return W
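# A minimal sketch of calling W_creater, assuming a loaded log and taking R to
# be the full variant list (so no variant is excluded up front); "log.xes" is a
# hypothetical placeholder path.
from pm4py.objects.log.importer.xes import factory as xes_import_factory
from pm4py.algo.filtering.log.variants import variants_filter

log = xes_import_factory.apply("log.xes")  # hypothetical file
R = list(variants_filter.get_variants(log).keys())  # keep all variants
W = W_creater(log, R, w=0.8, output=True)  # variants covering ~80% of traces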
def variant_filter(log):
    # Keeps only the traces belonging to 1000 randomly sampled variants.
    # Assumes the log has at least 1000 variants; `unique_activities` is a
    # helper defined elsewhere in the module.
    import random
    from pm4py.objects.log.log import EventLog
    from pm4py.statistics.variants.log import get
    from pm4py.statistics.traces.log import case_statistics

    new_log = EventLog()
    result = []
    variant_list = get.get_variants(log)
    variant_list_count = case_statistics.get_variant_statistics(log)
    sampled = random.sample(variant_list_count, 1000)
    vlist = [v['variant'] for v in variant_list_count]
    vlist_s = [v['variant'] for v in sampled]
    for v in vlist:
        if v in vlist_s:
            for trace in variant_list[v]:
                new_log.append(trace)
    new_len = len(case_statistics.get_variant_statistics(new_log))
    result.extend([new_len, len(new_log), len(unique_activities(new_log))])
    return new_log
def sublog_percent(log, upper_percent, parameters=None):
    '''
    change the variant dictionary obtained from the sublog into a dataframe,
    so that we can extract the frequency of each variant
    :param log: same as sublog2varlist()
    :param upper_percent: upper bound on the cumulative variant frequency
    :return: dataframe of variants with their counts, together with the corresponding var_list (up to the percent)
    '''
    if parameters is None:
        parameters = {}
    lower_percent = exec_utils.get_param_value(Parameters.LOWER_PERCENT, parameters, 0)
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    # calculate the cumulative sum
    csum = np.array(df['count']).cumsum()
    csum = csum / csum[-1]
    num_list = csum[csum <= upper_percent]
    num_list_lower = csum[csum <= lower_percent]
    # stop once the percent is satisfied
    df_w_count = df.iloc[len(num_list_lower):len(num_list), :]
    # get the corresponding var_list
    filtered_var_list = df_w_count['variant'].values.tolist()
    str_var_list = [variants_util.get_activities_from_variant(v) for v in filtered_var_list]
    return df_w_count, str_var_list
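# A minimal sketch of calling sublog_percent, assuming the surrounding module's
# imports (pd, np, exec_utils, Parameters, case_statistics, variants_util) and a
# log already loaded into `log`. It keeps the variants that fall between 10%
# and 80% cumulative frequency.
df_w_count, str_var_list = sublog_percent(log, 0.8,
                                          parameters={Parameters.LOWER_PERCENT: 0.1})
print(df_w_count[['variant', 'count']])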
def sublog_percent2varlist(log, upper_percent, parameters=None):
    '''
    same as sublog_percent(), but only the raw variant list is needed
    :param log: same as sublog2varlist()
    :param upper_percent: upper bound on the cumulative variant frequency
    :return: dataframe of variants with their counts, together with the corresponding var_list (up to the percent)
    '''
    if parameters is None:
        parameters = {}
    lower_percent = exec_utils.get_param_value(Parameters.LOWER_PERCENT, parameters, 0)
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    # calculate the cumulative sum
    csum = np.array(df['count']).cumsum()
    csum = csum / csum[-1]
    num_list = csum[csum <= upper_percent]
    num_list_lower = csum[csum <= lower_percent]
    # stop once the percent is satisfied
    df_w_count = df.iloc[len(num_list_lower):len(num_list), :]
    # get the corresponding var_list
    filtered_var_list = df_w_count['variant'].values.tolist()
    return df_w_count, filtered_var_list
def trace_variant(log):
    # Computes variant-based features of the log; assumes `import numpy as np`
    # and `from scipy import stats` at module level.
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x["count"], reverse=True)
    occurrences = [x["count"] for x in variants_count]
    len_occurr, len_log = len(occurrences), len(log)
    ratio_most_common_variant = sum(occurrences[:1]) / len_log
    ratio_top_1_variants = sum(occurrences[:int(len_occurr * 0.01)]) / len_log
    ratio_top_5_variants = sum(occurrences[:int(len_occurr * 0.05)]) / len_log
    ratio_top_10_variants = sum(occurrences[:int(len_occurr * 0.1)]) / len_log
    ratio_top_20_variants = sum(occurrences[:int(len_occurr * 0.2)]) / len_log
    ratio_top_50_variants = sum(occurrences[:int(len_occurr * 0.5)]) / len_log
    ratio_top_75_variants = sum(occurrences[:int(len_occurr * 0.75)]) / len_log
    mean_variant_occurrence = np.mean(occurrences)
    std_variant_occurrence = np.std(occurrences)
    skewness_variant_occurrence = stats.skew(occurrences)
    kurtosis_variant_occurrence = stats.kurtosis(occurrences)
    return [
        ratio_most_common_variant,
        ratio_top_1_variants,
        ratio_top_5_variants,
        ratio_top_10_variants,
        ratio_top_20_variants,
        ratio_top_50_variants,
        ratio_top_75_variants,
        mean_variant_occurrence,
        std_variant_occurrence,
        skewness_variant_occurrence,
        kurtosis_variant_occurrence,
    ]
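# A minimal sketch of using trace_variant on an already-loaded log; the label
# list simply mirrors the order of the returned feature vector.
labels = ["ratio_most_common_variant", "ratio_top_1_variants", "ratio_top_5_variants",
          "ratio_top_10_variants", "ratio_top_20_variants", "ratio_top_50_variants",
          "ratio_top_75_variants", "mean_variant_occurrence", "std_variant_occurrence",
          "skewness_variant_occurrence", "kurtosis_variant_occurrence"]
for name, value in zip(labels, trace_variant(log)):
    print(name, value)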
def sublog2varlist(log, freq_thres, num):
    '''
    extract lists of variants from the selected sublogs, using a frequency
    threshold to filter out infrequent variants
    :param log: sublog containing the selected case attribute value
    :param freq_thres: (int) frequency threshold to filter out infrequent variants
    :param num: (int) keep the num most frequent variants even if below the threshold
    :return: lists of variant strings
    '''
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    filtered_var_list = []
    filtered_var_list_1 = []
    filtered_var_list_2 = []
    for i in range(len(variants_count)):
        if variants_count[i]['count'] >= freq_thres:
            filtered_var_list_1.append(variants_count[i]['variant'])  # variant string
        elif i < num:
            filtered_var_list_2.append(variants_count[i]['variant'])
    # the union of the two lists preserves the frequency ordering
    filtered_var_list = filtered_var_list_1 + filtered_var_list_2
    str_var_list = [variants_util.get_activities_from_variant(v) for v in filtered_var_list]
    return str_var_list
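# A minimal sketch of calling sublog2varlist on an already-loaded log, assuming
# the module-level imports (case_statistics, variants_util): keep every variant
# occurring at least 5 times, plus the 10 most frequent ones regardless.
str_var_list = sublog2varlist(log, freq_thres=5, num=10)
print(len(str_var_list), "variants kept")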
def test_obtaining_variants(self):
    # assign a dummy instance attribute to avoid the static-method warnings
    # that the unittest package would otherwise raise for this test
    self.dummy_variable = "dummy_value"
    input_log = os.path.join(INPUT_DATA_DIR, "running-example.xes")
    log = xes_importer.import_log(input_log)
    stats = case_statistics.get_variant_statistics(log)
    del stats
def get_statistics(period_1_log, period_2_log):
    variants_count1 = case_statistics.get_variant_statistics(period_1_log)
    variants_count1 = sorted(variants_count1, key=lambda x: x['count'], reverse=True)
    variants_count2 = case_statistics.get_variant_statistics(period_2_log)
    variants_count2 = sorted(variants_count2, key=lambda x: x['count'], reverse=True)
    trace_count1 = 0
    trace_count2 = 0
    for i in variants_count1:
        trace_count1 += i["count"]
    for i in variants_count2:
        trace_count2 += i["count"]
def sublog2df_num(log, num):
    '''
    change the variant dictionary obtained from the sublog into a dataframe,
    so that we can extract the frequency of each variant
    :param log: same as sublog2varlist()
    :param num: (int) number of most frequent variants to keep
    :return: dataframe of variants with their counts
    '''
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    df_w_count = df.iloc[0:num, :]
    return df_w_count
def filter_variants(self, filter_level):
    variants_count = case_statistics.get_variant_statistics(self.log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    total_traces = len(self.log)
    total_variants = len(variants_count)
    filter_threshold = (1 / total_variants) * filter_level
    desired_variants = [v['variant'] for v in variants_count
                        if v['count'] / total_traces >= filter_threshold]
    self.log = variants_filter.apply(self.log, desired_variants)
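# A worked example of the threshold arithmetic in filter_variants, on
# hypothetical numbers: with 50 distinct variants and filter_level=2, only
# variants covering at least 2/50 = 4% of all traces survive the filter.
total_variants, filter_level = 50, 2
filter_threshold = (1 / total_variants) * filter_level
print(filter_threshold)  # 0.04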
def dpi_distribution(log):
    '''
    input  : event log object - log
    output : numpy array d of variant counts, sorted in decreasing order
    '''
    import numpy as np
    from pm4py.statistics.traces.log import case_statistics

    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    d = np.zeros(len(variants_count))
    for i, v in enumerate(variants_count):
        d[i] = v['count']
    return d
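# A minimal sketch that plots the variant-count distribution returned by
# dpi_distribution; assumes matplotlib is installed and a log is loaded.
import matplotlib.pyplot as plt

d = dpi_distribution(log)
plt.bar(range(len(d)), d)
plt.xlabel("variant rank")
plt.ylabel("count")
plt.show()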
def sublog2df(log, freq_thres, num):
    '''
    change the variant dictionary obtained from the sublog into a dataframe,
    so that we can extract the frequency of each variant
    :param log: same as sublog2varlist()
    :param freq_thres: same as sublog2varlist()
    :param num: (int) number of most frequent variants to keep
    :return: dataframe of variants with their counts
    '''
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    df = pd.DataFrame.from_dict(variants_count)
    df_w_count_1 = df[df['count'] >= freq_thres]
    df_w_count_2 = df.iloc[0:num, :]
    # take the union of the two dataframes
    df_w_count = pd.merge(df_w_count_1, df_w_count_2, how='outer', on=['variant', 'count'])
    # display(df_w_count['variant'])
    return df_w_count
def stat(log):
    '''
    Shows statistical information of the log
    :param log: input log
    :return: stat_dict (# of events, # of variants, # of cases)
    '''
    a = case_statistics.get_variant_statistics(log)
    num_event = 0
    for trace in log:
        num_event += len(trace)
    stat_dict = {}
    stat_dict['events'] = num_event
    stat_dict['variants'] = len(a)
    stat_dict['cases'] = len(log)
    return stat_dict
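# A minimal usage sketch for stat; "log.xes" is a hypothetical placeholder path.
from pm4py.objects.log.importer.xes import factory as xes_import_factory

log = xes_import_factory.apply("log.xes")  # hypothetical file
print(stat(log))  # {'events': ..., 'variants': ..., 'cases': ...}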
def execFreqCase(clusters, EF_df):
    # Computes, per cluster and activity, the case-level execution frequency
    # (EFc) and merges it into EF_df. Assumes `import re`, `import pandas as pd`
    # and `case_statistics` at module level.
    activityL = EF_df['activity'].unique().tolist()
    variant_EF_A = []
    for clusteri in range(len(clusters)):
        # per cluster, get the variants along with their count
        variants_count = case_statistics.get_variant_statistics(clusters[clusteri])
        for variant in range(len(variants_count)):
            # per variant, count the number of occurrences of each activity
            for key, value in variants_count[variant].items():
                if key == "variant":
                    activityVariant = []
                    for i in range(len(activityL)):
                        # re.escape guards against regex metacharacters in activity names
                        EF = len(re.findall(re.escape(activityL[i]), value))
                        if EF > 0:
                            activityVariant.append({
                                'cluster': clusteri,
                                'variant': variant,
                                'activity': activityL[i],
                                'EF': EF
                            })
                else:
                    # also include the count of this variant
                    for item in activityVariant:
                        item.update({"count": value})
                    variant_EF_A.extend(activityVariant)
    variant_EF_A_df = pd.DataFrame.from_dict(variant_EF_A, orient='columns', dtype=None)
    variant_EF_A_df['EFsum'] = variant_EF_A_df.apply(lambda x: x['EF'] * x['count'], axis=1)
    EFc_df = variant_EF_A_df.groupby(by=['cluster', 'activity']).agg({
        'EFsum': "sum",
        'count': "sum"
    }).reset_index()
    EFc_df['EFc'] = EFc_df.apply(lambda x: x['EFsum'] / x['count'], axis=1)
    EF_EFc_df = pd.merge(left=EF_df,
                         right=EFc_df.drop(['EFsum', 'count'], axis=1),
                         right_on=['cluster', 'activity'],
                         left_on=['cluster', 'activity'],
                         how='left')
    EF_EFc_df = EF_EFc_df.rename(columns={'activityCount': 'EF'})
    EF_EFc_df = EF_EFc_df.fillna(0)
    return EF_EFc_df
def compute_variant_variability(logpath):
    """
    python function for computing variants in a log.

    Args:
        logpath (path): The path of the event log to parse

    Returns:
        Number of distinct variants in the log and a dataframe listing variants and their frequencies
    """
    log = xes_import_factory.apply(logpath)
    variants_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
    variants_count = pd.DataFrame(variants_count)
    return len(variants_count), variants_count
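# A minimal usage sketch; "log.xes" is a hypothetical placeholder path.
n_variants, variants_df = compute_variant_variability("log.xes")
print(n_variants)
print(variants_df.head())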
def get_statistics(log, parameters=None):
    """
    Gets the variants from the event log

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    variants
        Variants of the event log
    """
    if parameters is None:
        parameters = {}
    variants_statistics = case_statistics.get_variant_statistics(log, parameters=parameters)
    return variants_statistics
def compute_my_variability(logpath):
    """
    Python function for computing TRACE ENTROPY, taking as probabilities
    exactly the VARIANT frequencies observed in the log. This is restricted to
    the simple likelihood (frequency-based) estimator; other, more complex
    estimators exist.

    Args:
        logpath (path): The path of the event log to parse

    Returns:
        Trace entropy of all variants in the log (base2, and base10)
    """
    log = xes_import_factory.apply(logpath)
    variants_count = case_statistics.get_variant_statistics(log)
    variant_trace_df = pd.DataFrame(variants_count)
    # calculate the probability using the frequency of each variant
    variant_trace_df['probability'] = variant_trace_df['count'] / sum(variant_trace_df['count'])
    print('Entropy (base2) and Entropy (base10)')
    return entropy(variant_trace_df['probability'], base=2), \
           entropy(variant_trace_df['probability'], base=10)
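# A worked sanity check for the entropy computation above, on hypothetical
# counts: two variants with counts [3, 1] give probabilities [0.75, 0.25] and a
# base-2 Shannon entropy of -(0.75*log2(0.75) + 0.25*log2(0.25)) ≈ 0.811.
from scipy.stats import entropy

print(entropy([0.75, 0.25], base=2))  # ≈ 0.8113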
log_af_sa = start_activities_filter.apply_auto_filter(
    log, parameters={"decreasingFactor": 0.6})
print(start_activities_filter.get_start_activities(log_af_sa))

from pm4py.algo.filtering.log.end_activities import end_activities_filter
log_af_ea = end_activities_filter.apply_auto_filter(
    log, parameters={"decreasingFactor": 0.6})
print(end_activities_filter.get_end_activities(log_af_ea))

# traces
from pm4py.algo.filtering.log.variants import variants_filter
variants = variants_filter.get_variants(log)
variants

from pm4py.statistics.traces.log import case_statistics
variants_count = case_statistics.get_variant_statistics(log)
variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
print(variants_count)
print(len(variants_count))

# most common
filtered_log1 = variants_filter.apply(log, [
    "Confirmation of receipt,T02 Check confirmation of receipt,T04 Determine confirmation of receipt,T05 Print and send confirmation of receipt,T06 Determine necessity of stop advice,T10 Determine necessity to stop indication"
])
filtered_log1
variants_count_filtered_log1 = case_statistics.get_variant_statistics(filtered_log1)
print(variants_count_filtered_log1)

# ---
from pm4py.algo.filtering.log.attributes import attributes_filter
def new_cluster(log, neighbourhood_size, minimum_cluster_size, distance_technique,
                discovery_technique, max_distance):
    print('***********New cluster initialization starts!*********\n')
    iteration = 0
    f1_score = 0
    if f1_score == 0:
        cluster = EventLog()
        # if iteration == 0:
        #     variants_count_list = case_statistics.get_variant_statistics(log)
        # else:
        #     variants_count_list = case_statistics.get_variant_statistics(log)
        #     random.shuffle(variants_count_list)
        variants_count_list = case_statistics.get_variant_statistics(log)
        variant_list = get.get_variants(log)
        frequent = variants_count_list[0]['variant']
        frequent_flag = variant_list[frequent][0].flag
        print(f'The most frequent variant is: {frequent} with flag: {frequent_flag}')
        """ Building a cluster using KNN (optional)
        neighbour_variants = find_nearest_neighbours(log, frequent, variants_count_list,
                                                     variant_list, neighbourhood_size)
        for neighbour in neighbour_variants:
            trace_list = variant_list[neighbour]
            print(len(trace_list))
            for index, variant_trace in enumerate(trace_list):
                cluster.append(variant_trace)
                log.remove(variant_trace)
        """
        # """ Building a cluster using the most frequent variants
        for trace in variant_list[frequent]:
            cluster.append(trace)
            log.remove(trace)
        # log = EventLog(filter(lambda x: x not in cluster, log))
        for neighbourhood, variant in enumerate(variants_count_list):
            if neighbourhood == 0:
                continue
            if neighbourhood < neighbourhood_size:
                variant_flow = variant['variant']
                neighbour_trace = variant_list[variant_flow][0]
                print("********** Flags! ************ ")
                print(neighbour_trace.flag)
                print(frequent_flag)
                if neighbour_trace.flag != frequent_flag:
                    if distance_technique == 'BOA':
                        frequent_trace = variant_list[frequent][0]
                        # neighbour_trace = variant_list[variant_flow][0]
                        similarity_distance = distance.euclidean(
                            bag_of_activities(frequent_trace, log),
                            bag_of_activities(neighbour_trace, log))
                    if distance_technique == 'levenshtein':
                        similarity_distance = levenshtein(frequent, variant_flow)
                    print(f'Distance with {variant_flow} is: {similarity_distance}')
                    if similarity_distance <= max_distance:
                        for trace in variant_list[variant_flow]:
                            cluster.append(trace)
                            log.remove(trace)
                        # log = EventLog(filter(lambda x: x not in cluster, log))
            else:
                break
        # """
        print(f'length of cluster: {len(cluster)}, log: {len(log)}')
        # net, im, fm = heuristics_miner.apply(cluster, parameters={"dependency_thresh": 0.99})
        # fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
        #     variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
        # precision = precision_evaluator.apply(cluster, net, im, fm,
        #     variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
        # f1_score = 2 * (fitness["log_fitness"] * precision) / (fitness["log_fitness"] + precision)
        eval = cluster_evaluation(cluster, discovery_technique)
        fitness = eval[0]
        precision = eval[1]
        f1_score = eval[2]
        print(f'f1-score is: {f1_score}')
        iteration += 1
        trace_distribution(cluster, log, minimum_cluster_size, discovery_technique, f1_score)
    return cluster, log
def trace_distribution(cluster, log, minimum_cluster_size, discovery_technique, score):
    print('***********Trace Distribution Starts!*********\n')
    print(f'length of cluster: {len(cluster)}, log: {len(log)}')
    variants_count_list = case_statistics.get_variant_statistics(log)
    # variants_count_list_sampled = sample(variants_count_list, int(len(variants_count_list) / 4))
    # variants_count_list = variants_count_list_sampled
    variant_trace_list = get.get_variants(log)
    # if discovery_technique == 'heuristic miner':
    #     net, im, fm = heuristics_miner.apply(cluster, parameters={"dependency_thresh": 0.99})
    # if discovery_technique == 'inductive miner':
    #     net, im, fm = inductive_miner.apply(cluster)
    # initial_fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
    #     variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
    # initial_precision = precision_evaluator.apply(cluster, net, im, fm,
    #     variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
    # current_f1_score_initial = 2 * (initial_fitness["log_fitness"] * initial_precision) / (
    #     initial_fitness["log_fitness"] + initial_precision)
    current_f1_score = score
    print(f'initial f1 is: {current_f1_score}')
    for variant in variants_count_list:
        variant_flow = variant['variant']
        trace = variant_trace_list[variant_flow][0]
        cluster.append(trace)
        # net, im, fm = inductive_miner.apply(cluster)
        # new_fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
        #     variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
        # new_precision = precision_evaluator.apply(cluster, net, im, fm,
        #     variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
        # new_f1_score = 2 * (new_fitness["log_fitness"] * new_precision) / (
        #     new_fitness["log_fitness"] + new_precision)
        eval = cluster_evaluation(cluster, discovery_technique)
        new_f1_score = eval[2]
        # print(f'new fitness is: {initial_fitness}')
        print(f'current f1-score is: {current_f1_score}')
        print(f'new f1 is: {new_f1_score}')
        if current_f1_score <= new_f1_score:
            print(f'*****Improved the model!****: {trace}')
            cluster.remove(trace)
            """ Optional use of KNN to find neighbours of a qualified variant
            temp_variant_list = case_statistics.get_variant_statistics(log)
            temp_variant_trace_list = get.get_variants(log)
            neighbour_variants = find_nearest_neighbours(log, variant_flow, temp_variant_list,
                                                         temp_variant_trace_list, 5)
            for v in neighbour_variants:
                print(f'v is {v}')
                cc = temp_variant_trace_list[v]
                print(len(cc))
            # for index, variant_instance in enumerate(variant_trace_list[variant_flow]):
            for index, variant_instance in enumerate(cc):
                cluster.append(variant_instance)
                log.remove(variant_instance)
            print(f'length of cluster: {len(cluster)}, log: {len(log)}')
            # , sample: {len(sampled_log)}')
            net, im, fm = inductive_miner.apply(cluster)
            new_fitness = replay_fitness_evaluator.apply(cluster, net, im, fm,
                variant=replay_fitness_evaluator.Variants.TOKEN_BASED)
            new_precision = precision_evaluator.apply(cluster, net, im, fm,
                variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
            new_f1_score = 2 * (new_fitness["log_fitness"] * new_precision) / (
                new_fitness["log_fitness"] + new_precision)
            current_f1_score = new_f1_score
            continue
            """
            # """ Without KNN
            current_f1_score = new_f1_score
            for variant_instance in variant_trace_list[variant_flow]:
                cluster.append(variant_instance)
                log.remove(variant_instance)
            print(f'length of cluster: {len(cluster)}, log: {len(log)}')
            continue
            # """
        if current_f1_score > new_f1_score:
            if new_f1_score >= 0.9 and abs(current_f1_score - new_f1_score) <= 0.05:
                print(f'$$$$Did not improve the model but a close trace!$$$$: {trace}')
                current_f1_score = new_f1_score
                for index, variant_instance in enumerate(variant_trace_list[variant_flow]):
                    if index > 0:
                        cluster.append(variant_instance)
                        log.remove(variant_instance)
                print(f'length of cluster: {len(cluster)}, log: {len(log)}')
                # continue
            else:
                # variants_count_list.remove(variant)
                cluster.remove(trace)
                if len(cluster) >= minimum_cluster_size:
                    print(f'did not improve the model and enough traces!: {trace}')
                    final_cluster = cluster
                    break
                else:
                    print(f'did not improve the model and not enough traces!: {trace}')
                    print(f'length of cluster: {len(cluster)}, log: {len(log)}')
                    # continue
    print(f'length of cluster: {len(cluster)}, log: {len(log)}')
def apply_filter(req):
    sessions[req.session["id"]] = datetime.now()
    filters = {
        "time": True,
        "variants": True,
        "performance": True,
        "activities": True,
        "attribute": True
    }
    req.session.set_expiry(7200)
    # print(str(req.body))
    o = json.loads(req.body)
    print(str(o))
    custom_time_range = []
    for pair in o["filter1"]:
        # custom_time_range.append((dateutil.parser.parse(pair[0]), dateutil.parser.parse(pair[1])))
        custom_time_range.append((pair[0], pair[1]))
    if o["filter1"] == []:
        filters["time"] = False
    # print(o["filter1"][0])
    # print(custom_time_range[0][0])
    # print(custom_time_range)
    custom_path_range = []
    for pair in o["filter2"]:
        custom_path_range.append((float(pair[0]), float(pair[1])))
    if o["filter2"] == []:
        filters["variants"] = False
    # custom_path_range = [(0, 1)]  # filter2
    custom_performance_range = []
    for pair in o["filter3"]:
        custom_performance_range.append((float(pair[0]), float(pair[1])))
    if o["filter3"] == []:
        filters["performance"] = False
    custom_activitiy_range = []
    for pair in o["filter4"]:
        custom_activitiy_range.append((float(pair[0]), float(pair[1])))
    if o["filter4"] == []:
        filters["activities"] = False
    # custom_activitiy_range = [(0, 1)]  # filter3
    custom_attribute_range = []
    for pair in o["filter5"]:
        custom_attribute_range.append((float(pair[0]), float(pair[1])))
    if o["filter5"] == [] or o["filter5attribute"] == "Empty":
        filters["attribute"] = False
    additional_attribute = o["filter5attribute"]
    selected_viz = o["visualization"]
    calc_lev = o["distance"]
    # input_file = os.path.join("webapp", "static", req.session["id"] + "_l0.xes")
    input_file = os.path.join("webapp", "static", "sepsis.xes")
    input_log = xes_importer.apply(input_file)
    not_filtered_logs = {}
    flatten = lambda l: [item for sublist in l for item in sublist]
    time_timestamp_started = datetime.now()
    if filters["time"]:
        # TODO check overlapping for filter
        custom_time_range = sorted(custom_time_range, reverse=False)
        for i in range(0, len(custom_time_range) - 1):
            if custom_time_range[i][1] > custom_time_range[i + 1][0]:
                response = HttpResponse(json.dumps({'error': "Wrong intervals for time filter"}))
                response.status_code = 200
                return response
                # raise ValueError("Overlapping time ranges")
        logs = []
        for (x, y) in custom_time_range:
            logs.append(timestamp_filter.filter_traces_contained(input_log, x, y))
        # log = timestamp_filter.filter_traces_contained(input_log, custom_time_range[0][0], custom_time_range[0][1])
        log = pm4py.objects.log.log.EventLog()
        for timeslice in logs:
            for trace in timeslice:
                log.append(trace)
        print(len(input_log))
        print(len(log))
        # l2
        not_filtered_logs["timestamp_filter"] = pm4py.objects.log.log.EventLog()
        for trace in input_log:
            if trace not in log:
                not_filtered_logs["timestamp_filter"].append(trace)
        print(len(not_filtered_logs["timestamp_filter"]))
    else:
        log = input_log
    time_variants_started = datetime.now()  # where should I start?
if filters["variants"]: variants = variants_filter.get_variants(log) variants_count = case_statistics.get_variant_statistics(log) variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=False) custom_path_range = sorted(custom_path_range, reverse=False) # check overlapping for i in range(0,len(custom_path_range)-1): if(custom_path_range[i][1] > custom_path_range[i+1][0]): response = HttpResponse(json.dumps({'error': "Wrong intervals for variants filter"})) response.status_code = 200 return response #raise ValueError("Overlapping variants ranges") nr_variants = len(variants_count) custom_path_range * nr_variants idx = [(math.floor(x*nr_variants), math.ceil(y*nr_variants)) for (x,y) in custom_path_range] variants_subset = [variants_count[x:y+1] for (x,y) in idx] variants_subset = flatten(variants_subset) filtered_variants = {k:v for k,v in variants.items() if k in [x["variant"] for x in variants_subset]} #l2 not_filtered_variants = {k:v for k,v in variants.items() if k not in [x["variant"] for x in variants_subset]} filtered_log = variants_filter.apply(log, filtered_variants) #l2 not_filtered_logs["variant_filter"] = variants_filter.apply(log, not_filtered_variants) else: filtered_log = log time_variants_finished = datetime.now() # note: incl log2 generation if filters["performance"]: custom_performance_range = sorted(custom_performance_range, reverse=False) # check overlapping for i in range(0,len(custom_performance_range)-1): if(custom_performance_range[i][1] > custom_performance_range[i+1][0]): response = HttpResponse(json.dumps({'error': "Wrong intervals for performance filter"})) response.status_code = 200 return response #raise ValueError("Overlapping performance ranges") #all_case_durations = case_statistics.get_all_casedurations(log, parameters={case_statistics.Parameters.TIMESTAMP_KEY: "time:timestamp"}) #case_filter.filter_case_performance(log, 86400, 864000) performances = [] for i in range(len(filtered_log)): filtered_log[i].attributes["throughput"] = (max([event["time:timestamp"]for event in filtered_log[i]])-min([event["time:timestamp"] for event in filtered_log[i]])).total_seconds() performances.append(filtered_log[i].attributes["throughput"]) nr_cases = len(filtered_log) performances = sorted(performances, reverse=False) idx = [(math.floor(x*nr_cases), math.ceil(y*nr_cases)) for (x,y) in custom_performance_range] perf_subset = [performances[x:y+1] for (x,y) in idx] perf_subset = flatten(perf_subset) performance_log = pm4py.objects.log.log.EventLog([trace for trace in filtered_log if trace.attributes["throughput"] in perf_subset]) #l2 not_filtered_logs["performance_filter"] = pm4py.objects.log.log.EventLog([trace for trace in filtered_log if trace.attributes["throughput"] not in perf_subset]) #print(str(len(not_filtered_logs["performance_filter"]))) else: performance_log = filtered_log time_performance_finished = datetime.now() if filters["activities"]: variants = variants_filter.get_variants(performance_log) variants_count = case_statistics.get_variant_statistics(performance_log) variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=False) activities = dict() for variant in variants_count: for activity in variant["variant"].split(","): if (activity not in activities.keys()): activities[activity] = variant["count"] else: activities[activity] += variant["count"] sorted_activities = {k: v for k, v in sorted(activities.items(), key=lambda item: item[1])} activities_sorted_list = list(sorted_activities) custom_activitiy_range = 
sorted(custom_activitiy_range, reverse=False) # check overlapping for i in range(0,len(custom_activitiy_range)-1): if(custom_activitiy_range[i][1] > custom_activitiy_range[i+1][0]): response = HttpResponse(json.dumps({'error': "Wrong intervals for activities filter"})) response.status_code = 200 return response #raise ValueError("Overlapping activities ranges") nr_activities = len(activities_sorted_list) idx = [(math.floor(x*nr_activities), math.ceil(y*nr_activities)) for (x,y) in custom_activitiy_range] activities_to_keep = [activities_sorted_list[x:y+1] for (x,y) in idx] activities_to_keep = flatten(activities_to_keep) variants_idx = [] for i in range(len(variants_count)): for activity in activities_to_keep: if (activity in variants_count[i]["variant"].split(",") and (i not in variants_idx)): variants_idx.append(i) variants_subset = [variants_count[i] for i in variants_idx] filtered_variants = {k:v for k,v in variants.items() if k in [x["variant"] for x in variants_subset]} #l2 not_filtered_variants = {k:v for k,v in variants.items() if k not in [x["variant"] for x in variants_subset]} filtered_log = variants_filter.apply(performance_log, filtered_variants) #l2 not_filtered_logs["activities_filter"] = variants_filter.apply(performance_log, not_filtered_variants) new_log = pm4py.objects.log.log.EventLog() #not_filtered_logs["activities_filter_traces"] = pm4py.objects.log.log.EventLog() for trace in filtered_log: new_trace = pm4py.objects.log.log.Trace() not_new_trace = pm4py.objects.log.log.Trace() for event in trace: if(event['concept:name'] in activities_to_keep): new_trace.append(event) else: not_new_trace.append(event) if(len(new_trace)>0): new_log.append(new_trace) if(len(not_new_trace)>0): not_filtered_logs["activities_filter"].append(not_new_trace) else: new_log = performance_log time_activities_finished = datetime.now() if filters["attribute"]: custom_attribute_range = sorted(custom_attribute_range, reverse=False) # check overlapping for i in range(0,len(custom_attribute_range)-1): if(custom_attribute_range[i][1] > custom_attribute_range[i+1][0]): response = HttpResponse(json.dumps({'error': "Wrong intervals for additional attribute filter"})) response.status_code = 200 return response newest_log = pm4py.objects.log.log.EventLog() not_filtered_logs["additional_filter"] = pm4py.objects.log.log.EventLog() traces_with_attr = [] not_traces_with_attr = [] for trace in new_log: if additional_attribute in trace.attributes.keys(): traces_with_attr.append(trace) else: not_traces_with_attr.append(trace) #check if trace attribute if len(traces_with_attr)>0: #check if numeric if type(traces_with_attr[0].attributes[additional_attribute]) in [int, float]: for trace in traces_with_attr: if any([trace.attributes[additional_attribute] >= x and trace.attributes[additional_attribute] <= y for (x,y) in custom_attribute_range]): newest_log.append(trace) else: not_filtered_logs["additional_filter"].append(trace) for trace in not_traces_with_attr: not_filtered_logs["additional_filter"].append(trace) else: #string attribute_frequencies = dict() for trace in traces_with_attr: if trace.attributes[additional_attribute] not in attribute_frequencies.keys(): attribute_frequencies[trace.attributes[additional_attribute]] = 0 attribute_frequencies[trace.attributes[additional_attribute]] += 1 sorted_frequencies = {k: v for k, v in sorted(attribute_frequencies.items(), key=lambda item: item[1])} frequencies_sorted_list = list(sorted_frequencies) nr_values = len(frequencies_sorted_list) idx = 
[(math.floor(x*nr_values), math.ceil(y*nr_values)) for (x,y) in custom_attribute_range] values_to_keep = [frequencies_sorted_list[x:y+1] for (x,y) in idx] values_to_keep = flatten(values_to_keep) for trace in traces_with_attr: if trace.attributes[additional_attribute] in values_to_keep: newest_log.append(trace) else: not_filtered_logs["additional_filter"].append(trace) for trace in not_traces_with_attr: not_filtered_logs["additional_filter"].append(trace) else: #event attribute if [type(event[additional_attribute]) for trace in new_log for event in trace if additional_attribute in event.keys()][0] in [int, float]: for trace in new_log: new_trace = pm4py.objects.log.log.Trace() not_new_trace = pm4py.objects.log.log.Trace() for event in trace: if(additional_attribute in event.keys() and any([event[additional_attribute] >= x and event[additional_attribute] <= y for (x,y) in custom_attribute_range ])): new_trace.append(event) else: not_new_trace.append(event) if(len(new_trace)>0): newest_log.append(new_trace) if(len(not_new_trace)>0): not_filtered_logs["additional_filter"].append(not_new_trace) else: #string attribute_frequencies = dict() for trace in new_log: for event in trace: if additional_attribute in event.keys(): if event[additional_attribute] not in attribute_frequencies.keys(): attribute_frequencies[event[additional_attribute]] = 0 attribute_frequencies[event[additional_attribute]] += 1 sorted_frequencies = {k: v for k, v in sorted(attribute_frequencies.items(), key=lambda item: item[1])} frequencies_sorted_list = list(sorted_frequencies) nr_values = len(frequencies_sorted_list) idx = [(math.floor(x*nr_values), math.ceil(y*nr_values)) for (x,y) in custom_attribute_range] values_to_keep = [frequencies_sorted_list[x:y+1] for (x,y) in idx] values_to_keep = flatten(values_to_keep) for trace in new_log: new_trace = pm4py.objects.log.log.Trace() not_new_trace = pm4py.objects.log.log.Trace() for event in trace: if(additional_attribute in event.keys() and event[additional_attribute] in values_to_keep): new_trace.append(event) else: not_new_trace.append(event) if(len(new_trace)>0): newest_log.append(new_trace) if(len(not_new_trace)>0): not_filtered_logs["additional_filter"].append(not_new_trace) else: newest_log = new_log time_attribute_finished = datetime.now() if(selected_viz=="dfgf"): dfg = dfg_discovery.apply(newest_log) gviz = dfg_visualization.apply(dfg, log=newest_log, variant=dfg_visualization.Variants.FREQUENCY) dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png")) elif(selected_viz=="dfgp"): dfg = dfg_discovery.apply(newest_log) gviz = dfg_visualization.apply(dfg, log=newest_log, variant=dfg_visualization.Variants.PERFORMANCE) dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png")) else: heu_net = heuristics_miner.apply_heu(newest_log, parameters={"dependency_thresh": 0.99}) gviz = hn_vis_factory.apply(heu_net) hn_vis_factory.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l1.png")) xes_exporter.apply(newest_log, os.path.join("webapp","static", req.session["id"] + "_l1.xes")) #l2 not_filtered_log = pm4py.objects.log.log.EventLog() for part in not_filtered_logs.keys(): for trace in not_filtered_logs[part]: not_filtered_log.append(trace) if(selected_viz=="dfgf"): dfg = dfg_discovery.apply(not_filtered_log) gviz = dfg_visualization.apply(dfg, log=not_filtered_log, variant=dfg_visualization.Variants.FREQUENCY) dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + 
"_l2.png")) elif(selected_viz=="dfgp"): dfg = dfg_discovery.apply(not_filtered_log) gviz = dfg_visualization.apply(dfg, log=not_filtered_log, variant=dfg_visualization.Variants.PERFORMANCE) dfg_visualization.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png")) else: heu_net = heuristics_miner.apply_heu(not_filtered_log, parameters={"dependency_thresh": 0.99}) gviz = hn_vis_factory.apply(heu_net) hn_vis_factory.save(gviz, os.path.join("webapp","static", req.session["id"] + "_l2.png")) xes_exporter.apply(not_filtered_log, os.path.join("webapp","static", req.session["id"] + "_l2.xes")) if(calc_lev): lev_new = [0]*len(newest_log) for i in range(len(newest_log)): lev_new[i] = [hash(event['concept:name']) for event in newest_log[i]] lev_not = [0]*len(not_filtered_log) for i in range(len(not_filtered_log)): lev_not[i] = [hash(event['concept:name']) for event in not_filtered_log[i]] distances = [] for i in range(len(lev_new)): for j in range(len(lev_not)): distances.append(lev_dist(lev_new[i], lev_not[j])) lev_d = sum(distances)/len(distances) print("Levenshtein's distance: "+str(lev_d)) else: lev_d = "null" used_paths = 0 for lower, higher in custom_path_range: used_paths += round((higher-lower)*100) print(f"Using {used_paths}% of paths. {100-used_paths}% of paths are discarded.") print("Timestamp filter: {} seconds. \nVariants filter: {} seconds. \nPerformance filter: {} seconds. \nActivities filter: {} seconds. \nAttribute filter: {} seconds.".format((time_variants_started - time_timestamp_started).total_seconds(), (time_variants_finished - time_variants_started).total_seconds(), (time_performance_finished - time_variants_finished).total_seconds(), (time_activities_finished - time_performance_finished).total_seconds(), (time_attribute_finished - time_activities_finished).total_seconds())) response = HttpResponse(json.dumps({'time':(time_variants_started - time_timestamp_started).total_seconds(), 'variants':(time_variants_finished - time_variants_started).total_seconds(),'performance':(time_performance_finished - time_variants_finished).total_seconds(), 'activities':(time_activities_finished - time_performance_finished).total_seconds(), 'attribute':(time_attribute_finished - time_activities_finished).total_seconds(), 'traces':[len(newest_log), len(not_filtered_log)], 'distance':lev_d})) response.status_code = 200 return response
from pm4py.objects.log.importer.xes import factory as xes_import_factory
from pm4py.objects.log.exporter.xes import factory as xes_exporter
from pm4py.statistics.traces.log import case_statistics
from pm4py.algo.filtering.log.variants import variants_filter

K = [20]
for k in K:
    event_log = "Sepsis Cases - Event Log.xes"
    log = xes_import_factory.apply(event_log)
    var_with_count = case_statistics.get_variant_statistics(log)
    variants_count = sorted(var_with_count, key=lambda x: x['count'], reverse=True)
    to_filter = []
    count = 0
    for j in range(0, len(variants_count)):
        variant_dict = variants_count[j]
        if variant_dict["count"] < k:
            to_filter.append([variant_dict["variant"]])
        else:
            count += variant_dict["count"]
    for delete in to_filter:
        log = variants_filter.apply(log, delete, parameters={"positive": False})
    xes_exporter.export_log(log, "baseline" + "_" + str(k) + "-" + "Annonymity" + ".xes")
    print("baseline" + "_" + str(k) + "-" + "Annonymity" + ".xes" + " has been exported!")
diff_absolute = diffTwoMatrix(excellent_average, weak_average)
dfg_miner_time_diff_absolute = diffTwoMatrix(dfg_miner_excellent_dfg, dfg_miner_weak_dfg)

# Inductive Miner
from pm4py.algo.discovery.inductive import factory as inductive_miner
tree = inductive_miner.apply_tree(ex1_personal_log_1_converted)
from pm4py.visualization.process_tree import factory as pt_vis_factory
gviz = pt_vis_factory.apply(tree)
pt_vis_factory.view(gviz)

from pm4py.algo.discovery.inductive import factory as inductive_miner
net, initial_marking, final_marking = inductive_miner.apply(ex1_personal_log_1_converted)
from pm4py.visualization.petrinet import factory as pn_vis_factory
gviz = pn_vis_factory.apply(net, initial_marking, final_marking)
pn_vis_factory.view(gviz)

# variants
from pm4py.statistics.traces.log import case_statistics
var_with_count = case_statistics.get_variant_statistics(
    ex1_personal_log_1_converted, parameters={"max_variants_to_return": 5})