def getaverageduration2(log, logname, logtime):
    """
    Compute the average duration (in seconds) of each activity, measured as the
    mean time from each occurrence of the activity to the next event in the same trace.
    """
    activities = attributes_filter.get_attribute_values(log, logname)
    time = attributes_filter.get_attribute_values(log, logtime)
    variants = variants_filter.get_variants(log)
    timeList = []
    tracelist = []
    activitiesList = []
    for trace in activities:
        activitiesList.append(trace)
    # flatten all event timestamps, in trace order
    for trace in log:
        for event in trace:
            timeList.append(str(event[logtime]))
    # collect the activity sequence of every trace
    for trace in log:
        variantsList = []
        for event in trace:
            variantsList.append(event[logname])
        tracelist.append(variantsList)
    duration = []
    fmt = '%Y-%m-%d %H:%M:%S'
    for val in activitiesList:
        count = 0
        timeSum = 0
        header = 0  # offset of the current trace inside the flattened timeList
        for i in range(len(tracelist)):
            for j in range(len(tracelist[i])):
                # only occurrences that have a following event in the same trace
                if tracelist[i][j] == val and j != len(tracelist[i]) - 1:
                    end = timeList[header + j + 1][0:19]
                    start = timeList[header + j][0:19]
                    ts = dt.datetime.strptime(end, fmt) - dt.datetime.strptime(start, fmt)
                    timeSum += int(ts.total_seconds())
                    count += 1
            header = header + len(tracelist[i])
        if timeSum == 0:
            duration.append(0)
        else:
            duration.append(timeSum / count)
    return duration
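# Usage sketch for getaverageduration2 (illustrative, not part of the original code):
# assumes a pm4py event log imported with the XES factory importer used further below,
# with the standard keys "concept:name" and "time:timestamp"; the file path is an assumption.
def demo_getaverageduration2():
    import os
    from pm4py.objects.log.importer.xes import factory as xes_importer
    log = xes_importer.import_log(os.path.join("input_data", "running-example.xes"))
    durations = getaverageduration2(log, "concept:name", "time:timestamp")
    # durations is ordered like the dict returned by get_attribute_values
    for activity, seconds in zip(attributes_filter.get_attribute_values(log, "concept:name"), durations):
        print(activity, seconds)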
def dataPreprocess(log):
    """
    Transform every trace in the log file so that we have direct access to
    every event in a trace and its time. Also returns an array with the
    initial sequence of events in a trace that will be used later to create
    the pairs.
    """
    activities_all = log_attributes_filter.get_attribute_values(log, "concept:name")
    activities = list(activities_all.keys())
    dataVectors = []
    theIndex = []
    for trace in log:
        k = [0 for i in range(len(activities))]
        times = [[] for i in range(len(activities))]
        previousTime = trace.attributes["REG_DATE"]
        aIndex = []
        for index, event in enumerate(trace):
            indexActivity = activities.index(event["concept:name"])
            k[indexActivity] += 1
            times[indexActivity].append(event["time:timestamp"] - previousTime)
            aIndex.append([index, indexActivity, len(times[indexActivity])])
            previousTime = event["time:timestamp"]
        timesSeconds = [[i.total_seconds() for i in x] for x in times]
        dataVectors.append(timesSeconds)
        theIndex.append(aIndex)
    return dataVectors, theIndex
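# Usage sketch for dataPreprocess (illustrative): assumes a log whose traces carry a
# "REG_DATE" attribute, as the function requires; "financial_log.xes" is a hypothetical path.
def demo_dataPreprocess():
    from pm4py.objects.log.importer.xes import factory as xes_importer
    log = xes_importer.import_log("financial_log.xes")
    dataVectors, theIndex = dataPreprocess(log)
    # dataVectors[t][a] holds the durations (seconds) of activity a inside trace t;
    # theIndex[t] keeps the original event order as [position, activityIndex, occurrence]
    print(len(dataVectors), len(theIndex))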
def readFromFile(log):
    """
    Read the distribution evaluation from a file. Used when the experiments
    have already been run, to save time.
    """
    dists = []
    with open("distributions.txt", "r") as f:
        for line in f:
            dists.append(line.split(", ")[1:-1])
    # split every "name-rmse-r2" entry; negative R2 values produce an extra
    # empty field ("name-rmse--r2"), which is repaired below
    distributions = []
    for index, d in enumerate(dists):
        distributions.append([])
        for i in d:
            k = i.split("-")
            if len(k) == 4:
                k.remove("")
                k[2] = "-" + k[2]
            distributions[index].append(k)
    p = [[[i[0], float(i[1]), float(i[2])] for i in dist] for dist in distributions]
    pSorted = [[sorted(i, key=lambda x: x[2], reverse=True)] for i in p]
    oneDist = [i[0][0] for i in pSorted]  # best-fitting distribution per activity (highest R2)
    distributionsDF = pd.DataFrame()
    activities_all = log_attributes_filter.get_attribute_values(log, "concept:name")
    activities = list(activities_all.keys())
    distributionsDF["Activity_Name"] = activities
    distributionsDF["Distribution"] = [i[0] for i in oneDist]
    distributionsDF["RMSE"] = [i[1] for i in oneDist]
    distributionsDF["R2"] = [i[2] for i in oneDist]
    return distributionsDF
def dataPreprocessPerActivity(log):
    activities_all = log_attributes_filter.get_attribute_values(log, "concept:name")
    activities = list(activities_all.keys())
    dataVectors = []
    for traceIndex, trace in enumerate(log):
        k = [0 for i in range(len(activities))]
        times = [[] for i in range(len(activities))]
        previousTime = trace.attributes["REG_DATE"]
        for index, event in enumerate(trace):
            indexActivity = activities.index(event["concept:name"])
            k[indexActivity] += 1
            times[indexActivity].append([traceIndex, event["time:timestamp"] - previousTime])
            previousTime = event["time:timestamp"]
        timesSeconds = [[[i[0], i[1].total_seconds()] for i in x] for x in times]
        dataVectors.append(timesSeconds)
    # transpose dataVectors: group the [traceIndex, duration] pairs per activity
    transposedDataVectors = [[dataVector[index] for dataVector in dataVectors if dataVector[index] != []]
                             for index in range(len(dataVectors[0]))]
    return [[event for trace in dataVector for event in trace] for dataVector in transposedDataVectors]
def execute_script(variant="frequency"):
    # read the log using the nonstandard importer (faster)
    log_path = os.path.join("..", "tests", "input_data", "receipt.xes")
    log = xes_importer.import_log(log_path, variant="nonstandard")
    # apply the Inductive Miner on the log
    net, initial_marking, final_marking = inductive_miner.apply(log)
    # find the shortest paths in the net
    spaths = get_shortest_paths(net)
    # then we start to decorate the net:
    # we decide whether to decorate it with frequency or performance,
    # and we choose the aggregation measure (sum, min, max, mean, median, stdev)
    aggregation_measure = "mean"
    if variant == "frequency":
        aggregation_measure = "sum"
    # we find the DFG
    dfg = dfg_factory.apply(log, variant=variant)
    # we find the number of activity occurrences in the log
    activities_count = attributes_filter.get_attribute_values(log, "concept:name")
    # we calculate the statistics on the Petri net applying the greedy algorithm
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(net, dfg, spaths, activities_count,
                                                                      variant=variant,
                                                                      aggregation_measure=aggregation_measure)
    # we build the gviz object
    gviz = pn_vis_factory.apply(net, initial_marking, final_marking, variant=variant,
                                aggregated_statistics=aggregated_statistics, parameters={"format": "svg"})
    # we show the visualization on screen
    pn_vis_factory.view(gviz)
def get_activities_list(log, parameters=None):
    """
    Gets the activities list from a log object, sorted by activity name

    Parameters
    --------------
    log
        Log
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    activities_list
        List of activities sorted by activity name
    """
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    if type(log) is pd.DataFrame:
        activities = pd_attributes_filter.get_attribute_values(log, activity_key)
    else:
        activities = log_attributes_filter.get_attribute_values(log, activity_key)
    return sorted(list(activities.keys()))
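# Usage sketch for get_activities_list (illustrative): shows how the activity key
# can be overridden through the parameters dictionary; works on both a pm4py log
# and a pandas DataFrame, as the type check above allows.
def demo_get_activities_list(log):
    default_names = get_activities_list(log)
    custom_names = get_activities_list(log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: "concept:name"})
    assert default_names == custom_names  # "concept:name" is the default key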
def apply(dfg, log=None, parameters=None, activities_count=None, measure="frequency"):
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    image_format = "png"
    max_no_of_edges_in_diagram = 75
    if "format" in parameters:
        image_format = parameters["format"]
    if "maxNoOfEdgesInDiagram" in parameters:
        max_no_of_edges_in_diagram = parameters["maxNoOfEdgesInDiagram"]
    if activities_count is None:
        activities_count = attributes_filter.get_attribute_values(log, activity_key, parameters=parameters)
    return graphviz_visualization(activities_count, dfg, image_format=image_format, measure=measure,
                                  max_no_of_edges_in_diagram=max_no_of_edges_in_diagram)
def activities(log):
    activities = attributes_filter.get_attribute_values(log, "concept:name")
    n_unique_activities = len(activities)
    activities_occurrences = list(activities.values())
    activities_min = np.min(activities_occurrences)
    activities_max = np.max(activities_occurrences)
    activities_mean = np.mean(activities_occurrences)
    activities_median = np.median(activities_occurrences)
    activities_std = np.std(activities_occurrences)
    activities_variance = np.var(activities_occurrences)
    activities_q1 = np.percentile(activities_occurrences, 25)
    activities_q3 = np.percentile(activities_occurrences, 75)
    activities_iqr = stats.iqr(activities_occurrences)
    activities_skewness = stats.skew(activities_occurrences)
    activities_kurtosis = stats.kurtosis(activities_occurrences)
    return [
        n_unique_activities,
        activities_min,
        activities_max,
        activities_mean,
        activities_median,
        activities_std,
        activities_variance,
        activities_q1,
        activities_q3,
        activities_iqr,
        activities_skewness,
        activities_kurtosis,
    ]
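# Usage sketch for activities (illustrative): pairs the returned feature vector
# with readable labels; the label list mirrors the return order of the function above.
def demo_activities(log):
    labels = ["n_unique", "min", "max", "mean", "median", "std", "variance",
              "q1", "q3", "iqr", "skewness", "kurtosis"]
    for label, value in zip(labels, activities(log)):
        print(label, value)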
def get_activities(data):
    """
    Filters the event log to return only the activity names.

    :param data: event log
    :return: sorted list of event log activities
    """
    return list(sorted(attributes_filter.get_attribute_values(data, "concept:name").keys()))
def filterfile(sourceFile, outputFile, patternText, inclusive):
    log = importer.apply(sourceFile)
    activities = attributes_filter.get_attribute_values(log, CONCEPT_NAME)
    filteredLog = attributes_filter.apply(log, [patternText],
                                          parameters={
                                              attributes_filter.Parameters.ATTRIBUTE_KEY: CONCEPT_NAME,
                                              attributes_filter.Parameters.POSITIVE: inclusive
                                          })
    # export the filtered log
    xes_exporter.apply(filteredLog, outputFile)
def stats(log):
    activities = list([i for i in attributes_filter.get_attribute_values(log, "concept:name")])
    times = [[0 for _ in range(len(activities))] for _ in range(len(log))]
    for index_t, trace in enumerate(log):
        previous_time = trace.attributes["REG_DATE"]
        for index, event in enumerate(trace):
            time = event["time:timestamp"]
            duration = time - previous_time
            times[index_t][activities.index(event["concept:name"])] += duration.total_seconds()
            previous_time = time  # measure the next duration from this event
    return times
def apply(log, parameters):
    """
    Apply the IMDF algorithm to a log, obtaining a Petri net along with an
    initial and final marking

    Parameters
    -----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
        pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY -> attribute of the log to use as activity name
        (default concept:name)

    Returns
    -----------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY]
    activity_key = parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY]
    # get the DFG
    dfg = [(k, v) for k, v in dfg_inst.apply(log, parameters={
        pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key}).items() if v > 0]
    # get the activities in the log
    activities = attributes_filter.get_attribute_values(log, activity_key)
    # gets the start activities from the log
    start_activities = list(start_activities_filter.get_start_activities(log, parameters=parameters).keys())
    # gets the end activities from the log
    end_activities = list(end_activities_filter.get_end_activities(log, parameters=parameters).keys())
    # check if the log contains empty traces
    contains_empty_traces = False
    traces_length = [len(trace) for trace in log]
    if traces_length:
        contains_empty_traces = min(traces_length) == 0
    net, initial_marking, final_marking = apply_dfg(dfg, parameters=parameters, activities=activities,
                                                    contains_empty_traces=contains_empty_traces,
                                                    start_activities=start_activities,
                                                    end_activities=end_activities)
    return net, initial_marking, final_marking
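# Usage sketch for the IMDF apply function defined above (illustrative): imports a log
# with the XES factory importer used elsewhere in this file and discovers a Petri net;
# the petrinet visualization factory import mirrors the pn_vis_factory usage above.
def demo_imdf_apply():
    import os
    from pm4py.objects.log.importer.xes import factory as xes_importer
    from pm4py.visualization.petrinet import factory as pn_vis_factory
    log = xes_importer.import_log(os.path.join("input_data", "running-example.xes"))
    net, initial_marking, final_marking = apply(log, parameters={})
    gviz = pn_vis_factory.apply(net, initial_marking, final_marking)
    pn_vis_factory.view(gviz)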
def preprocess(log):
    """
    Transform every trace in the log file into an array that gives easy
    access to the times of every event in a trace and to the sequence of
    these events. Also uses standardization to transform the time values
    per activity.
    """
    activities_all = log_attributes_filter.get_attribute_values(log, "concept:name")
    activities = list(activities_all.keys())
    dataVectors = []
    sequentialData = [[] for i in range(len(log))]
    for outerIndex, trace in enumerate(log):
        times = [[] for i in range(len(activities))]
        previousTime = trace.attributes["REG_DATE"]
        for index, event in enumerate(trace):
            indexActivity = activities.index(event["concept:name"])
            time = event["time:timestamp"] - previousTime
            times[indexActivity].append(time)
            # record the sequential [activity, duration] pair for this event
            sequentialData[outerIndex].append([indexActivity, time.total_seconds()])
            previousTime = event["time:timestamp"]
        timesSeconds = [[i.total_seconds() for i in x] for x in times]
        dataVectors.append(timesSeconds)
    # transform dataVectors to contain times per activity
    timesPerActivity = [[k for i in [x[index] for x in dataVectors] for k in i]
                        for index in range(len(dataVectors[0]))]
    # standard scalers: one scaler fitted per activity over all of its times
    standarScalers = []
    for index, i in enumerate(timesPerActivity):
        sc = StandardScaler()
        numpyArray = np.array(i)
        numpyArray = numpyArray.reshape(-1, 1)
        sc.fit(numpyArray)  # fit to all of the times spent on this activity
        standarScalers.append(sc)
    # create pairwise data: [traceIndex, activityA, activityB, standardizedTimeA, standardizedTimeB, eventIndex]
    data = []
    for traceIndex, trace in enumerate(sequentialData):
        for eventIndex, event in enumerate(trace[:-1]):
            eventNext = sequentialData[traceIndex][eventIndex + 1]
            timeA = standarScalers[event[0]].transform(np.array(event[1]).reshape(1, -1))
            timeB = standarScalers[eventNext[0]].transform(np.array(eventNext[1]).reshape(1, -1))
            data.append([traceIndex, event[0], eventNext[0],
                         round(float(timeA), 5), round(float(timeB), 5), eventIndex])
    return data
def apply(log, parameters=None):
    """
    Gets the performance HNet

    Parameters
    ------------
    log
        Log
    parameters
        Parameters of the algorithm

    Returns
    ------------
    base64
        Base64 of an SVG representing the model
    model
        Text representation of the model
    format
        Format of the model
    """
    if parameters is None:
        parameters = {}
    decreasingFactor = parameters[
        "decreasingFactor"] if "decreasingFactor" in parameters else constants.DEFAULT_DEC_FACTOR
    activity_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    log = attributes_filter.filter_log_on_max_no_activities(log, max_no_activities=constants.MAX_NO_ACTIVITIES,
                                                            parameters=parameters)
    filtered_log = auto_filter.apply_auto_filter(log, parameters=parameters)
    activities_count = attributes_filter.get_attribute_values(filtered_log, activity_key)
    start_activities_count = start_activities_filter.get_start_activities(filtered_log, parameters=parameters)
    end_activities_count = end_activities_filter.get_end_activities(filtered_log, parameters=parameters)
    activities = list(activities_count.keys())
    start_activities = list(start_activities_count.keys())
    end_activities = list(end_activities_count.keys())
    dfg_freq = dfg_factory.apply(filtered_log, parameters=parameters)
    dfg_perf = dfg_factory.apply(filtered_log, variant="performance", parameters=parameters)
    heu_net = HeuristicsNet(dfg_freq, performance_dfg=dfg_perf, activities=activities,
                            start_activities=start_activities, end_activities=end_activities,
                            activities_occurrences=activities_count)
    heu_net.calculate(dfg_pre_cleaning_noise_thresh=constants.DEFAULT_DFG_CLEAN_MULTIPLIER * decreasingFactor)
    vis = heu_vis_factory.apply(heu_net, parameters={"format": "svg"})
    vis2 = heu_vis_factory.apply(heu_net, parameters={"format": "dot"})
    gviz_base64 = get_base64_from_file(vis2.name)
    return get_base64_from_file(vis.name), None, "", "xes", activities, start_activities, end_activities, \
        gviz_base64, [], "heuristics", "perf", None, "", activity_key
def dataSequence(log):
    activities = log_attributes_filter.get_attribute_values(log, "concept:name")
    letters = [i for i in "abcdefghijklmnopqrstuvwxyz"]
    bag = [[] for i in range(2)]
    for i in activities:
        bag[0].append(i)
    for index, activity in enumerate(activities):
        bag[1].append(getActivityLetter(index, letters))
    response = []
    for trace in log:
        response.append(transformAtrace(trace, bag))
    return response, bag
def test_dfdoc1(self):
    # to avoid static method warnings in tests,
    # which by construction of the unittest package have to be expressed in such a way
    self.dummy_variable = "dummy_value"
    from pm4py.objects.log.importer.xes import factory as xes_importer
    log = xes_importer.import_log(os.path.join("input_data", "running-example.xes"))
    from pm4py.algo.discovery.dfg import factory as dfg_factory
    dfg = dfg_factory.apply(log)
    from pm4py.algo.filtering.log.attributes import attributes_filter
    activities_count = attributes_filter.get_attribute_values(log, "concept:name")
    from pm4py.visualization.dfg.versions import simple_visualize as dfg_visualize
    gviz = dfg_visualize.graphviz_visualization(activities_count, dfg)
    del gviz
def getaverageduration3(log, logname, logtime, logstti, logcoti):
    timedict = {}
    durationdict = {}
    activities = attributes_filter.get_attribute_values(log, logname)
    fmt = '%Y-%m-%d %H:%M:%S'
    duration = []
    for trace in log:
        for event in trace:
            # per-event duration: completion timestamp minus start timestamp
            end = str(event[logcoti])[0:19]
            start = str(event[logstti])[0:19]
            ts = dt.datetime.strptime(end, fmt) - dt.datetime.strptime(start, fmt)
            if event[logname] not in durationdict.keys():
                durationdict[event[logname]] = [int(ts.total_seconds())]
            else:
                durationdict[event[logname]].append(int(ts.total_seconds()))
            if event[logname] not in timedict.keys():
                timedict[event[logname]] = (int(ts.total_seconds()), 1)
            else:
                timedict[event[logname]] = (timedict[event[logname]][0] + int(ts.total_seconds()),
                                            timedict[event[logname]][1] + 1)
    # average duration per activity
    for key in timedict.keys():
        timedict[key] = timedict[key][0] / timedict[key][1]
    for ele in activities:
        duration.append(timedict[ele])
    # population standard deviation per activity
    deviationlist = []
    for key in durationdict.keys():
        literal = 0
        for ele in durationdict[key]:
            literal += pow(timedict[key] - ele, 2)
        deviation = pow(literal / len(durationdict[key]), 1 / 2)
        deviationlist.append((key, deviation))
    return duration
def apply_heu(log, parameters=None):
    """
    Discovers a Heuristics Net using the Heuristics Miner

    Parameters
    ------------
    log
        Event log
    parameters
        Possible parameters of the algorithm, including:
        activity_key, case_id_glue, timestamp_key, dependency_thresh, and_measure_thresh,
        min_act_count, min_dfg_occurrences, dfg_pre_cleaning_noise_thresh, loops_length_two_thresh

    Returns
    ------------
    heu
        Heuristics Net
    """
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    start_activities = log_sa_filter.get_start_activities(log, parameters=parameters)
    end_activities = log_ea_filter.get_end_activities(log, parameters=parameters)
    activities_occurrences = log_attributes.get_attribute_values(log, activity_key, parameters=parameters)
    activities = list(activities_occurrences.keys())
    dfg = dfg_factory.apply(log, parameters=parameters)
    parameters_w2 = deepcopy(parameters)
    parameters_w2["window"] = 2
    dfg_window_2 = dfg_factory.apply(log, parameters=parameters_w2)
    freq_triples = dfg_factory.apply(log, parameters=parameters, variant="freq_triples")
    return apply_heu_dfg(dfg, activities=activities, activities_occurrences=activities_occurrences,
                         start_activities=start_activities, end_activities=end_activities,
                         dfg_window_2=dfg_window_2, freq_triples=freq_triples, parameters=parameters)
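# Usage sketch for apply_heu (illustrative): discovers a heuristics net and renders it
# with heu_vis_factory, mirroring its usage above; the view call is an assumption that
# mirrors pn_vis_factory.view earlier in this file, and the threshold value is arbitrary.
def demo_apply_heu(log):
    heu_net = apply_heu(log, parameters={"dependency_thresh": 0.5})
    vis = heu_vis_factory.apply(heu_net, parameters={"format": "svg"})
    heu_vis_factory.view(vis)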
def execFreq(clusters, activityKey):
    EF = []
    for i in range(len(clusters)):
        activities_count = attributes_filter.get_attribute_values(clusters[i], attribute_key=activityKey)
        EF.append(activities_count)
    EF_df = pd.DataFrame.from_dict(EF, orient='columns', dtype=None).T
    EF_df = EF_df.reset_index().melt(id_vars='index', var_name='cluster', value_name='activityCount')
    EF_df = EF_df.fillna(0)
    EF_df = EF_df.rename(columns={'index': 'activity'})
    ############## Execution Frequency: case ############
    EF_EFc = execFreqCase(clusters, EF_df)
    return EF_EFc
def transformTraces(log: EventLog) -> list:
    activities = attributes_filter.get_attribute_values(log, "concept:name")
    activity_names = [i for i in activities]
    data, data_durations = mean_value_per_Activity(log)
    log_list = []
    for n_trace, trace in enumerate(log):
        l_trace = [0 for i in range(len(activity_names))]
        times = [0 for i in range(len(activity_names))]
        for n_event, event in enumerate(trace):
            index = activity_names.index(event["concept:name"])
            l_trace[index] += data_durations[n_trace][n_event]
            times[index] += 1
        # mean duration per activity within the trace
        l_trace = [x / y if y != 0 else 0 for x, y in zip(l_trace, times)]
        log_list.append(l_trace)
    # z-score normalization per activity across traces
    means, stdevs = meanAndstdev(data, activity_names)
    log_list = [[(x - y) / z if z != 0 else 0 for x, y, z in zip(l, means, stdevs)] for l in log_list]
    return log_list
def apply_tree(log, parameters):
    """
    Apply the IMDF algorithm to a log obtaining a process tree

    Parameters
    ----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
        pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY -> attribute of the log to use as activity name
        (default concept:name)

    Returns
    ----------
    tree
        Process tree
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    activity_key = parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY]
    # get the DFG
    dfg = [(k, v) for k, v in dfg_inst.apply(log, parameters={
        pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key}).items() if v > 0]
    # get the activities in the log
    activities = attributes_filter.get_attribute_values(log, activity_key)
    # check if the log contains empty traces
    contains_empty_traces = False
    traces_length = [len(trace) for trace in log]
    if traces_length:
        contains_empty_traces = min(traces_length) == 0
    return apply_tree_dfg(dfg, parameters, activities=activities, contains_empty_traces=contains_empty_traces)
def get_attribute_values(self, attribute_key, parameters=None):
    """
    Gets the attribute values from the log

    Returns
    -------------
    attribute_values
        List of values
    """
    if parameters is None:
        parameters = {}
    parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = self.activity_key
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = attribute_key
    initial_dict = attributes_filter.get_attribute_values(self.log, attribute_key, parameters=parameters)
    return_dict = {}
    for key in initial_dict:
        return_dict[str(key)] = int(initial_dict[key])
    return return_dict
def dfg_vis(dfg, log=None, parameters=None, activities_count=None, measure="frequency"):
    if parameters is None:
        parameters = {}
    activity_key = (parameters[PARAMETER_CONSTANT_ACTIVITY_KEY]
                    if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY)
    max_no_of_edges_in_diagram = 75
    if "maxNoOfEdgesInDiagram" in parameters:
        max_no_of_edges_in_diagram = parameters["maxNoOfEdgesInDiagram"]
    start_activities = parameters["start_activities"] if "start_activities" in parameters else []
    end_activities = parameters["end_activities"] if "end_activities" in parameters else []
    if activities_count is None:
        if log is not None:
            activities_count = attributes_filter.get_attribute_values(log, activity_key, parameters=parameters)
            activities_count["start"] = len(log)
        else:
            activities = dfg_utils.get_activities_from_dfg(dfg)
            activities_count = {key: 1 for key in activities}
            activities_count["start"] = None
    return graphviz_visualization(activities_count, dfg, measure=measure,
                                  max_no_of_edges_in_diagram=max_no_of_edges_in_diagram,
                                  start_activities=start_activities, end_activities=end_activities)
def dataPreprocess(log):
    """
    In this function, data from the log will be transformed into a vector
    """
    activities_all = log_attributes_filter.get_attribute_values(log, "concept:name")
    activities = list(activities_all.keys())
    dataVectors = []
    times = [[] for i in range(len(activities))]
    for trace in log:
        activitiesCounter = [0 for i in range(len(activities))]
        timesSpend = [datetime.timedelta(0) for i in range(len(activities))]
        previousTime = trace.attributes["REG_DATE"]
        for index, event in enumerate(trace):
            indexActivity = activities.index(event["concept:name"])
            activitiesCounter[indexActivity] += 1
            timesSpend[indexActivity] += event["time:timestamp"] - previousTime
            times[indexActivity].append(event["time:timestamp"] - previousTime)
            previousTime = event["time:timestamp"]
        # timesSpend now contains the mean duration (seconds) of each activity in this trace
        timesSpend = [(timesSpend[i] / activitiesCounter[i]).total_seconds() if activitiesCounter[i] != 0 else 0
                      for i in range(len(activities))]
        dataVectors.append(activitiesCounter + timesSpend)
    return dataVectors, times, activities
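# Usage sketch for dataPreprocess (illustrative): feeds the per-trace vectors
# (activity counts plus mean durations) into scikit-learn's KMeans; the choice
# of 3 clusters is arbitrary and scikit-learn is assumed to be installed.
def demo_cluster_traces(log):
    from sklearn.cluster import KMeans
    dataVectors, times, activities = dataPreprocess(log)
    labels = KMeans(n_clusters=3).fit_predict(dataVectors)
    print(labels[:10])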
def get_log_match_with_model(log, bpmn_graph, parameters=None):
    """
    Get log match with model

    Parameters
    ------------
    log
        Trace log
    bpmn_graph
        BPMN graph
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    model_to_log
        Correspondence between model activities and log activities
    log_to_model
        Correspondence between log activities and model activities
    """
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    model_to_log = {}
    log_to_model = {}
    log_activities = list(attributes_filter.get_attribute_values(log, activity_key).keys())
    nodes = bpmn_graph.diagram_graph.nodes
    bpmn_activities = list([nodes[n]["node_name"] for n in nodes if "task" in nodes[n]["type"].lower()])
    for act in bpmn_activities:
        close_matches = difflib.get_close_matches(act, log_activities)
        if close_matches and close_matches[0] not in log_to_model:
            model_to_log[act] = close_matches[0]
            log_to_model[close_matches[0]] = act
    return model_to_log, log_to_model
def dataPreprocess2017(log):
    """
    Takes the log file and transforms every trace so that we keep the
    information about the time per event and also the original sequence of
    events in the same trace
    """
    activities_all = log_attributes_filter.get_attribute_values(log, "concept:name")
    activities = list(activities_all.keys())
    times = [[] for i in range(len(activities))]
    sequence = []
    for indexTrace, trace in enumerate(log):
        previousTime = trace[0]['time:timestamp']
        sequence.append([])
        for index, event in enumerate(trace):
            indexActivity = activities.index(event["concept:name"])
            time = event["time:timestamp"] - previousTime
            times[indexActivity].append([indexTrace, index, time.total_seconds()])
            previousTime = event["time:timestamp"]
            sequence[-1].append([indexActivity, time.total_seconds()])
    return times, sequence
def form_encoding_dictio_from_log(log, parameters=None):
    """
    Forms the encoding dictionary from the current log

    Parameters
    -------------
    log
        Event log
    parameters
        Parameters of the algorithm

    Returns
    -------------
    encoding_dictio
        Encoding dictionary
    """
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    from pm4py.algo.filtering.log.attributes import attributes_filter
    shared_obj = SharedObj()
    activities = attributes_filter.get_attribute_values(log, activity_key, parameters=parameters)
    mapping = {}
    for act in activities:
        get_new_char(act, shared_obj)
        mapping[act] = shared_obj.mapping_dictio[act]
    return mapping
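# Usage sketch for form_encoding_dictio_from_log (illustrative): encodes every trace
# as a string of single characters, which is useful e.g. for sequence comparison.
def demo_encode_traces(log):
    mapping = form_encoding_dictio_from_log(log)
    encoded = ["".join(mapping[event["concept:name"]] for event in trace) for trace in log]
    print(encoded[:5])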
def apply_fall_through(self, parameters=None):
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              pmutil.xes_constants.DEFAULT_NAME_KEY)
    # set flags for fall_throughs, base case is True (enabled)
    use_empty_trace = (Parameters.EMPTY_TRACE_KEY not in parameters) or parameters[Parameters.EMPTY_TRACE_KEY]
    use_act_once_per_trace = (Parameters.ONCE_PER_TRACE_KEY not in parameters) or parameters[
        Parameters.ONCE_PER_TRACE_KEY]
    use_act_concurrent = (Parameters.CONCURRENT_KEY not in parameters) or parameters[Parameters.CONCURRENT_KEY]
    use_strict_tau_loop = (Parameters.STRICT_TAU_LOOP_KEY not in parameters) or parameters[
        Parameters.STRICT_TAU_LOOP_KEY]
    use_tau_loop = (Parameters.TAU_LOOP_KEY not in parameters) or parameters[Parameters.TAU_LOOP_KEY]
    if use_empty_trace:
        empty_trace, new_log = fall_through.empty_trace(self.log)
        # if an empty trace is found, the empty trace fallthrough applies
    else:
        empty_trace = False
    if empty_trace:
        logging.debug("empty_trace")
        activites_left = []
        for trace in new_log:
            for act in trace:
                if act[activity_key] not in activites_left:
                    activites_left.append(act[activity_key])
        self.detected_cut = 'empty_trace'
        new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items() if v > 0]
        activities = attributes_filter.get_attribute_values(new_log, activity_key)
        start_activities = list(
            start_activities_filter.get_start_activities(new_log, parameters=self.parameters).keys())
        end_activities = list(
            end_activities_filter.get_end_activities(new_log, parameters=self.parameters).keys())
        self.children.append(
            SubtreePlain(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts,
                         self.rec_depth + 1, noise_threshold=self.noise_threshold,
                         start_activities=start_activities, end_activities=end_activities,
                         initial_start_activities=self.initial_start_activities,
                         initial_end_activities=self.initial_end_activities, parameters=parameters))
    else:
        if use_act_once_per_trace:
            activity_once, new_log, small_log = fall_through.act_once_per_trace(self.log, self.activities,
                                                                                activity_key)
            small_log = filtering_utils.keep_one_trace_per_variant(small_log, parameters=parameters)
        else:
            activity_once = False
        if use_act_once_per_trace and activity_once:
            self.detected_cut = 'parallel'
            # create two new dfgs as we need them to append to self.children later
            new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items() if v > 0]
            activities = attributes_filter.get_attribute_values(new_log, activity_key)
            small_dfg = [(k, v) for k, v in dfg_inst.apply(small_log, parameters=parameters).items() if v > 0]
            small_activities = attributes_filter.get_attribute_values(small_log, activity_key)
            self.children.append(
                SubtreePlain(small_log, small_dfg, self.master_dfg, self.initial_dfg, small_activities,
                             self.counts, self.rec_depth + 1, noise_threshold=self.noise_threshold,
                             initial_start_activities=self.initial_start_activities,
                             initial_end_activities=self.initial_end_activities, parameters=parameters))
            # continue with the recursion on the new log_skeleton
            start_activities = list(
                start_activities_filter.get_start_activities(new_log, parameters=self.parameters).keys())
            end_activities = list(
                end_activities_filter.get_end_activities(new_log, parameters=self.parameters).keys())
            self.children.append(
                SubtreePlain(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts,
                             self.rec_depth + 1, noise_threshold=self.noise_threshold,
                             start_activities=start_activities, end_activities=end_activities,
                             initial_start_activities=self.initial_start_activities,
                             initial_end_activities=self.initial_end_activities, parameters=parameters))
        else:
            if use_act_concurrent:
                activity_concurrent, new_log, small_log, activity_left_out = fall_through.activity_concurrent(
                    self, self.log, self.activities, activity_key, parameters=parameters)
                small_log = filtering_utils.keep_one_trace_per_variant(small_log, parameters=parameters)
            else:
                activity_concurrent = False
            if use_act_concurrent and activity_concurrent:
                self.detected_cut = 'parallel'
                # create two new dfgs to append later
                new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items() if v > 0]
                activities = attributes_filter.get_attribute_values(new_log, activity_key)
                small_dfg = [(k, v) for k, v in dfg_inst.apply(small_log, parameters=parameters).items() if
                             v > 0]
                small_activities = attributes_filter.get_attribute_values(small_log, activity_key)
                # append the concurrent activity as leaf:
                self.children.append(
                    SubtreePlain(small_log, small_dfg, self.master_dfg, self.initial_dfg, small_activities,
                                 self.counts, self.rec_depth + 1, noise_threshold=self.noise_threshold,
                                 initial_start_activities=self.initial_start_activities,
                                 initial_end_activities=self.initial_end_activities, parameters=parameters))
                # continue with the recursion on the new log_skeleton:
                start_activities = list(
                    start_activities_filter.get_start_activities(new_log, parameters=self.parameters).keys())
                end_activities = list(
                    end_activities_filter.get_end_activities(new_log, parameters=self.parameters).keys())
                self.children.append(
                    SubtreePlain(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                 self.counts, self.rec_depth + 1, noise_threshold=self.noise_threshold,
                                 start_activities=start_activities, end_activities=end_activities,
                                 initial_start_activities=self.initial_start_activities,
                                 initial_end_activities=self.initial_end_activities, parameters=parameters))
            else:
                if use_strict_tau_loop:
                    strict_tau_loop, new_log = fall_through.strict_tau_loop(self.log, self.start_activities,
                                                                            self.end_activities, activity_key)
                    new_log = filtering_utils.keep_one_trace_per_variant(new_log, parameters=parameters)
                else:
                    strict_tau_loop = False
                if use_strict_tau_loop and strict_tau_loop:
                    activites_left = []
                    for trace in new_log:
                        for act in trace:
                            if act[activity_key] not in activites_left:
                                activites_left.append(act[activity_key])
                    self.detected_cut = 'strict_tau_loop'
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items() if
                               v > 0]
                    activities = attributes_filter.get_attribute_values(new_log, activity_key)
                    start_activities = list(
                        start_activities_filter.get_start_activities(new_log,
                                                                     parameters=self.parameters).keys())
                    end_activities = list(
                        end_activities_filter.get_end_activities(new_log, parameters=self.parameters).keys())
                    self.children.append(
                        SubtreePlain(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                     self.counts, self.rec_depth + 1, noise_threshold=self.noise_threshold,
                                     start_activities=start_activities, end_activities=end_activities,
                                     initial_start_activities=self.initial_start_activities,
                                     initial_end_activities=self.initial_end_activities,
                                     parameters=parameters))
                else:
                    if use_tau_loop:
                        tau_loop, new_log = fall_through.tau_loop(self.log, self.start_activities, activity_key)
                        new_log = filtering_utils.keep_one_trace_per_variant(new_log, parameters=parameters)
                    else:
                        tau_loop = False
                    if use_tau_loop and tau_loop:
                        activites_left = []
                        for trace in new_log:
                            for act in trace:
                                if act[activity_key] not in activites_left:
                                    activites_left.append(act[activity_key])
                        self.detected_cut = 'tau_loop'
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(new_log, parameters=parameters).items() if
                                   v > 0]
                        activities = attributes_filter.get_attribute_values(new_log, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(new_log,
                                                                         parameters=self.parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(new_log,
                                                                     parameters=self.parameters).keys())
                        self.children.append(
                            SubtreePlain(new_log, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                         self.counts, self.rec_depth + 1,
                                         noise_threshold=self.noise_threshold,
                                         start_activities=start_activities, end_activities=end_activities,
                                         initial_start_activities=self.initial_start_activities,
                                         initial_end_activities=self.initial_end_activities,
                                         parameters=parameters))
                    else:
                        logging.debug("flower model")
                        activites_left = []
                        for trace in self.log:
                            for act in trace:
                                if act[activity_key] not in activites_left:
                                    activites_left.append(act[activity_key])
                        self.detected_cut = 'flower'
def detect_cut(self, second_iteration=False, parameters=None):
    if pkgutil.find_loader("networkx"):
        import networkx as nx

        if parameters is None:
            parameters = {}
        activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                                  pmutil.xes_constants.DEFAULT_NAME_KEY)
        # check base cases:
        empty_log = base_case.empty_log(self.log)
        single_activity = base_case.single_activity(self.log, activity_key)
        if empty_log:
            self.detected_cut = 'empty_log'
        elif single_activity:
            self.detected_cut = 'single_activity'
        # if no base cases are found, search for a cut:
        else:
            conn_components = detection_utils.get_connected_components(self.ingoing, self.outgoing,
                                                                       self.activities)
            this_nx_graph = transform_dfg_to_directed_nx_graph(self.dfg, activities=self.activities)
            strongly_connected_components = [list(x) for x in
                                             nx.strongly_connected_components(this_nx_graph)]
            xor_cut = self.detect_xor(conn_components)
            # the following part searches for a cut in the current log_skeleton;
            # if a cut is found, the log_skeleton is split according to the cut,
            # the resulting logs are saved in new_logs, and recursion is used on all of them
            if xor_cut[0]:
                logging.debug("xor_cut")
                self.detected_cut = 'concurrent'
                new_logs = split.split_xor(xor_cut[1], self.log, activity_key)
                for i in range(len(new_logs)):
                    new_logs[i] = filtering_utils.keep_one_trace_per_variant(new_logs[i], parameters=parameters)
                for l in new_logs:
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items() if v > 0]
                    activities = attributes_filter.get_attribute_values(l, activity_key)
                    start_activities = list(
                        start_activities_filter.get_start_activities(l, parameters=parameters).keys())
                    end_activities = list(
                        end_activities_filter.get_end_activities(l, parameters=parameters).keys())
                    self.children.append(
                        SubtreePlain(l, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts,
                                     self.rec_depth + 1, noise_threshold=self.noise_threshold,
                                     start_activities=start_activities, end_activities=end_activities,
                                     initial_start_activities=self.initial_start_activities,
                                     initial_end_activities=self.initial_end_activities,
                                     parameters=parameters))
            else:
                sequence_cut = cut_detection.detect_sequential_cut(self, self.dfg,
                                                                   strongly_connected_components)
                if sequence_cut[0]:
                    logging.debug("sequence_cut")
                    new_logs = split.split_sequence(sequence_cut[1], self.log, activity_key)
                    for i in range(len(new_logs)):
                        new_logs[i] = filtering_utils.keep_one_trace_per_variant(new_logs[i],
                                                                                 parameters=parameters)
                    self.detected_cut = "sequential"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(l, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(l, parameters=parameters).keys())
                        self.children.append(
                            SubtreePlain(l, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                         self.counts, self.rec_depth + 1,
                                         noise_threshold=self.noise_threshold,
                                         start_activities=start_activities, end_activities=end_activities,
                                         initial_start_activities=self.initial_start_activities,
                                         initial_end_activities=self.initial_end_activities,
                                         parameters=parameters))
                else:
                    parallel_cut = self.detect_concurrent()
                    if parallel_cut[0]:
                        logging.debug("parallel_cut")
                        new_logs = split.split_parallel(parallel_cut[1], self.log, activity_key)
                        for i in range(len(new_logs)):
                            new_logs[i] = filtering_utils.keep_one_trace_per_variant(new_logs[i],
                                                                                     parameters=parameters)
                        self.detected_cut = "parallel"
                        for l in new_logs:
                            new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items() if
                                       v > 0]
                            activities = attributes_filter.get_attribute_values(l, activity_key)
                            start_activities = list(
                                start_activities_filter.get_start_activities(l, parameters=parameters).keys())
                            end_activities = list(
                                end_activities_filter.get_end_activities(l, parameters=parameters).keys())
                            self.children.append(
                                SubtreePlain(l, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                             self.counts, self.rec_depth + 1,
                                             noise_threshold=self.noise_threshold,
                                             start_activities=start_activities,
                                             end_activities=end_activities,
                                             initial_start_activities=self.initial_start_activities,
                                             initial_end_activities=self.initial_end_activities,
                                             parameters=parameters))
                    else:
                        loop_cut = self.detect_loop()
                        if loop_cut[0]:
                            logging.debug("loop_cut")
                            new_logs = split.split_loop(loop_cut[1], self.log, activity_key)
                            for i in range(len(new_logs)):
                                new_logs[i] = filtering_utils.keep_one_trace_per_variant(new_logs[i],
                                                                                         parameters=parameters)
                            self.detected_cut = "loopCut"
                            for l in new_logs:
                                new_dfg = [(k, v) for k, v in dfg_inst.apply(l, parameters=parameters).items()
                                           if v > 0]
                                activities = attributes_filter.get_attribute_values(l, activity_key)
                                start_activities = list(
                                    start_activities_filter.get_start_activities(l,
                                                                                 parameters=parameters).keys())
                                end_activities = list(
                                    end_activities_filter.get_end_activities(l,
                                                                             parameters=parameters).keys())
                                self.children.append(
                                    SubtreePlain(l, new_dfg, self.master_dfg, self.initial_dfg, activities,
                                                 self.counts, self.rec_depth + 1,
                                                 noise_threshold=self.noise_threshold,
                                                 start_activities=start_activities,
                                                 end_activities=end_activities,
                                                 initial_start_activities=self.initial_start_activities,
                                                 initial_end_activities=self.initial_end_activities,
                                                 parameters=parameters))
                        # if the code gets to this point, there is no base_case and no cut found in the
                        # log_skeleton; therefore, we now apply fall through:
                        else:
                            self.apply_fall_through(parameters)
    else:
        msg = "networkx is not available. inductive miner cannot be used!"
        logging.error(msg)
        raise Exception(msg)
def apply(log, parameters=None):
    """
    Gets the frequency DFG

    Parameters
    ------------
    log
        Log
    parameters
        Parameters of the algorithm

    Returns
    ------------
    base64
        Base64 of an SVG representing the model
    model
        Text representation of the model
    format
        Format of the model
    """
    if parameters is None:
        parameters = {}
    decreasingFactor = parameters[
        "decreasingFactor"] if "decreasingFactor" in parameters else constants.DEFAULT_DEC_FACTOR
    activity_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    log = attributes_filter.filter_log_on_max_no_activities(log, max_no_activities=constants.MAX_NO_ACTIVITIES,
                                                            parameters=parameters)
    filtered_log = auto_filter.apply_auto_filter(log, parameters=parameters)
    activities_count = attributes_filter.get_attribute_values(filtered_log, activity_key)
    activities = list(activities_count.keys())
    start_activities = list(
        start_activities_filter.get_start_activities(filtered_log, parameters=parameters).keys())
    end_activities = list(
        end_activities_filter.get_end_activities(filtered_log, parameters=parameters).keys())
    dfg = dfg_factory.apply(filtered_log, parameters=parameters)
    dfg = clean_dfg_based_on_noise_thresh(dfg, activities,
                                          decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
                                          parameters=parameters)
    parameters["format"] = "svg"
    parameters["start_activities"] = start_activities
    parameters["end_activities"] = end_activities
    gviz = dfg_vis_factory.apply(dfg, log=filtered_log, variant="frequency", parameters=parameters)
    gviz_base64 = base64.b64encode(str(gviz).encode('utf-8'))
    ret_graph = get_graph.get_graph_from_dfg(dfg, start_activities, end_activities)
    net, im, fm = dfg_conv_factory.apply(dfg, parameters={"start_activities": start_activities,
                                                          "end_activities": end_activities})
    return get_base64_from_gviz(gviz), export_petri_as_string(net, im, fm), ".pnml", "xes", activities, \
        start_activities, end_activities, gviz_base64, ret_graph, "dfg", "freq", None, "", activity_key