def test_statistics_log(self):
    """Smoke-test the statistics helpers of the simplified interface on an XES log."""
    event_log = pm4py.read_xes("input_data/running-example.xes")
    # Each helper only needs to run without raising; results are discarded.
    for statistic in (pm4py.get_start_activities,
                      pm4py.get_end_activities,
                      pm4py.get_attributes,
                      pm4py.get_trace_attributes):
        statistic(event_log)
    pm4py.get_attribute_values(event_log, "org:resource")
    pm4py.get_variants(event_log)
def test_statistics_df(self):
    """Smoke-test the statistics helpers of the simplified interface on a dataframe."""
    frame = pd.read_csv("input_data/running-example.csv")
    # Declare the mandatory process-mining columns before querying statistics.
    frame = pm4py.format_dataframe(frame,
                                   case_id="case:concept:name",
                                   activity_key="concept:name",
                                   timestamp_key="time:timestamp")
    for statistic in (pm4py.get_start_activities,
                      pm4py.get_end_activities,
                      pm4py.get_attributes):
        statistic(frame)
    pm4py.get_attribute_values(frame, "org:resource")
    pm4py.get_variants(frame)
def test_filter_act_percentage(self):
    """Smoke-test filtering a DFG down to a percentage of its activities."""
    from pm4py.algo.filtering.dfg import dfg_filtering
    event_log = pm4py.read_xes("input_data/running-example.xes")
    dfg, start_acts, end_acts = pm4py.discover_dfg(event_log)
    activity_count = pm4py.get_attribute_values(event_log, "concept:name")
    # Keep only 10% of the activities; the call itself is the assertion here.
    dfg_filtering.filter_dfg_on_activities_percentage(
        dfg, start_acts, end_acts, activity_count, 0.1)
def execute_script():
    """Simplify the receipt-log DFG, play it out, and compare trace probabilities."""
    event_log = pm4py.read_xes(
        os.path.join("..", "tests", "input_data", "receipt.xes"))
    activities = pm4py.get_attribute_values(event_log, "concept:name")
    dfg, start_acts, end_acts = pm4py.discover_dfg(event_log)
    # Simplify the DFG: keep 50% of the activities, then 50% of the paths.
    keep_ratio = 0.5
    dfg, start_acts, end_acts, activities = dfg_filtering.filter_dfg_on_activities_percentage(
        dfg, start_acts, end_acts, activities, keep_ratio)
    dfg, start_acts, end_acts, activities = dfg_filtering.filter_dfg_on_paths_percentage(
        dfg, start_acts, end_acts, activities, keep_ratio)
    # Play out the simplified DFG into a simulated log.
    simulated_log = dfg_playout.apply(dfg, start_acts, end_acts)
    print(simulated_log)
    print(len(simulated_log))
    print(sum(trace.attributes["probability"] for trace in simulated_log))
    # Visualize both DFGs to show that they coincide.
    pm4py.view_dfg(dfg, start_acts, end_acts, log=event_log, format="svg")
    new_dfg, new_sa, new_ea = pm4py.discover_dfg(simulated_log)
    pm4py.view_dfg(new_dfg, new_sa, new_ea, log=simulated_log, format="svg")
    # Inspect the first simulated trace and its stored vs. recomputed probability.
    for trace in simulated_log:
        print(list(event["concept:name"] for event in trace))
        print(trace.attributes["probability"],
              dfg_playout.get_trace_probability(trace, dfg, start_acts, end_acts))
        break
    # On the unfiltered DFG: total probability mass covered by the log's variants.
    dfg, start_acts, end_acts = pm4py.discover_dfg(event_log)
    variants = pm4py.get_variants(event_log)
    sum_prob_log_variants = sum(
        (dfg_playout.get_trace_probability(variants[variant][0], dfg, start_acts, end_acts)
         for variant in variants), 0.0)
    print(
        "percentage of behavior allowed from DFG that is in the log (from 0.0 to 1.0): ",
        sum_prob_log_variants)
def detect(log: EventLog, alphabet: Dict[str, int], act_key: str,
           use_msd: bool) -> Optional[str]:
    """
    Fall-through detection: find an activity that occurs in every trace of the
    log such that, once that activity is projected away, one of the standard
    inductive-miner cuts (sequence, xor, concurrent, loop) applies to the
    projected log.

    Parameters
    ----------
    log
        Event log.
    alphabet
        Maps each activity label to its frequency (only the keys are used).
    act_key
        Attribute key holding the activity label.
    use_msd
        Whether to hand minimum-self-distance witnesses to the concurrency cut.

    Returns
    -------
    The activity that can be factored out, or None if no such activity exists.
    """
    # Candidate activities must appear in every single trace.
    candidates = set(alphabet.keys())
    for t in log:
        candidates = candidates.intersection(set(map(lambda e: e[act_key], t)))
        if len(candidates) == 0:
            return None
    for a in candidates:
        # Project the candidate activity out of every trace.
        proj = EventLog()
        for t in log:
            proj.append(pm4py.filter_trace(lambda e: e[act_key] != a, t))
        # Only proceed if no trace became empty after the projection.
        if len(list(filter(lambda t: len(t) == 0, proj))) == 0:
            dfg_proj = discover_dfg.apply(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            alphabet_proj = pm4py.get_attribute_values(proj, act_key)
            start_act_proj = get_starters.get_start_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            # BUG FIX: end activities must be computed on the projected log
            # (was: the original log), consistent with the start activities
            # and the projected DFG used by the cut detectors below.
            end_act_proj = get_ends.get_end_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            pre_proj, post_proj = dfg_utils.get_transitive_relations(
                dfg_proj, alphabet_proj)
            cut = sequence_cut.detect(alphabet_proj, pre_proj, post_proj)
            if cut is not None:
                return a
            cut = xor_cut.detect(dfg_proj, alphabet_proj)
            if cut is not None:
                return a
            # NOTE(review): msd values are computed on the full log while the
            # witnesses are derived on the projection — confirm this is intended.
            cut = concurrent_cut.detect(
                dfg_proj, alphabet_proj, start_act_proj, end_act_proj,
                msd=msdw_algo.derive_msd_witnesses(
                    proj,
                    msd_algo.apply(log, parameters={
                        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key}),
                    parameters={
                        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
                if use_msd else None)
            if cut is not None:
                return a
            cut = loop_cut.detect(dfg_proj, alphabet_proj, start_act_proj,
                                  end_act_proj)
            if cut is not None:
                return a
    return None
def get_change_points(log):
    """
    Count the log's events per calendar day and run Pelt change-point
    detection (module-level MODEL / PENALTY) on the resulting daily series.
    The detection plot is saved as 'change_points.png' and shown.

    Returns a pair: (dict mapping 'YYYY-MM-DD' to event count, change points).
    """
    timestamps = pm4py.get_attribute_values(log, 'time:timestamp')
    first_day = min(timestamps).date()
    last_day = max(timestamps).date()
    print("Start date: ", first_day, "\nEnd date: ", last_day)
    # One zero-initialised bucket per day in the log's time span (inclusive).
    span_days = (last_day - first_day).days
    event_counts = {
        (first_day + datetime.timedelta(days=offset)).strftime('%Y-%m-%d'): 0
        for offset in range(span_days + 1)
    }
    for stamp in timestamps:
        event_counts[stamp.date().strftime('%Y-%m-%d')] += 1
    signal = np.array(list(event_counts.values()))
    # Change-point detection on the daily event-count signal.
    detect_result = rpt.Pelt(model=MODEL).fit(signal).predict(pen=PENALTY)
    # Render, persist, and display the detection result.
    rpt.display(signal, detect_result, detect_result)
    plt.savefig('change_points.png')
    plt.show()
    print('Change point plot is saved as "change_points.png"')
    return event_counts, detect_result
def test_dfg_align(self):
    """Smoke-test alignments computed directly against a filtered DFG."""
    import pm4py
    from pm4py.objects.dfg.filtering import dfg_filtering
    from pm4py.objects.dfg.utils import dfg_alignment
    event_log = pm4py.read_xes(os.path.join("input_data", "running-example.xes"))
    dfg, start_acts, end_acts = pm4py.discover_dfg(event_log)
    activity_count = pm4py.get_attribute_values(event_log, "concept:name")
    # Simplify before aligning: keep 50% of activities, then 50% of paths.
    for simplify in (dfg_filtering.filter_dfg_on_activities_percentage,
                     dfg_filtering.filter_dfg_on_paths_percentage):
        dfg, start_acts, end_acts, activity_count = simplify(
            dfg, start_acts, end_acts, activity_count, 0.5)
    aligned_traces = dfg_alignment.apply(event_log, dfg, start_acts, end_acts)
def apply(log: Union[DataFrame, EventLog, EventStream],
          parameters: Optional[Dict[str, Any]] = None) -> Dict[str, int]:
    '''
    Computes the minimum self-distance for each activity observed in an event log.

    The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is
    1, etc. The minimum self distance is the minimal observed self distance
    value in the event log. The activity key needs to be specified in the
    parameters input object (if None, default value 'concept:name' is used).

    Parameters
    ----------
    log
        event log (either pandas.DataFrame, EventLog or EventStream)
    parameters
        parameters object;

    Returns
    -------
    dict mapping an activity to its minimum self-distance; activities that
    never repeat within a trace are not part of the dict.
    '''
    log = pm4py.convert_to_event_log(log)
    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                         xes_constants.DEFAULT_NAME_KEY)
    alphabet = pm4py.get_attribute_values(log, act_key)
    # Reduce each trace to its sequence of activity labels.
    traces = [[event[act_key] for event in trace] for trace in log]
    min_self_distances = dict()
    for activity in alphabet:
        # Gather, over all traces, the gaps between consecutive occurrences.
        gaps = []
        for labels in traces:
            occurrences = [i for i, label in enumerate(labels) if label == activity]
            # A self-distance only exists when the activity repeats in a trace.
            if len(occurrences) > 1:
                gaps.extend(occurrences[i] - occurrences[i - 1] - 1
                            for i in range(1, len(occurrences)))
        if gaps:
            min_self_distances[activity] = min(gaps)
    return min_self_distances
def derive_msd_witnesses(
        log: EventLog,
        msd: Optional[Dict[str, int]] = None,
        parameters: Optional[Dict[str, Any]] = None) -> Dict[str, Set[str]]:
    '''
    Derives the minimum self-distance (MSD) witnesses of each activity.

    The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a>
    is 1, etc. A 'witness' is an activity observed inside an occurrence pair
    that realizes the minimum self distance: if the MSD of a is 2 and trace
    <a,b,c,a> is in the log, then b and c witness a.

    Parameters
    ----------
    log
        Event Log to use
    msd
        Optional minimum self distance dictionary (computed if None)
    parameters
        Optional parameters dictionary

    Returns
    -------
    Dictionary mapping each activity with a positive MSD to its witness set.
    '''
    log = pm4py.convert_to_event_log(log)
    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                         xes_constants.DEFAULT_NAME_KEY)
    alphabet = pm4py.get_attribute_values(log, act_key)
    if msd is None:
        msd = msd_algo.apply(log, parameters)
    # Reduce each trace to its sequence of activity labels.
    traces = [[event[act_key] for event in trace] for trace in log]
    witnesses = dict()
    for activity in alphabet:
        # Only activities with a known, strictly positive MSD have witnesses.
        if activity not in msd or msd[activity] <= 0:
            continue
        found = set()
        for labels in traces:
            positions = [i for i, label in enumerate(labels) if label == activity]
            if len(positions) > 1:
                # Pairs of consecutive occurrences whose gap equals the MSD.
                for left, right in zip(positions, positions[1:]):
                    if right - left - 1 == msd[activity]:
                        found.update(labels[left + 1:right])
        witnesses[activity] = found
    return witnesses
def get_process_svg():
    """
    HTTP endpoint: build a process model for the event log selected by the
    request parameters and return it rendered as an SVG string.

    ``ext_type`` (default ``"document_flow_log"``) is mapped to a log type:
      * log_type 0  -> MVP model discovered and rendered via pm4pymdl;
      * log_type 1/2 -> frequency DFG, simplified to 20% of its paths.
    """
    parameters = request.args.get("parameters")
    parameters = __process_parameters(parameters)
    log = __prepare_event_log(parameters)
    ext_type = parameters[
        "ext_type"] if "ext_type" in parameters else "document_flow_log"
    log_type = __get_log_type_from_ext_type(ext_type)
    if log_type == 0:
        log.type = "succint"
        from pm4pymdl.algo.mvp.gen_framework import algorithm as discovery
        from pm4pymdl.visualization.mvp.gen_framework import visualizer as vis_factory
        model = discovery.apply(log, model_type_variant="model3",
                                node_freq_variant="type31",
                                edge_freq_variant="type11")
        gviz = vis_factory.apply(model, parameters={"format": "svg"})
    elif log_type == 1 or log_type == 2:
        import pandas as pd
        if type(log) is pd.DataFrame:
            # Dataframe path: use the pandas-specific DFG / endpoint retrieval.
            from pm4py.objects.dfg.retrieval.pandas import get_dfg_graph
            dfg = get_dfg_graph(log)
            from pm4py.statistics.start_activities.pandas import get as pd_sa_get
            from pm4py.statistics.end_activities.pandas import get as pd_ea_get
            sa = pd_sa_get.get_start_activities(log)
            ea = pd_ea_get.get_end_activities(log)
        else:
            dfg, sa, ea = pm4py.discover_dfg(log)
        act_count = pm4py.get_attribute_values(log, "concept:name")
        # Keep only 20% of the paths but all activities, to declutter the view.
        dfg, sa, ea, act_count = dfg_filtering.filter_dfg_on_paths_percentage(
            dfg, sa, ea, act_count, 0.2, keep_all_activities=True)
        gviz = pm4py.visualization.dfg.visualizer.apply(
            dfg,
            activities_count=act_count,
            parameters={
                "format": "svg",
                "start_activities": sa,
                "end_activities": ea
            })
    # NOTE(review): if log_type is neither 0, 1 nor 2, 'gviz' is unbound here
    # and a NameError is raised — confirm whether other log types can occur.
    ser = pm4py.visualization.dfg.visualizer.serialize(gviz).decode("utf-8")
    return ser
def execute_script():
    """Discover a DFG from the receipt log, simplify it, and view it as SVG."""
    event_log = pm4py.read_xes("../tests/input_data/receipt.xes")
    dfg, start_acts, end_acts = pm4py.discover_dfg(event_log)
    activity_count = pm4py.get_attribute_values(event_log, "concept:name")
    filtering = pm4py.objects.dfg.filtering.dfg_filtering
    # keep the specified share of activities, then of paths
    dfg, start_acts, end_acts, activity_count = filtering.filter_dfg_on_activities_percentage(
        dfg, start_acts, end_acts, activity_count, 0.3)
    dfg, start_acts, end_acts, activity_count = filtering.filter_dfg_on_paths_percentage(
        dfg, start_acts, end_acts, activity_count, 0.3)
    # render and show the simplified DFG
    freq_params = dfg_visualizer.Variants.FREQUENCY.value.Parameters
    gviz = dfg_visualizer.apply(
        dfg,
        activities_count=activity_count,
        parameters={
            freq_params.START_ACTIVITIES: start_acts,
            freq_params.END_ACTIVITIES: end_acts,
            freq_params.FORMAT: "svg"
        })
    dfg_visualizer.view(gviz)
def execute_script():
    """
    End-to-end smoke test of pm4py's simplified interface: I/O (XES/CSV),
    conversions, discovery (DFG, alpha/inductive/heuristics Petri nets,
    process tree, heuristics net), serialization round-trips, visualization,
    conformance checking, statistics, and filtering. Temporary files written
    along the way are removed at the end.
    """
    ENABLE_VISUALIZATION = True

    # reads a XES into an event log
    log1 = pm4py.read_xes("../tests/input_data/running-example.xes")
    # reads a CSV into a dataframe
    df = pd.read_csv("../tests/input_data/running-example.csv")
    # formats the dataframe with the mandatory columns for process mining purposes
    df = pm4py.format_dataframe(df, case_id="case:concept:name",
                                activity_key="concept:name",
                                timestamp_key="time:timestamp")
    # converts the dataframe to an event log
    log2 = pm4py.convert_to_event_log(df)
    # converts the log read from XES into a stream and dataframe respectively
    stream1 = pm4py.convert_to_event_stream(log1)
    df2 = pm4py.convert_to_dataframe(log1)
    # writes the log1 to a XES file
    pm4py.write_xes(log1, "ru1.xes")

    # discovery: DFG and the different model families
    dfg, dfg_sa, dfg_ea = pm4py.discover_dfg(log1)
    petri_alpha, im_alpha, fm_alpha = pm4py.discover_petri_net_alpha(log1)
    petri_inductive, im_inductive, fm_inductive = pm4py.discover_petri_net_inductive(
        log1)
    petri_heuristics, im_heuristics, fm_heuristics = pm4py.discover_petri_net_heuristics(
        log1)
    tree_inductive = pm4py.discover_tree_inductive(log1)
    heu_net = pm4py.discover_heuristics_net(log1)

    # serialize the discovered models ...
    pm4py.write_dfg(dfg, dfg_sa, dfg_ea, "ru_dfg.dfg")
    pm4py.write_petri_net(petri_alpha, im_alpha, fm_alpha, "ru_alpha.pnml")
    pm4py.write_petri_net(petri_inductive, im_inductive, fm_inductive,
                          "ru_inductive.pnml")
    pm4py.write_petri_net(petri_heuristics, im_heuristics, fm_heuristics,
                          "ru_heuristics.pnml")
    pm4py.write_process_tree(tree_inductive, "ru_inductive.ptml")
    # ... and read them back (round-trip check)
    dfg, dfg_sa, dfg_ea = pm4py.read_dfg("ru_dfg.dfg")
    petri_alpha, im_alpha, fm_alpha = pm4py.read_petri_net("ru_alpha.pnml")
    petri_inductive, im_inductive, fm_inductive = pm4py.read_petri_net(
        "ru_inductive.pnml")
    petri_heuristics, im_heuristics, fm_heuristics = pm4py.read_petri_net(
        "ru_heuristics.pnml")
    tree_inductive = pm4py.read_process_tree("ru_inductive.ptml")

    # save visualizations to PNG files (removed at the end of the script)
    pm4py.save_vis_petri_net(petri_alpha, im_alpha, fm_alpha, "ru_alpha.png")
    pm4py.save_vis_petri_net(petri_inductive, im_inductive, fm_inductive,
                             "ru_inductive.png")
    pm4py.save_vis_petri_net(petri_heuristics, im_heuristics, fm_heuristics,
                             "ru_heuristics.png")
    pm4py.save_vis_process_tree(tree_inductive, "ru_inductive_tree.png")
    pm4py.save_vis_heuristics_net(heu_net, "ru_heunet.png")
    pm4py.save_vis_dfg(dfg, dfg_sa, dfg_ea, "ru_dfg.png")

    # interactive on-screen visualization (gated by the flag above)
    if ENABLE_VISUALIZATION:
        pm4py.view_petri_net(petri_alpha, im_alpha, fm_alpha, format="svg")
        pm4py.view_petri_net(petri_inductive, im_inductive, fm_inductive,
                             format="svg")
        pm4py.view_petri_net(petri_heuristics, im_heuristics, fm_heuristics,
                             format="svg")
        pm4py.view_process_tree(tree_inductive, format="svg")
        pm4py.view_heuristics_net(heu_net, format="svg")
        pm4py.view_dfg(dfg, dfg_sa, dfg_ea, format="svg")

    # conformance checking against the inductive model (results printed below;
    # aligned_traces / replayed_traces are intentionally unused smoke calls)
    aligned_traces = pm4py.conformance_alignments(log1, petri_inductive,
                                                  im_inductive, fm_inductive)
    replayed_traces = pm4py.conformance_tbr(log1, petri_inductive, im_inductive,
                                            fm_inductive)
    fitness_tbr = pm4py.evaluate_fitness_tbr(log1, petri_inductive,
                                             im_inductive, fm_inductive)
    print("fitness_tbr", fitness_tbr)
    fitness_align = pm4py.evaluate_fitness_alignments(log1, petri_inductive,
                                                      im_inductive, fm_inductive)
    print("fitness_align", fitness_align)
    precision_tbr = pm4py.evaluate_precision_tbr(log1, petri_inductive,
                                                 im_inductive, fm_inductive)
    print("precision_tbr", precision_tbr)
    precision_align = pm4py.evaluate_precision_alignments(
        log1, petri_inductive, im_inductive, fm_inductive)
    print("precision_align", precision_align)

    # statistics, on the log and on the dataframe in parallel
    print("log start activities = ", pm4py.get_start_activities(log2))
    print("df start activities = ", pm4py.get_start_activities(df2))
    print("log end activities = ", pm4py.get_end_activities(log2))
    print("df end activities = ", pm4py.get_end_activities(df2))
    print("log attributes = ", pm4py.get_attributes(log2))
    print("df attributes = ", pm4py.get_attributes(df2))
    print("log org:resource values = ",
          pm4py.get_attribute_values(log2, "org:resource"))
    print("df org:resource values = ",
          pm4py.get_attribute_values(df2, "org:resource"))

    # filtering, again on the log and on the dataframe in parallel
    print("start_activities len(filt_log) = ",
          len(pm4py.filter_start_activities(log2, ["register request"])))
    print("start_activities len(filt_df) = ",
          len(pm4py.filter_start_activities(df2, ["register request"])))
    print("end_activities len(filt_log) = ",
          len(pm4py.filter_end_activities(log2, ["pay compensation"])))
    print("end_activities len(filt_df) = ",
          len(pm4py.filter_end_activities(df2, ["pay compensation"])))
    print(
        "attributes org:resource len(filt_log) (cases) cases = ",
        len(
            pm4py.filter_attribute_values(log2, "org:resource", ["Ellen"],
                                          level="case")))
    print(
        "attributes org:resource len(filt_log) (cases) events = ",
        len(
            pm4py.filter_attribute_values(log2, "org:resource", ["Ellen"],
                                          level="event")))
    print(
        "attributes org:resource len(filt_df) (events) cases = ",
        len(
            pm4py.filter_attribute_values(df2, "org:resource", ["Ellen"],
                                          level="case")))
    print(
        "attributes org:resource len(filt_df) (events) events = ",
        len(
            pm4py.filter_attribute_values(df2, "org:resource", ["Ellen"],
                                          level="event")))
    print(
        "attributes org:resource len(filt_df) (events) events notpositive = ",
        len(
            pm4py.filter_attribute_values(df2, "org:resource", ["Ellen"],
                                          level="event", retain=False)))
    print("variants log = ", pm4py.get_variants(log2))
    print("variants df = ", pm4py.get_variants(df2))
    print(
        "variants filter log = ",
        len(
            pm4py.filter_variants(log2, [[
                "register request", "examine thoroughly", "check ticket",
                "decide", "reject request"
            ]])))
    print(
        "variants filter df = ",
        len(
            pm4py.filter_variants(df2, [[
                "register request", "examine thoroughly", "check ticket",
                "decide", "reject request"
            ]])))
    print("variants filter percentage = ",
          len(pm4py.filter_variants_percentage(log2, threshold=0.8)))
    print(
        "paths filter log len = ",
        len(
            pm4py.filter_directly_follows_relation(
                log2, [("register request", "examine casually")])))
    print(
        "paths filter dataframe len = ",
        len(
            pm4py.filter_directly_follows_relation(
                df2, [("register request", "examine casually")])))
    # time-range filtering in the three supported modes, for log and dataframe
    print(
        "timeframe filter log events len = ",
        len(
            pm4py.filter_time_range(log2, "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00", mode="events")))
    print(
        "timeframe filter log traces_contained len = ",
        len(
            pm4py.filter_time_range(log2, "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_contained")))
    print(
        "timeframe filter log traces_intersecting len = ",
        len(
            pm4py.filter_time_range(log2, "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_intersecting")))
    print(
        "timeframe filter df events len = ",
        len(
            pm4py.filter_time_range(df2, "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00", mode="events")))
    print(
        "timeframe filter df traces_contained len = ",
        len(
            pm4py.filter_time_range(df2, "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_contained")))
    print(
        "timeframe filter df traces_intersecting len = ",
        len(
            pm4py.filter_time_range(df2, "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_intersecting")))

    # remove the temporary files
    os.remove("ru1.xes")
    os.remove("ru_dfg.dfg")
    os.remove("ru_alpha.pnml")
    os.remove("ru_inductive.pnml")
    os.remove("ru_heuristics.pnml")
    os.remove("ru_inductive.ptml")
    os.remove("ru_alpha.png")
    os.remove("ru_inductive.png")
    os.remove("ru_heuristics.png")
    os.remove("ru_inductive_tree.png")
    os.remove("ru_heunet.png")
    os.remove("ru_dfg.png")
def test_filter_paths_percentage(self):
    """Smoke-test filtering a DFG down to a percentage of its paths."""
    event_log = pm4py.read_xes("input_data/running-example.xes")
    dfg, start_acts, end_acts = pm4py.discover_dfg(event_log)
    activity_count = pm4py.get_attribute_values(event_log, "concept:name")
    # Keep only 30% of the paths; the call itself is the assertion here.
    filter_paths = pm4py.objects.dfg.filtering.dfg_filtering.filter_dfg_on_paths_percentage
    filter_paths(dfg, start_acts, end_acts, activity_count, 0.3)
def lstm_algorithm(index, len_of_points, resample_dataset, LSTM_CELLS):
    """
    Train and evaluate an LSTM next-activity predictor on one of several
    pre-configured event-log datasets.

    Pipeline: load the dataset (CSV or XES) selected by ``index``, normalize
    it into a (case, activity, timestamp) dataframe, optionally resample it to
    5-minute buckets, concatenate each case's activities into a space-separated
    string (optionally re-chunked into windows of ``len_of_points`` events),
    tokenize, build fixed-length input/target sequences, train a Keras
    Embedding+LSTM model, and write plots, per-prediction results and summary
    metrics under ``results/<filename>/<len_of_points>/``.

    Parameters
    ----------
    index
        Key into the ``fileinfo`` table below, selecting the dataset.
    len_of_points
        Window length for re-chunking each case's activity sequence
        (0/falsy disables re-chunking).
    resample_dataset
        If truthy, resample the event stream to 5-minute intervals.
    LSTM_CELLS
        Size of the embedding vectors fed into the LSTM layer.

    NOTE(review): relies on a module-level ``n_input`` (sequence context
    length) plus keras/sklearn/matplotlib/seaborn imports — confirm they are
    defined at module scope. No value is returned; all output is side effects
    (files and stdout).
    """
    #print('Current path: ',os.getcwd())
    if not os.path.exists('results'):
        os.makedirs('results')
    COLOR_CYCLE = ["#4286f4", "#f44174"]  # NOTE(review): appears unused here
    split_percentage = 0.8  # train/test split ratio
    """
    answer = input('Give me the length of window: ')
    if answer == 'max':
        len_of_points = 0
    else:
        len_of_points = int(answer)
    #len_of_points = 3
    """
    # Per-dataset configuration: file name, CSV separator, column names
    # (empty separator/columns entries are XES files).
    fileinfo = {
        0: {
            'filename': 'base_kasteren.csv',
            'separator': ' ',
            'columns': ['date', 'time', 'attr1', 'attr2', 'state',
                        'concept:name']
        },
        1: {
            'filename': 'activity3.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        2: {
            'filename': 'activitylog_uci_detailed_labour.xes',
            'separator': '',
            'columns': []
        },
        3: {
            'filename': 'atmo1.csv',
            'separator': ' ',
            'columns': ['date', 'time', 'concept:name', 'state', 'activity']
        },
        4: {
            'filename': 'activity1.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        5: {
            'filename': 'activity2.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        6: {
            'filename': 'espa.xes',
            'separator': ';',
            'columns': []
        },
        7: {
            'filename': 'activity3.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        8: {
            'filename': 'activity4.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        9: {
            'filename': 'activity5.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        10: {
            'filename': 'activity6.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        11: {
            'filename': 'activity7.csv',
            'separator': ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        12: {
            'filename': 'BPI_Challenge_2017.xes',
            'separator': '',
            'columns': []
        },
    }
    #choose file
    filename = fileinfo[index]['filename']
    filepath = '../datasets/' + fileinfo[index]['filename']
    dataframe = pd.DataFrame()
    # per-dataset / per-window output directories
    if not os.path.exists('results/' + filename):
        os.makedirs('results/' + filename)
    if not os.path.exists('results/' + filename + '/' + str(len_of_points) +
                          '/'):
        os.makedirs('results/' + filename + '/' + str(len_of_points) + '/')
    #if it is a csv file
    if (filename.find('.csv') != -1):
        #load file to dataframe
        dataframe = pd.read_csv(filepath,
                                sep=fileinfo[index]['separator'],
                                names=fileinfo[index]['columns'],
                                low_memory=False)
        # for the Kasteren-style datasets, synthesize the mandatory
        # process-mining columns from date/time (one case per day)
        if index in [0, 3, 12]:
            dataframe[
                'time:timestamp'] = dataframe['date'] + ' ' + dataframe['time']
            dataframe['case:concept:name'] = dataframe['date']
        #dataframe = dataframe[dataframe['concept:name']!='None']
        #print ("file is csv ")
        #print(dataframe.head(20))
        #convert csv to xes
        log = pm4py.convert_to_event_log(dataframe)
    else:
        #the file is xes
        #xes_importer.iterparse.Parameters.MAX_TRACES = 10
        #parameters = {xes_importer.iterparse.Parameters.MAX_TRACES: 50}
        #log = xes_importer.apply('datasets/BPI Challenge 2018.xes.gz', parameters=parameters)
        log = pm4py.read_xes(filepath)
        print(log)
        #convert to dataframe
        dataframe = pm4py.convert_to_dataframe(log)
        print(dataframe)
        #print(dataframe['time:timestamp'][0].replace(tzinfo=timezone.utc).astimezone(tz=None))
        #dataframe['time:timestamp'] = dataframe['time:timestamp'].dt.tz_convert(None)
    # strip timezone info for the datasets whose timestamps carry one
    # (indices 2 and 12 are XES files, so this runs after XES conversion)
    if index in [2, 12]:
        dataframe['time:timestamp'] = dataframe[
            'time:timestamp'].dt.tz_convert(None)
    #del log
    print('Dataframe print\n', dataframe)
    # keep only the 'complete' lifecycle events; create the column if absent
    if 'lifecycle:transition' in dataframe.columns:
        dataframe = dataframe[dataframe['lifecycle:transition'] == 'complete']
    else:
        dataframe['lifecycle:transition'] = 'complete'
    #remove Start and End events
    dataframe = dataframe[dataframe['concept:name'] != 'Start']
    dataframe = dataframe[dataframe['concept:name'] != 'End']
    #sort by time
    if 'time:timestamp' in dataframe.columns:
        dataframe = dataframe.sort_values('time:timestamp')
    else:
        print('Error: no column time:timestamp in event log')
    #print('Sorted dataframe\n',dataframe)
    #fig = dataframe.plot(x='time:timestamp', y='concept:name', kind="scatter").get_figure()
    #fig.savefig('results/'+filename+'/'+str(len_of_points) +'/conceptname.png', bbox_inches='tight')
    #fig = dataframe.plot(x='time:timestamp', y='case:concept:name', kind="scatter").get_figure()
    #fig.savefig('results/'+filename+'/'+str(len_of_points) +'/caseconceptname.png', bbox_inches='tight')
    #keep only mandatory columns
    dataframe = dataframe[[
        'case:concept:name', 'concept:name', 'time:timestamp'
    ]]
    #convert sorted dataframe to log
    log = pm4py.convert_to_event_log(dataframe)
    #initial_df = dataframe.copy()
    # ----------------------------------------------------------------
    # Optional resampling to a regular 5-minute grid
    # ----------------------------------------------------------------
    if resample_dataset:
        # timestamps must be datetime and serve as the index for resample()
        dataframe.loc[:, 'time:timestamp'] = pd.to_datetime(
            dataframe['time:timestamp'])
        dataframe = dataframe.set_index(["time:timestamp"])
        # drop duplicate timestamps, keeping the first occurrence
        #print(dataframe[dataframe.index.duplicated()])
        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]
        # resample every 5 minutes; backfill gaps with the next known value
        dataframe = dataframe.resample("5T").fillna("backfill")
        print('Resample', dataframe)
        #save resampled dataframe to csv
        dataframe.to_csv('../datasets/resampled_sorted_df.csv')
    #log = pm4py.convert_to_event_log(dataframe)
    #dataframe.to_csv('datasets/activitylog_uci_detailed_labour.csv')
    # ----------------------------------------------------------------
    # Concatenate the activities of each trace into one row
    # (space-separated string per case)
    # ----------------------------------------------------------------
    print('dataframe\n', dataframe)
    df = dataframe.groupby('case:concept:name', sort=False).agg(
        {'concept:name': lambda x: ' '.join(x)})
    print('df\n', df)
    df = df.reset_index()
    # optionally split each case's sequence into windows of len_of_points
    # events, each window becoming its own row
    if len_of_points:
        print('--------------------------------------')
        df['concept:name'] = df['concept:name'].apply(
            lambda x: list(x.split(' ')))
        df['concept:name'] = df['concept:name'].apply(
            lambda x:
            [x[i:i + len_of_points] for i in range(0, len(x), len_of_points)])
        df = df.set_index('case:concept:name')['concept:name'].apply(
            pd.Series).stack().reset_index(level=0).rename(
                columns={0: 'concept:name'})
        df['concept:name'] = df['concept:name'].apply(lambda x: ' '.join(x))
        print('\ndftest\n', df)
        df = df.reset_index()
    #del dataframe
    #print the activities of the log
    activities = pm4py.get_attribute_values(log, 'concept:name')
    print('\nActivities:\n', activities)
    # ----------------------------------------------------------------
    # Train/test split (first 80% of rows train, rest test; no shuffling)
    # ----------------------------------------------------------------
    #df = shuffle(df)
    split_rows = int(df.shape[0] * split_percentage)
    print('Split Rows', split_rows, '\n')
    #train dataframe
    train_df = df[:split_rows]
    train_df.to_csv('train.csv')
    print('Train Rows', train_df, '\n')
    #test dataframe
    test_df = df[split_rows:]
    test_df.to_csv('test.csv')
    #print('Test Rows', test_df,'\n')
    #data = df['concept:name'].copy().to_list()
    data = train_df['concept:name'].copy().to_list()
    #data = data.replace('Eating/Drinking','EatDrink')
    # tokenize: index every word (activity label) by frequency
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])
    print('Word index: ')
    print(tokenizer.word_index)
    # map every activity label to its word index; texts_to_sequences returns a
    # list of lists with a single element, hence the [0]
    encoded = tokenizer.texts_to_sequences([data])[0]
    #print('encoded: \n')
    #print(encoded)
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)
    # build sliding windows of n_input context labels plus one target label
    sequences = list()
    for i in range(n_input, len(encoded)):
        sequence = encoded[i - n_input:i + 1]
        sequences.append(sequence)
    print('Total Sequences: %d' % len(sequences))
    print('Sequences: \n')
    print(sequences)
    max_length = max([len(seq) for seq in sequences])
    # pad every window to the same (maximum) length; 'pre' pads at the front
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    print('Max Sequence Length: %d' % max_length)
    # split each window into input context X and target label y
    sequences = array(sequences)
    X, y = sequences[:, :-1], sequences[:, -1]
    print('X: \n')
    print(X)
    print('y: \n')
    print(y)
    # one-hot encode the targets
    y = to_categorical(y, num_classes=vocab_size)
    print('y: \n')
    print(y)
    # same windowing/padding/encoding for the held-out test rows
    test_data = test_df['concept:name'].copy().to_list()
    test_encoded = tokenizer.texts_to_sequences([test_data])[0]
    test_sequences = list()
    for i in range(n_input, len(test_encoded)):
        test_sequence = test_encoded[i - n_input:i + 1]
        test_sequences.append(test_sequence)
    max_length = max([len(seq) for seq in test_sequences])
    test_sequences = pad_sequences(test_sequences,
                                   maxlen=max_length,
                                   padding='pre')
    test_sequences = array(test_sequences)
    test_X, test_y = test_sequences[:, :-1], test_sequences[:, -1]
    #convert y to binary vectors
    test_yl = to_categorical(test_y, num_classes=vocab_size)
    # ----------------------------------------------------------------
    # Model: Embedding (learned label vectors) -> LSTM -> Dropout -> softmax
    # over the vocabulary. The Embedding input_length is the context size.
    # ----------------------------------------------------------------
    model = Sequential()
    model.add(
        Embedding(vocab_size + 1, LSTM_CELLS, input_length=max_length - 1))
    model.add(LSTM(vocab_size))
    model.add(Dropout(0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    opt = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    history = model.fit(X,
                        y,
                        epochs=500,
                        verbose=0,
                        batch_size=20,
                        validation_data=(test_X, test_yl))
    print(model.summary())
    model.save('lstm_model.h5')  # creates a HDF5 file
    #del model  # deletes the existing model

    # predict a sequence of n_words activities starting from seed_text
    def generate_seq(model, tokenizer, max_length, seed_text, n_words):
        in_text = seed_text
        for _ in range(n_words):
            encoded = tokenizer.texts_to_sequences([in_text])[0]
            # pad if shorter than the model's expected context length
            encoded = pad_sequences([encoded],
                                    maxlen=max_length,
                                    padding='pre')
            # predict one activity (class index)
            #yhat = model.predict_classes(encoded, verbose=0)
            yhat = np.argmax(model.predict(encoded), axis=-1)
            out_word = ''
            # map the predicted index back to its activity label
            for word, index in tokenizer.word_index.items():
                if index == yhat:
                    out_word = word
                    break
            # append the prediction and feed the grown sequence back in
            in_text += ' ' + out_word
        return in_text

    #model = load_model('lstm_model.h5')
    # Evaluate network
    print('LSTM Network Evaluation:\n')
    train_score = model.evaluate(X, y, verbose=0)
    print('Train Score\n', train_score)
    score = model.evaluate(test_X, test_yl, verbose=0)
    print('Test Score\n')
    print(score)
    print('History\n')
    print(history.history.keys())
    # plot loss during training
    fig = plt.figure()
    plt.subplot(211)
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    fig.savefig('results/' + filename + '/' + str(len_of_points) + '/Loss.png',
                bbox_inches='tight')
    # plot accuracy during training
    fig = plt.figure()
    plt.subplot(212)
    plt.title('Accuracy')
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='test')
    plt.legend()
    plt.show()
    fig.savefig('results/' + filename + '/' + str(len_of_points) +
                '/Accuracy.png',
                bbox_inches='tight')
    print('LSTM Results: ')
    print('\n')
    #generated_text = ''
    # sample a generated sequence starting from every known activity
    for i in tokenizer.word_index:
        #print(tokenizer.index_word)
        w = generate_seq(model, tokenizer, max_length - 1, i, n_input + 1)
        #generated_text = generated_text.join('\n'+w)
        print(w)
    print('LSTM Results: ')
    print('\n')
    #for i in tokenizer.word_index:
    #    print(generate_seq(model, tokenizer, max_length-1, i , 1))
    # per-window predictions over the FULL dataset, recorded back into df
    all_data = df['concept:name'].copy().to_list()
    all_encoded = tokenizer.texts_to_sequences([all_data])[0]
    all_sequences = list()
    for i in range(n_input, len(all_encoded)):
        all_sequence = all_encoded[i - n_input:i + 1]
        all_sequences.append(all_sequence)
    max_length = max([len(seq) for seq in all_sequences])
    all_sequences = pad_sequences(all_sequences,
                                  maxlen=max_length,
                                  padding='pre')
    all_sequences = array(all_sequences)
    all_X, all_y = all_sequences[:, :-1], all_sequences[:, -1]
    #convert y to binary vectors
    all_yl = to_categorical(all_y, num_classes=vocab_size)
    #model = load_model('lstm_model.h5')
    #print('Tokenizer \n',tokenizer)
    print('Tokenizer word index\n', tokenizer.word_index)
    np.set_printoptions(suppress=True)
    cnt = 0
    for i in range(len(all_X)):
        #yhat = model.predict_classes(all_X[i].reshape(1,2,1), verbose=0)
        yhat = np.argmax(model.predict(all_X[i].reshape(1, n_input, 1)),
                         axis=-1)
        df.loc[i, 'X_input'] = str(all_X[i])
        df.loc[i, 'Expected'] = all_y[i]
        df.loc[i, 'predicted'] = yhat
        #print('Expected:', all_y[i] , 'Predicted', yhat)
        # store the full class-probability vector as a space-joined string
        prob = model.predict_proba(all_X[i].reshape(1, n_input, 1))[0]
        df.loc[i, 'probabilities'] = ' '.join([str(elem) for elem in list(prob)])
        if (all_y[i] == yhat):
            df.loc[i, 'result'] = 'ok'
            cnt += 1
        else:
            df.loc[i, 'result'] = 'Error'
    #print(df['predicted'].replace(tokenizer.word_index))
    df.to_csv('results/' + filename + '/' + str(len_of_points) + '/resample_' +
              str(resample_dataset) + '_lstm.csv')
    print('Total successful: ', cnt, ' out of ', len(all_X), 'Percentage: ',
          cnt / len(all_X))
    # predict probabilities for test set
    yhat_probs = model.predict(test_X, verbose=0)
    # predict crisp classes for test set
    yhat_classes = model.predict_classes(test_X, verbose=0)
    print('yhat_classes\n', yhat_classes)
    #yhat_probs = yhat_probs[:, 0]
    #yhat_classes = yhat_classes[:, 0]
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(test_y, yhat_classes)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(test_y, yhat_classes, average='weighted')
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(test_y, yhat_classes, average='weighted')
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(test_y, yhat_classes, average='weighted')
    print('F1 score: %f' % f1)
    # kappa
    kappa = cohen_kappa_score(test_y, yhat_classes)
    print('Cohens kappa: %f' % kappa)
    #auc = roc_auc_score(test_y, yhat_probs,multi_class='ovr')
    #print('ROC AUC: %f' % auc)
    # confusion matrix
    matrix = confusion_matrix(test_y, yhat_classes)
    print(matrix)
    fig = plt.figure()
    sns.heatmap(matrix, center=True)
    plt.show()
    fig.savefig('results/' + filename + '/' + str(len_of_points) +
                '/ConfusionMatrix.png',
                bbox_inches='tight')
    # append one tab-separated summary row per run:
    # filename, resample flag, window length, train loss/acc, test loss/acc,
    # accuracy, precision, recall, f1, kappa, (empty), word index as JSON
    fd = open("total_results.csv", "a+")
    row = filename + '\t' + str(resample_dataset) + '\t' + str(
        len_of_points
    ) + '\t' + str(train_score[0]) + '\t' + str(train_score[1]) + '\t' + str(
        score[0]) + '\t' + str(score[1]) + '\t' + str(accuracy) + '\t' + str(
            precision) + '\t' + str(recall) + '\t' + str(f1) + '\t' + str(
                kappa) + '\t' + '' + '\t' + json.dumps(
                    tokenizer.word_index) + '\n'
    fd.write(row)
    fd.close()
def read_data_equitemp(no_intervals, interval_width, sorted_aps, act_map, log,
                       dataset):
    """Slice the log's time span into `no_intervals` equal-length windows and
    build one directly-follows matrix per window.

    For each window the directly-follows pairs whose second event falls into
    the window are collected, counted into an activity-by-activity matrix
    (including artificial 'start'/'end' rows per trace) and the filtered
    events are exported as an interval event log under ./logs/.

    :param no_intervals: number of equal-length time windows
    :param interval_width: unused here -- presumably consumed by a sibling
        aggregation variant; TODO confirm
    :param sorted_aps: directly-follows pair objects; each exposes .event,
        .event2 (events with 'time:timestamp'), .trace_no, .a1, .a2
    :param act_map: activity name -> integer index, including 'start'/'end'
    :param log: the event log; iterated to rebuild per-interval traces
    :param dataset: dataset name used in the exported .xes file path
    :return: (dfg_time_matrix, interval_timing) -- an
        [no_intervals, no_act, no_act] int array of DF counts and a list of
        (lowest, highest) observed-timestamp pairs, one per interval
    """
    timestamps = pm4py.get_attribute_values(log, 'time:timestamp')
    print('Earliest:', min(timestamps))
    print('Latest:', max(timestamps))
    # equitemporal: every window covers the same wall-clock span
    interval_length = (max(timestamps) - min(timestamps)) / no_intervals
    print('Interval length:', interval_length)
    no_act = len(act_map.keys())
    # NOTE(review): matrix is int-typed while the per-interval empty_mat
    # below is float; the assignment truncates -- counts stay integral, so
    # this is harmless but worth confirming.
    dfg_time_matrix = np.zeros([no_intervals, no_act, no_act], dtype=int)
    interval_timing = []
    no_events_sums = 0
    no_events_logs = 0  # never incremented below; printed as 0 at the end
    no_dfs = 0
    for i in range(0, no_intervals):
        print('Interval ', i, '/', no_intervals)
        lower_bound = min(timestamps) + i * interval_length
        if i == (no_intervals - 1):
            # last window is widened so no trailing event is dropped
            upper_bound = min(timestamps) + (i + 1) * interval_length * 2
        else:
            upper_bound = min(timestamps) + (i + 1) * interval_length
        lb = lower_bound
        ub = upper_bound
        print(lb)
        print(ub)
        dfs = []
        empty_mat = np.zeros([no_act, no_act], dtype=float)
        filtered_events = {}
        start = Event()
        end = Event()
        start['concept:name'] = str(act_map['start'])
        end['concept:name'] = str(act_map['end'])
        # sentinels for tracking the actual timestamp range in this window
        highest = datetime(1970, 1, 1, tzinfo=pytz.UTC)
        lowest = datetime(2050, 1, 1, tzinfo=pytz.UTC)
        count = 0  # NOTE(review): unused
        # keep the DF pairs whose *second* event lies inside the window
        for df in sorted_aps:
            if ub > df.event2[
                    'time:timestamp'] >= lb:  # and ub > df.event['time:timestamp'] >= lb:
                dfs.append(df)
        no_dfs += len(dfs)
        # group the window's DF pairs by originating trace
        log_dfs = {}
        for df in dfs:
            if df.trace_no not in log_dfs.keys():
                log_dfs[df.trace_no] = []
            log_dfs[df.trace_no].append(df)
        # rebuild each trace's event list: first event of every pair plus
        # the second event of the last pair
        for trace_no, dfss in log_dfs.items():
            # print('\nTrace:', trace_no)
            sorted_dfs = sorted(dfss)
            filtered_events[trace_no] = []
            for df in sorted_dfs:
                # print(df)
                filtered_events[trace_no].append(df.event)
                no_events_sums += 1
            filtered_events[trace_no].append(sorted_dfs[len(sorted_dfs) -
                                                        1].event2)
            no_events_sums += 1
        print('#traces:', len(log_dfs))
        # artificial start/end arcs for every per-trace fragment
        for trace_no, events in filtered_events.items():
            empty_mat[act_map['start'],
                      act_map[events[0]['concept:name']]] += 1
            empty_mat[act_map[events[-1]['concept:name']],
                      act_map['end']] += 1
        # Export filtered events to interval event logs
        new_log = EventLog()
        no_eve = 0
        for t, trace in enumerate(log):
            new_trace = Trace()
            # new_trace.append(start)
            for trace_no, events in filtered_events.items():
                if t == trace_no:
                    for event in trace:
                        # keep only events selected for this window,
                        # preserving the original trace order
                        if event in events:
                            if event['time:timestamp'] < lowest:
                                lowest = event['time:timestamp']
                            if event['time:timestamp'] > highest:
                                highest = event['time:timestamp']
                            new_event = Event()
                            new_event['concept:name'] = str(
                                act_map[event['concept:name']])
                            new_trace.append(new_event)
                            no_events_sums += 1
                            no_eve += 1
            if len(new_trace) > 0:
                # new_trace.append(end)
                new_log.append(new_trace)
        exporter.apply(
            new_log, './logs/' + dataset + '_log_interval_' + str(i) + '-' +
            str(no_intervals) + '_equitemp.xes')
        # print('no eve:', no_eve)
        # count the window's directly-follows arcs
        for act_pair in dfs:
            a1 = act_map[act_pair.a1]
            a2 = act_map[act_pair.a2]
            empty_mat[a1, a2] += 1
        dfg_time_matrix[i] = empty_mat
        interval_timing.append((lowest, highest))
    print('Event sums:', no_events_sums)
    print('Event logs:', no_events_logs)
    print('#DFS:', no_dfs)
    return dfg_time_matrix, interval_timing
def inductive_miner(log, dfg, threshold, root, act_key, use_msd):
    """One recursion step of inductive-miner style process-tree discovery.

    If the log has no empty traces it tries, in this order: base cases,
    sequence cut, exclusive-choice (XOR) cut, concurrency cut, loop cut.
    Empty traces are factored out as XOR(tau, non-empty part).  When no cut
    applies it falls back to single-activity handling, tau-loop detection
    and finally a flower model over the alphabet.

    :param log: event log for this subtree
    :param dfg: directly-follows graph of `log`
    :param threshold: noise threshold forwarded to the recursive calls
    :param root: parent process tree node the new operator is attached to
    :param act_key: event attribute used as the activity label
    :param use_msd: if True, derive minimum-self-distance witnesses to
        refine the concurrency cut
    :return: a pt.ProcessTree covering the behaviour of `log`
    """
    alphabet = pm4py.get_attribute_values(log, act_key)
    start_activities = get_starters.get_start_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
    end_activities = get_ends.get_end_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
    empty_traces = pm4py.filter_log(lambda trace: len(trace) == 0, log)
    if len(empty_traces) == 0:
        # base cases: single activity or only silent behaviour
        if _is_base_case_act(log, act_key) or _is_base_case_silent(log):
            return _apply_base_case(log, root, act_key)
        pre, post = dfg_utils.get_transitive_relations(dfg, alphabet)
        # 1) sequence cut on the transitive pre/post relations
        cut = sequence_cut.detect(alphabet, pre, post)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.SEQUENCE, root), threshold,
                act_key, sequence_cut.project(log, cut, act_key), use_msd)
        # 2) exclusive-choice cut
        cut = xor_cut.detect(dfg, alphabet)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.XOR, root), threshold, act_key,
                xor_cut.project(log, cut, act_key), use_msd)
        # 3) concurrency cut, optionally refined with minimum-self-distance
        # witnesses (only computed when use_msd is set)
        cut = concurrent_cut.detect(
            dfg, alphabet, start_activities, end_activities,
            msd=msdw_algo.derive_msd_witnesses(
                log,
                msd_algo.apply(log, parameters={
                    constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key
                }),
                parameters={
                    constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key
                }) if use_msd else None)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.PARALLEL, root), threshold,
                act_key, concurrent_cut.project(log, cut, act_key), use_msd)
        # 4) loop cut
        cut = loop_cut.detect(dfg, alphabet, start_activities, end_activities)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.LOOP, root), threshold, act_key,
                loop_cut.project(log, cut, act_key), use_msd)
    # empty traces present: split into XOR(tau, non-empty sub-log)
    if len(empty_traces) > 0:
        nempty = pm4py.filter_log(lambda t: len(t) > 0, log)
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.XOR, root),
                                       threshold, act_key,
                                       [EventLog(), nempty], use_msd)
    # fall-through 1: an activity occurring exactly once per trace can be
    # put in parallel with the rest
    aopt = activity_once_per_trace.detect(log, alphabet, act_key)
    if aopt is not None:
        operator = pt.ProcessTree(operator=pt.Operator.PARALLEL, parent=root)
        operator.children.append(
            pt.ProcessTree(operator=None, parent=operator, label=aopt))
        return _add_operator_recursive(
            operator, threshold, act_key,
            activity_once_per_trace.project(log, aopt, act_key), use_msd)
    # fall-through 2: a single activity concurrent with everything else
    act_conc = activity_concurrent.detect(log, alphabet, act_key, use_msd)
    if act_conc is not None:
        return _add_operator_recursive(
            pt.ProcessTree(pt.Operator.PARALLEL, root), threshold, act_key,
            activity_concurrent.project(log, act_conc, act_key), use_msd)
    # fall-through 3: strict tau loop (redo part is empty)
    stl = strict_tau_loop.detect(log, start_activities, end_activities,
                                 act_key)
    if stl is not None:
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.LOOP, root),
                                       threshold, act_key, [stl, EventLog()],
                                       use_msd)
    # fall-through 4: general tau loop
    tl = tau_loop.detect(log, start_activities, act_key)
    if tl is not None:
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.LOOP, root),
                                       threshold, act_key, [tl, EventLog()],
                                       use_msd)
    # nothing applies: flower model allowing any order of the alphabet
    return _flower(alphabet, root)
agg_type = 'equisize' no_pairs = 0 horizon = 25 no_intervals = 75 no_folds = 10 no_intervals_all = 100 # Parameters ############ variant = xes_importer.Variants.ITERPARSE paras = {variant.value.Parameters.MAX_TRACES: 1000000000} log = xes_importer.apply(dataset + '.xes', parameters=paras) # read and encode data activity_names = pm4py.get_attribute_values(log, 'concept:name') no_act = len(activity_names) act_map = {} reverse_map = {} for a, value in enumerate(activity_names.keys()): act_map[value] = a reverse_map[a] = value # add start and end points for DFGs act_map['start'] = no_act act_map['end'] = no_act + 1 reverse_map[no_act] = 'start' reverse_map[no_act + 1] = 'end' no_act += 2 print('Activity encoding:', act_map)