def create_process_structure(bpmn, drawing=False, verbose=True):
    # Load the BPMN structure into a directed graph
    g = load_process_structure(bpmn, verbose)
    if drawing:
        graph_network_x(g)
    if verbose:
        sup.print_done_task()
    return g
def get_xes_events_data(self):
    """ reads and parses all the events information from an xes file """
    log = pm4py.read_xes(self.input)
    try:
        source = log.attributes['source']
    except KeyError:
        source = ''
    # Flatten the traces: every event inherits the case id of its trace
    flattened_log = [{**event,
                      **{'caseid': trace.attributes['concept:name']}}
                     for trace in log for event in trace]
    temp_data = pd.DataFrame(flattened_log)
    # Normalise the timestamp column to the configured time format
    temp_data['time:timestamp'] = temp_data.apply(
        lambda x: x['time:timestamp'].strftime(self.timeformat), axis=1)
    temp_data['time:timestamp'] = pd.to_datetime(
        temp_data['time:timestamp'], format=self.timeformat)
    temp_data.rename(columns={'concept:name': 'task',
                              'lifecycle:transition': 'event_type',
                              'org:resource': 'user',
                              'time:timestamp': 'timestamp'}, inplace=True)
    # Drop artificial start/end activities and keep only the
    # start/complete lifecycle transitions
    temp_data = (
        temp_data[~temp_data.task.isin(['Start', 'End', 'start', 'end'])]
        .reset_index(drop=True))
    temp_data = (
        temp_data[temp_data.event_type.isin(['start', 'complete'])]
        .reset_index(drop=True))
    # Logs generated by the QBP simulator keep only Task elements
    if source == 'com.qbpsimulator':
        if len(temp_data.iloc[0].elementId.split('_')) > 1:
            temp_data['etype'] = temp_data.apply(
                lambda x: x.elementId.split('_')[0], axis=1)
            temp_data = (temp_data[temp_data.etype == 'Task']
                         .reset_index(drop=True))
    self.raw_data = temp_data.to_dict('records')
    if self.verbose:
        sup.print_performed_task('Rearranging log traces ')
    self.data = self.reorder_xes(temp_data)
    self.data = pd.DataFrame(self.data)
    self.data.drop_duplicates(inplace=True)
    self.data = self.data.to_dict('records')
    self.append_csv_start_end()
    if self.verbose:
        sup.print_done_task()
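# A minimal, standalone sketch of the flattening step used in
# get_xes_events_data: each event inherits the 'concept:name' of its parent
# trace as 'caseid' before the log is turned into a DataFrame. The toy trace
# structure below is an illustrative assumption, not the real pm4py objects.
def _example_flatten_traces():
    import pandas as pd

    # Toy stand-in for an XES log: (trace attributes, events) pairs
    toy_log = [
        ({'concept:name': 'case_1'},
         [{'concept:name': 'A', 'org:resource': 'u1'},
          {'concept:name': 'B', 'org:resource': 'u2'}]),
        ({'concept:name': 'case_2'},
         [{'concept:name': 'A', 'org:resource': 'u1'}]),
    ]
    flat = [{**event, 'caseid': attrs['concept:name']}
            for attrs, events in toy_log for event in events]
    return pd.DataFrame(flat)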
def discover_roles(self):
    associations = lambda x: (self.tasks[x['task']], self.users[x['user']])
    self.data['ac_rl'] = self.data.apply(associations, axis=1)

    # Frequency of each (activity, resource) association
    freq_matrix = (self.data.groupby(by='ac_rl')['task'].count()
                   .reset_index()
                   .rename(columns={'task': 'freq'}))
    freq_matrix = {x['ac_rl']: x['freq']
                   for x in freq_matrix.to_dict('records')}

    profiles = self.build_profile(freq_matrix)
    sup.print_progress(20, 'Analysing resource pool ')
    # Build a correlation matrix between resource profiles
    correl_matrix = self.det_correl_matrix(profiles)
    sup.print_progress(40, 'Analysing resource pool ')
    # Create a relation network between resources
    g = nx.Graph()
    for user in self.users.values():
        g.add_node(user)
    for rel in correl_matrix:
        # Create edges between nodes, excluding self-relations
        # and those below the similarity threshold
        if rel['distance'] > self.sim_threshold and rel['x'] != rel['y']:
            g.add_edge(rel['x'], rel['y'], weight=rel['distance'])
    sup.print_progress(60, 'Analysing resource pool ')
    # Extract the connected components of the network as roles
    sub_graphs = list(nx.connected_components(g))
    sup.print_progress(80, 'Analysing resource pool ')
    # Role definition from the graph components
    roles = self.role_definition(sub_graphs)
    # Plot creation (optional)
    # if drawing:
    #     graph_network(g, sub_graphs)
    sup.print_progress(100, 'Analysing resource pool ')
    sup.print_done_task()
    return roles
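# A simplified, self-contained sketch of the role-discovery idea implemented
# above: build an activity-frequency profile per resource, connect resources
# whose profiles correlate above a similarity threshold, and read each
# connected component as a role. The profile values and the 0.7 threshold are
# illustrative assumptions; the real implementation delegates to
# build_profile, det_correl_matrix and role_definition.
def _example_discover_roles_sketch():
    from itertools import combinations
    import numpy as np
    import networkx as nx

    # Toy activity-frequency profiles (one row per resource)
    profiles = {
        'u1': np.array([5, 0, 1]),
        'u2': np.array([4, 1, 0]),
        'u3': np.array([0, 6, 2]),
    }
    sim_threshold = 0.7
    g = nx.Graph()
    g.add_nodes_from(profiles)
    for (a, pa), (b, pb) in combinations(profiles.items(), 2):
        similarity = np.corrcoef(pa, pb)[0, 1]
        if similarity > sim_threshold:
            g.add_edge(a, b, weight=similarity)
    # Each connected component groups resources into one role
    return [sorted(component) for component in nx.connected_components(g)]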
def get_csv_events_data(self):
    """ reads and parses all the events information from a csv file """
    if self.verbose:
        sup.print_performed_task('Reading log traces ')
    log = pd.read_csv(self.input)
    if self.one_timestamp:
        self.column_names['Complete Timestamp'] = 'end_timestamp'
        log = log.rename(columns=self.column_names)
        log = log.astype({'caseid': object})
        log = (log[(log.task != 'Start') & (log.task != 'End')]
               .reset_index(drop=True))
        if self.filter_d_attrib:
            log = log[['caseid', 'task', 'user', 'end_timestamp']]
        log['end_timestamp'] = pd.to_datetime(log['end_timestamp'],
                                              format=self.timeformat)
    else:
        self.column_names['Start Timestamp'] = 'start_timestamp'
        self.column_names['Complete Timestamp'] = 'end_timestamp'
        log = log.rename(columns=self.column_names)
        log = log.astype({'caseid': object})
        log = (log[(log.task != 'Start') & (log.task != 'End')]
               .reset_index(drop=True))
        if self.filter_d_attrib:
            log = log[['caseid', 'task', 'user',
                       'start_timestamp', 'end_timestamp']]
        log['start_timestamp'] = pd.to_datetime(log['start_timestamp'],
                                                format=self.timeformat)
        log['end_timestamp'] = pd.to_datetime(log['end_timestamp'],
                                              format=self.timeformat)
    self.data = log.to_dict('records')
    self.append_csv_start_end()
    self.split_event_transitions()
    if self.verbose:
        sup.print_done_task()
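# The CSV reader above relies on self.column_names to map raw headers onto
# the internal schema before parsing timestamps. A minimal sketch of that
# mapping and the rename/parse steps is shown below; the header names, the
# file name 'event_log.csv' and the timestamp format are illustrative
# assumptions about the input, not requirements of the reader.
def _example_read_csv_log():
    import pandas as pd

    column_names = {
        'Case ID': 'caseid',
        'Activity': 'task',
        'Resource': 'user',
        'Complete Timestamp': 'end_timestamp',
    }
    log = pd.read_csv('event_log.csv')
    log = log.rename(columns=column_names).astype({'caseid': object})
    log['end_timestamp'] = pd.to_datetime(log['end_timestamp'],
                                          format='%Y-%m-%d %H:%M:%S')
    return log.to_dict('records')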
def _predict_next_event_shared_cat(self, parameters, vectorizer):
    """Predict the next event of business process traces using a keras
    trained model.

    Args:
        parameters (dict): model dimensions and execution parameters.
        vectorizer (str): type of input vectorization ('basic' or 'inter').
    """
    # Generation of predictions
    results = list()
    for i, _ in enumerate(self.spl['prefixes']['activities']):
        # Activities and roles input, shape (1, time_dim)
        x_ac_ngram = (np.append(
            np.zeros(parameters['dim']['time_dim']),
            np.array(self.spl['prefixes']['activities'][i]),
            axis=0)[-parameters['dim']['time_dim']:].reshape(
                (1, parameters['dim']['time_dim'])))
        x_rl_ngram = (np.append(
            np.zeros(parameters['dim']['time_dim']),
            np.array(self.spl['prefixes']['roles'][i]),
            axis=0)[-parameters['dim']['time_dim']:].reshape(
                (1, parameters['dim']['time_dim'])))
        # Times input, shape (1, time_dim, times_attr_num)
        times_attr_num = self.spl['prefixes']['times'][i].shape[1]
        x_t_ngram = np.array([
            np.append(np.zeros(
                (parameters['dim']['time_dim'], times_attr_num)),
                self.spl['prefixes']['times'][i],
                axis=0)[-parameters['dim']['time_dim']:].reshape(
                    (parameters['dim']['time_dim'], times_attr_num))])
        # Add inter-case features if necessary
        if vectorizer in ['basic']:
            inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram]
        elif vectorizer in ['inter']:
            # Inter-case attributes input, shape (1, time_dim, inter_attr_num)
            inter_attr_num = self.spl['prefixes']['inter_attr'][i].shape[1]
            x_inter_ngram = np.array([
                np.append(np.zeros(
                    (parameters['dim']['time_dim'], inter_attr_num)),
                    self.spl['prefixes']['inter_attr'][i],
                    axis=0)[-parameters['dim']['time_dim']:].reshape(
                        (parameters['dim']['time_dim'], inter_attr_num))])
            inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram, x_inter_ngram]
        # Predict
        preds = self.model.predict(inputs)
        if self.imp == 'random_choice':
            # Sample the next activity and role using the predicted
            # probabilities as a distribution
            pos = np.random.choice(np.arange(0, len(preds[0][0])),
                                   p=preds[0][0])
            pos1 = np.random.choice(np.arange(0, len(preds[1][0])),
                                    p=preds[1][0])
        elif self.imp == 'arg_max':
            # Take the most likely prediction
            pos = np.argmax(preds[0][0])
            pos1 = np.argmax(preds[1][0])
        # Save results
        predictions = [pos, pos1, preds[2][0][0]]
        if not parameters['one_timestamp']:
            predictions.extend([preds[2][0][1]])
        results.append(
            self.create_result_record(i, self.spl, predictions, parameters))
    sup.print_done_task()
    return results
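# A small sketch of the two selection modes used above: 'random_choice'
# samples the next index from the predicted probability vector, while
# 'arg_max' always takes the most likely one. The probability values are
# illustrative stand-ins for preds[0][0] returned by model.predict.
def _example_next_event_selection(imp='random_choice'):
    import numpy as np

    probs = np.array([0.1, 0.6, 0.3])  # illustrative softmax output
    if imp == 'random_choice':
        return int(np.random.choice(np.arange(len(probs)), p=probs))
    return int(np.argmax(probs))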
def _predict_suffix_shared_cat(self, parms, vectorizer):
    """Generate business process suffixes using a keras trained model.

    Args:
        parms (dict): model dimensions and execution parameters.
        vectorizer (str): type of input vectorization ('basic' or 'inter').
    """
    # Generation of predictions
    results = list()
    for i, _ in enumerate(self.spl['prefixes']['activities']):
        # Activities and roles input, shape (1, time_dim)
        x_ac_ngram = (np.append(
            np.zeros(parms['dim']['time_dim']),
            np.array(self.spl['prefixes']['activities'][i]),
            axis=0)[-parms['dim']['time_dim']:].reshape(
                (1, parms['dim']['time_dim'])))
        x_rl_ngram = (np.append(
            np.zeros(parms['dim']['time_dim']),
            np.array(self.spl['prefixes']['roles'][i]),
            axis=0)[-parms['dim']['time_dim']:].reshape(
                (1, parms['dim']['time_dim'])))
        # Times input, shape (1, time_dim, times_attr_num)
        times_attr_num = self.spl['prefixes']['times'][i].shape[1]
        x_t_ngram = np.array([
            np.append(np.zeros((parms['dim']['time_dim'], times_attr_num)),
                      self.spl['prefixes']['times'][i],
                      axis=0)[-parms['dim']['time_dim']:].reshape(
                          (parms['dim']['time_dim'], times_attr_num))])
        if vectorizer in ['basic']:
            inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram]
        elif vectorizer in ['inter']:
            # Inter-case attributes input, shape (1, time_dim, inter_attr_num)
            inter_attr_num = self.spl['prefixes']['inter_attr'][i].shape[1]
            x_inter_ngram = np.array([
                np.append(np.zeros(
                    (parms['dim']['time_dim'], inter_attr_num)),
                    self.spl['prefixes']['inter_attr'][i],
                    axis=0)[-parms['dim']['time_dim']:].reshape(
                        (parms['dim']['time_dim'], inter_attr_num))])
            inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram, x_inter_ngram]
        pref_size = len(self.spl['prefixes']['activities'][i])
        acum_dur, acum_wait = list(), list()
        ac_suf, rl_suf = list(), list()
        for _ in range(1, self.max_trace_size):
            preds = self.model.predict(inputs)
            if self.imp == 'random_choice':
                # Sample the next activity and role using the predicted
                # probabilities as a distribution
                pos = np.random.choice(np.arange(0, len(preds[0][0])),
                                       p=preds[0][0])
                pos1 = np.random.choice(np.arange(0, len(preds[1][0])),
                                        p=preds[1][0])
            elif self.imp == 'arg_max':
                # Take the most likely prediction
                pos = np.argmax(preds[0][0])
                pos1 = np.argmax(preds[1][0])
            # Slide the n-gram windows: append the new prediction and
            # drop the oldest position
            x_ac_ngram = np.append(x_ac_ngram, [[pos]], axis=1)
            x_ac_ngram = np.delete(x_ac_ngram, 0, 1)
            x_rl_ngram = np.append(x_rl_ngram, [[pos1]], axis=1)
            x_rl_ngram = np.delete(x_rl_ngram, 0, 1)
            x_t_ngram = np.append(x_t_ngram, [preds[2]], axis=1)
            x_t_ngram = np.delete(x_t_ngram, 0, 1)
            if vectorizer in ['basic']:
                inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram]
            elif vectorizer in ['inter']:
                x_inter_ngram = np.append(x_inter_ngram, [preds[3]], axis=1)
                x_inter_ngram = np.delete(x_inter_ngram, 0, 1)
                inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram, x_inter_ngram]
            # Accumulate the suffix and stop if the next prediction is the
            # end of the trace; otherwise continue up to max_trace_size
            ac_suf.append(pos)
            rl_suf.append(pos1)
            acum_dur.append(preds[2][0][0])
            if not parms['one_timestamp']:
                acum_wait.append(preds[2][0][1])
            if parms['index_ac'][pos] == 'end':
                break
        # Save results
        predictions = [ac_suf, rl_suf, acum_dur]
        if not parms['one_timestamp']:
            predictions.extend([acum_wait])
        results.append(
            self.create_result_record(i, self.spl, predictions, parms,
                                      pref_size))
    sup.print_done_task()
    return results
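# A minimal sketch of the sliding-window update performed inside the suffix
# loop above: the newly predicted index is appended to the n-gram and the
# oldest position is dropped, so the model input keeps a fixed length of
# time_dim. The values below are illustrative.
def _example_slide_ngram_window():
    import numpy as np

    time_dim = 5
    x_ac_ngram = np.zeros((1, time_dim))      # padded prefix of activities
    predicted_index = 3                       # e.g. pos from the model output
    x_ac_ngram = np.append(x_ac_ngram, [[predicted_index]], axis=1)
    x_ac_ngram = np.delete(x_ac_ngram, 0, 1)  # drop the oldest position
    return x_ac_ngram                         # shape stays (1, time_dim)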