def __init__(self, dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None):
    """
    Initializes the streaming view over the dataframe.

    Parameters
    --------------
    dataframe
        Pandas dataframe containing (at least) case ID, activity and timestamp columns
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY => the case identifier column
        - Parameters.ACTIVITY_KEY => the activity column
        - Parameters.TIMESTAMP_KEY => the timestamp column
        - Parameters.INDEX_KEY => the name of the progressive-index column
    """
    if parameters is None:
        parameters = {}
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, constants.DEFAULT_INDEX_KEY)
    if not (hasattr(dataframe, "attrs") and dataframe.attrs):
        # dataframe has not been initialized through format_dataframe:
        # add a progressive index and sort by (case, timestamp, index)
        dataframe = pandas_utils.insert_index(dataframe, index_key)
        # BUGFIX: sort_values returns a new dataframe; the original code
        # discarded the result, so the fallback path never sorted at all
        dataframe = dataframe.sort_values([case_id_key, timestamp_key, index_key])
    cases = dataframe[case_id_key].to_numpy()
    # columnar views used during iteration
    self.activities = dataframe[activity_key].to_numpy()
    self.timestamps = dataframe[timestamp_key].to_numpy()
    # per-case segmentation: unique case IDs, the index of each case's first
    # row, and the number of events per case (requires the sort above)
    self.c_unq, self.c_ind, self.c_counts = np.unique(cases, return_index=True, return_counts=True)
    self.no_traces = len(self.c_ind)
    # cursor for the iteration protocol
    self.i = 0
def apply(df, parameters=None):
    """
    Discovers a footprint object from a dataframe
    (the footprints of the dataframe are returned)

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    --------------
    footprints_obj
        Footprints object (dictionary with DFG, sequence/parallel relations,
        activities, start/end activities and minimum trace length)
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    caseid_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    # NOTE(review): the start timestamp is read with Parameters.TIMESTAMP_KEY
    # (same key as below, default None) — this looks like it should be
    # Parameters.START_TIMESTAMP_KEY; verify against the Parameters enum.
    # As written, start_timestamp_key is only non-None when the caller
    # overrides the (complete) timestamp key, in which case both are equal.
    start_timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    sort_required = exec_utils.get_param_value(Parameters.SORT_REQUIRED, parameters, DEFAULT_SORT_REQUIRED)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, DEFAULT_INDEX_KEY)
    # keep only the columns needed by the discovery
    df = df[[caseid_key, activity_key, timestamp_key]]
    if sort_required:
        # progressive index keeps the sort stable for equal timestamps
        df = pandas_utils.insert_index(df, index_key)
        if start_timestamp_key is not None:
            df = df.sort_values([caseid_key, start_timestamp_key, timestamp_key, index_key])
        else:
            df = df.sort_values([caseid_key, timestamp_key, index_key])
    grouped_df = df.groupby(caseid_key)
    # directly-follows graph (frequency counts); sorting already done above
    dfg = df_statistics.get_dfg_graph(df, measure="frequency", activity_key=activity_key,
                                      case_id_glue=caseid_key, timestamp_key=timestamp_key,
                                      sort_caseid_required=False,
                                      sort_timestamp_along_case_id=False,
                                      start_timestamp_key=start_timestamp_key)
    activities = set(df[activity_key].unique())
    start_activities = set(grouped_df.first()[activity_key].unique())
    end_activities = set(grouped_df.last()[activity_key].unique())
    # two activities are parallel if they directly follow each other in both directions
    parallel = {(x, y) for (x, y) in dfg if (y, x) in dfg}
    # causal (sequence) relations extracted with the alpha causal relation
    sequence = set(causal_discovery.apply(dfg, causal_discovery.Variants.CAUSAL_ALPHA))
    ret = {}
    ret[Outputs.DFG.value] = dfg
    ret[Outputs.SEQUENCE.value] = sequence
    ret[Outputs.PARALLEL.value] = parallel
    ret[Outputs.ACTIVITIES.value] = activities
    ret[Outputs.START_ACTIVITIES.value] = start_activities
    ret[Outputs.END_ACTIVITIES.value] = end_activities
    ret[Outputs.MIN_TRACE_LENGTH.value] = int(grouped_df.size().min())
    return ret
def format_dataframe(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAME,
                     activity_key: str = xes_constants.DEFAULT_NAME_KEY,
                     timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
                     timest_format: Optional[str] = None) -> pd.DataFrame:
    """
    Give the appropriate format on the dataframe, for process mining purposes

    Parameters
    --------------
    df
        Dataframe
    case_id
        Case identifier column
    activity_key
        Activity column
    timestamp_key
        Timestamp column
    timest_format
        Timestamp format that is provided to Pandas

    Returns
    --------------
    df
        Dataframe
    """
    from pm4py.objects.log.util import dataframe_utils

    # the three mandatory columns must all be present before doing anything
    for column, role in ((case_id, "case ID"), (activity_key, "activity"),
                         (timestamp_key, "timestamp")):
        if column not in df.columns:
            raise Exception(column + " column (" + role + ") is not in the dataframe!")

    # map the user-provided columns onto the standard pm4py column names
    standard_names = {case_id: constants.CASE_CONCEPT_NAME,
                      activity_key: xes_constants.DEFAULT_NAME_KEY,
                      timestamp_key: xes_constants.DEFAULT_TIMESTAMP_KEY}
    df = df.rename(columns=standard_names)
    # the case identifier is always treated as a string
    df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(str)
    # makes sure that the timestamp column is of timestamp type
    df = dataframe_utils.convert_timestamp_columns_in_df(
        df, timest_format=timest_format,
        timest_columns=[xes_constants.DEFAULT_TIMESTAMP_KEY])
    # set an index column, then sort by case / timestamp / index
    # (the index keeps the ordering stable for equal timestamps)
    df = pandas_utils.insert_index(df, INDEX_COLUMN)
    ordering = [constants.CASE_CONCEPT_NAME,
                xes_constants.DEFAULT_TIMESTAMP_KEY,
                INDEX_COLUMN]
    return df.sort_values(ordering)
def A_next_B_next_C(df0, A, B, C, parameters=None):
    """
    Applies the A next B next C rule

    Parameters
    ------------
    df0
        Dataframe
    A
        A Attribute value
    B
        B Attribute value
    C
        C Attribute value
    parameters
        Parameters of the algorithm, including the attribute key and the positive parameter:
        - If True, returns all the cases containing A, B and C and in which A was directly
          followed by B and B was directly followed by C
        - If False, returns all the cases not containing A or B or C, or in which none
          instance of A was directly followed by an instance of B and B was directly
          followed by C

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    case_col = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    attr_col = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    keep_matching = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    # work on a reduced copy carrying only the case ID and the attribute,
    # enriched with a progressive event index
    reduced = df0.copy()
    reduced = reduced[[case_col, attr_col]]
    reduced = pandas_utils.insert_index(reduced)

    occ_a = reduced[reduced[attr_col] == A].copy()
    occ_b = reduced[reduced[attr_col] == B].copy()
    occ_c = reduced[reduced[attr_col] == C].copy()

    # keep the last B / last C per case, indexed by the case identifier
    occ_b["@@conceptname"] = occ_b[case_col]
    occ_b = occ_b.groupby(case_col).last().set_index("@@conceptname")
    occ_c["@@conceptname"] = occ_c[case_col]
    occ_c = occ_c.groupby(case_col).last().set_index("@@conceptname")

    # attach the B and C occurrences of the same case to each A occurrence
    joined = occ_a.join(occ_b, on=case_col, rsuffix="_2").dropna()
    joined = joined.join(occ_c, on=case_col, rsuffix="_3").dropna()

    # "directly followed" means the event indexes are consecutive
    idx = constants.DEFAULT_INDEX_KEY
    joined["@@diffindex"] = joined[idx + "_2"] - joined[idx]
    joined["@@diffindex2"] = joined[idx + "_3"] - joined[idx + "_2"]
    joined = joined[joined["@@diffindex"] == 1]
    joined = joined[joined["@@diffindex2"] == 1]

    all_cases = reduced.set_index(case_col).index
    matched_cases = joined.set_index(case_col).index
    mask = all_cases.isin(matched_cases)
    return df0[mask] if keep_matching else df0[~mask]
def directly_follows_dataframe(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None):
    """
    Calculates the directly-follows dataframe (internal usage)

    Each returned row pairs an event with the next event of the same case:
    the original columns plus the same columns suffixed with "_2".

    Parameters
    --------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.TIMESTAMP_KEY => the timestamp column
        - Parameters.CASE_ID_KEY => the case identifier column
        - Parameters.INDEX_KEY => the name of the progressive-index column

    Returns
    --------------
    dataframe
        Directly-follows dataframe
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                             constants.CASE_CONCEPT_NAME)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters,
                                           constants.DEFAULT_INDEX_KEY)
    if not (hasattr(dataframe, "attrs") and dataframe.attrs):
        # dataframe has not been initialized through format_dataframe
        dataframe = pandas_utils.insert_index(dataframe, index_key)
        # BUGFIX: sort_values returns a new dataframe; the original code
        # discarded the result, so this fallback path never sorted at all
        dataframe = dataframe.sort_values([case_id_key, timestamp_key, index_key])
    # re-number the events after the (possible) sorting
    dataframe = pandas_utils.insert_index(dataframe, index_key)
    insert_parameters = copy(parameters)
    # extreme timestamps keep the artificial start/end events at the borders of each case
    insert_parameters["use_extremes_timestamp"] = True
    dataframe = dataframe_utils.insert_artificial_start_end(
        dataframe, parameters=insert_parameters)
    # pair each row with the following one; rows crossing a case boundary
    # are removed by the equality filter below
    df_shifted = dataframe.shift(-1)
    df_shifted.columns = [x + "_2" for x in df_shifted.columns]
    dataframe = pd.concat([dataframe, df_shifted], axis=1)
    dataframe = dataframe[dataframe[case_id_key] == dataframe[case_id_key + "_2"]]
    return dataframe
def occu_suc(dfg, filter_percent):
    """
    Keeps only the most frequent directly-follows relationships.

    Parameters
    ------------
    dfg
        A counter containing all the direct succession relationships with frequency
    filter_percent
        The percentage of direct succession relationships one wants to preserve

    Returns
    ------------
    df
        Dataframe of direct succession relationships with frequency
        (columns: 'suc' = the relationship, 'freq' = its frequency)
    """
    df = pd.DataFrame.from_dict(dict(dfg), orient='index', columns=['freq'])
    # most frequent relationships first
    df = df.sort_values(axis=0, by=['freq'], ascending=False)
    df = df.reset_index().rename(columns={'index': 'suc'})
    # delete duplicated successions
    df = df.drop_duplicates('suc', keep='first')
    # keep only the top `filter_percent` share of the (descending-frequency) rows;
    # renamed from `filter` to avoid shadowing the builtin
    keep_positions = list(range(0, round(filter_percent * len(df))))
    df = pandas_utils.insert_index(df)
    df = df[df[constants.DEFAULT_INDEX_KEY].isin(keep_positions)].reset_index(drop=True)
    return df
def A_eventually_B(df0, A, B, parameters=None):
    """
    Applies the A eventually B rule

    Parameters
    ------------
    df0
        Dataframe
    A
        A Attribute value
    B
        B Attribute value
    parameters
        Parameters of the algorithm, including the attribute key and the positive parameter:
        - If True, returns all the cases containing A and B and in which A was eventually followed by B
        - If False, returns all the cases not containing A or B, or in which an instance of A was not
          eventually followed by an instance of B

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, False)
    timestamp_diff_boundaries = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_BOUNDARIES, parameters, [])
    # the timestamp column is only carried along when the time-based filter is enabled
    colset = [case_id_glue, attribute_key]
    if enable_timestamp:
        colset.append(timestamp_key)
    # reduced working copy with a progressive event index
    df = df0.copy()
    df = df[colset]
    df = pandas_utils.insert_index(df)
    df_A = df[df[attribute_key] == A].copy()
    df_B = df[df[attribute_key] == B].copy()
    # keep the last B occurrence per case, indexed by the case identifier
    # NOTE(review): using only the LAST B per case means A only needs to
    # precede the last B occurrence — presumably the intended semantics here
    df_B["@@conceptname"] = df_B[case_id_glue]
    df_B = df_B.groupby(case_id_glue).last().set_index("@@conceptname")
    # attach the last B of the same case to each A occurrence
    df_join = df_A.join(df_B, on=case_id_glue, rsuffix="_2").dropna()
    # "eventually followed" means B's event index is strictly greater than A's
    df_join["@@diffindex"] = df_join[constants.DEFAULT_INDEX_KEY+"_2"] - df_join[constants.DEFAULT_INDEX_KEY]
    df_join = df_join[df_join["@@diffindex"] > 0]
    if enable_timestamp:
        # seconds elapsed between A and B
        df_join["@@difftimestamp"] = (df_join[timestamp_key + "_2"] - df_join[timestamp_key]).astype('timedelta64[s]')
        if timestamp_diff_boundaries:
            # keep only pairs whose elapsed time falls inside [lower, upper]
            df_join = df_join[df_join["@@difftimestamp"] >= timestamp_diff_boundaries[0][0]]
            df_join = df_join[df_join["@@difftimestamp"] <= timestamp_diff_boundaries[0][1]]
    # select (or exclude) the cases that contain at least one surviving A->B pair
    i1 = df.set_index(case_id_glue).index
    i2 = df_join.set_index(case_id_glue).index
    if positive:
        return df0[i1.isin(i2)]
    else:
        return df0[~i1.isin(i2)]
def eventually_follows(df0, attribute_values, parameters=None):
    """
    Applies the eventually follows rule

    Parameters
    ------------
    df0
        Dataframe
    attribute_values
        A list of attribute_values
        attribute_values[n] follows attribute_values[n-1] follows ... follows attribute_values[0]
    parameters
        Parameters of the algorithm, including the attribute key and the positive parameter:
        - If True, returns all the cases containing all attribute_values and in which
          attribute_values[i] was eventually followed by attribute_values[i + 1]
        - If False, returns all the cases not containing all attribute_values, or in which
          an instance of attribute_values[i] was not eventually followed by an instance of
          attribute_values[i + 1]

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, False)
    timestamp_diff_boundaries = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_BOUNDARIES, parameters, [])
    # the timestamp column is only carried along when the time-based filter is enabled
    colset = [case_id_glue, attribute_key]
    if enable_timestamp:
        colset.append(timestamp_key)
    # reduced working copy with a progressive event index
    df = df0.copy()
    df = df[colset]
    df = pandas_utils.insert_index(df)
    # one dataframe per attribute value, in the required order
    df_a = [df[df[attribute_key] == attribute_value].copy() for attribute_value in attribute_values]
    # after these merges, the columns of df_a[0] carry NO suffix,
    # while the columns of df_a[i] (i >= 1) carry the suffix "_i"
    df_join = df_a[0].merge(df_a[1], on=case_id_glue, suffixes=('', "_1")).dropna()
    df_join["@@diffindex0"] = df_join[constants.DEFAULT_INDEX_KEY+"_1"] - df_join[constants.DEFAULT_INDEX_KEY]
    df_join = df_join[df_join["@@diffindex0"] > 0]
    for i in range(2, len(df_a)):
        df_join = df_join.merge(df_a[i], on=case_id_glue, suffixes=('', f"_{i}")).dropna()
        df_join[f"@@diffindex{i-1}"] = df_join[constants.DEFAULT_INDEX_KEY+f"_{i}"] - df_join[constants.DEFAULT_INDEX_KEY+f"_{i-1}"]
        df_join = df_join[df_join[f"@@diffindex{i-1}"] > 0]
    if enable_timestamp:
        # BUGFIX: the original loop iterated range(len(df_a)) and read
        # timestamp_key + "_0", a column that never exists (the first
        # dataframe's columns are unsuffixed after the merges), raising a
        # KeyError whenever enable_timestamp was True. There are only
        # len(df_a) - 1 consecutive pairs, and the left column of the first
        # pair is the unsuffixed timestamp column.
        for i in range(len(df_a) - 1):
            left_col = timestamp_key if i == 0 else timestamp_key + f"_{i}"
            df_join[f"@@difftimestamp{i}"] = (df_join[timestamp_key + f"_{i + 1}"] - df_join[left_col]).astype('timedelta64[s]')
            if timestamp_diff_boundaries:
                # keep only pairs whose elapsed time falls inside [lower, upper]
                df_join = df_join[df_join[f"@@difftimestamp{i}"] >= timestamp_diff_boundaries[i][0]]
                df_join = df_join[df_join[f"@@difftimestamp{i}"] <= timestamp_diff_boundaries[i][1]]
    # select (or exclude) the cases that contain at least one surviving chain
    i1 = df.set_index(case_id_glue).index
    i2 = df_join.set_index(case_id_glue).index
    if positive:
        return df0[i1.isin(i2)]
    else:
        return df0[~i1.isin(i2)]
def get_concurrent_events_dataframe( df, start_timestamp_key=None, timestamp_key="time:timestamp", case_id_glue="case:concept:name", activity_key="concept:name", sort_caseid_required=True, sort_timestamp_along_case_id=True, reduce_dataframe=True, max_start_column="@@max_start_column", min_complete_column="@@min_complete_column", diff_maxs_minc="@@diff_maxs_minc", strict=False): """ Gets the concurrent events (of the same case) in a Pandas dataframe Parameters -------------- df Dataframe start_timestamp_key Start timestamp key (if not provided, defaulted to the timestamp_key) timestamp_key Complete timestamp case_id_glue Column of the dataframe to use as case ID activity_key Activity key sort_caseid_required Tells if a sort by case ID is required (default: True) sort_timestamp_along_case_id Tells if a sort by timestamp is required along the case ID (default: True) reduce_dataframe To fasten operation, keep only essential columns in the dataframe strict Gets only entries that are strictly concurrent (i.e. 
the length of the intersection as real interval is > 0) Returns --------------- conc_ev_dataframe Concurrent events dataframe (with @@diff_maxs_minc as the size of the intersection of the intervals) """ # if not differently specified, set the start timestamp key to the timestamp key # to avoid retro-compatibility problems if start_timestamp_key is None: start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY df[start_timestamp_key] = df[timestamp_key] # to get rows belonging to same case ID together, we need to sort on case ID if sort_caseid_required: if sort_timestamp_along_case_id: df = df.sort_values( [case_id_glue, start_timestamp_key, timestamp_key]) else: df = df.sort_values(case_id_glue) # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp) # columns if reduce_dataframe: df = df[[ case_id_glue, activity_key, start_timestamp_key, timestamp_key ]] df = pandas_utils.insert_index(df) df = df.set_index(case_id_glue) df_copy = df.copy() df = df.join(df_copy, rsuffix="_2").dropna() df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY + "_2"]] df[max_start_column] = df[[ start_timestamp_key, start_timestamp_key + '_2' ]].max(axis=1) df[min_complete_column] = df[[timestamp_key, timestamp_key + '_2']].min(axis=1) df[max_start_column] = df[max_start_column].apply(lambda x: x.timestamp()) df[min_complete_column] = df[min_complete_column].apply( lambda x: x.timestamp()) df[diff_maxs_minc] = df[min_complete_column] - df[max_start_column] if strict: df = df[df[diff_maxs_minc] > 0] else: df = df[df[diff_maxs_minc] >= 0] return df
def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="time:timestamp",
                                case_id_glue="case:concept:name", activity_key="concept:name",
                                sort_caseid_required=True, sort_timestamp_along_case_id=True,
                                reduce_dataframe=True, keep_first_following=True):
    """
    Gets the partial order between events (of the same case) in a Pandas dataframe

    Parameters
    --------------
    df
        Dataframe
    start_timestamp_key
        Start timestamp key (if not provided, defaulted to the timestamp_key)
    timestamp_key
        Complete timestamp
    case_id_glue
        Column of the dataframe to use as case ID
    activity_key
        Activity key
    sort_caseid_required
        Tells if a sort by case ID is required (default: True)
    sort_timestamp_along_case_id
        Tells if a sort by timestamp is required along the case ID (default: True)
    reduce_dataframe
        To fasten operation, keep only essential columns in the dataframe
    keep_first_following
        Keep only the first event following the given event

    Returns
    ---------------
    part_ord_dataframe
        Partial order dataframe (with @@flow_time between events)
    """
    # if not differently specified, set the start timestamp key to the timestamp key
    # to avoid retro-compatibility problems
    if start_timestamp_key is None:
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
        # NOTE(review): this adds a column to the CALLER's dataframe (df has
        # not been copied yet) — presumably accepted here; confirm callers
        # tolerate the side effect
        df[start_timestamp_key] = df[timestamp_key]
    # to get rows belonging to same case ID together, we need to sort on case ID
    if sort_caseid_required:
        if sort_timestamp_along_case_id:
            df = df.sort_values(
                [case_id_glue, start_timestamp_key, timestamp_key])
        else:
            df = df.sort_values(case_id_glue)
    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
    # columns
    if reduce_dataframe:
        df = df[[
            case_id_glue, activity_key, start_timestamp_key, timestamp_key
        ]]
    # self-join on the case ID: every pair of events of the same case
    df = pandas_utils.insert_index(df)
    df = df.set_index(case_id_glue)
    df_copy = df.copy()
    df = df.join(df_copy, rsuffix="_2").dropna()
    # keep ordered pairs only (first event strictly before the second by index)
    df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY +
                                                 "_2"]]
    # partial order: the first event must complete before the second one starts
    df = df[df[timestamp_key] <= df[start_timestamp_key + "_2"]]
    df = df.reset_index()
    # flow time in seconds between the completion of the first event and the
    # start of the second
    df[constants.DEFAULT_FLOW_TIME] = (
        df[start_timestamp_key + "_2"] -
        df[timestamp_key]).astype('timedelta64[s]')
    # optionally retain, per source event, only the first following event
    if keep_first_following:
        df = df.groupby(constants.DEFAULT_INDEX_KEY).first().reset_index()
    return df
def insert_artificial_start_end(
        df0: pd.DataFrame,
        parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Inserts the artificial start/end activities in a Pandas dataframe

    Parameters
    ------------------
    df0
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY: the case identifier
        - Parameters.TIMESTAMP_KEY: the timestamp
        - Parameters.ACTIVITY_KEY: the activity

    Returns
    -----------------
    enriched_df
        Dataframe with artificial start/end activities
    """
    if parameters is None:
        parameters = {}
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                             constants.CASE_CONCEPT_NAME)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    # when True, the artificial events get pd.Timestamp.min / pd.Timestamp.max
    # instead of +/- 1 ms around the real first/last events
    use_extremes_timestamp = exec_utils.get_param_value(
        Parameters.USE_EXTREMES_TIMESTAMP, parameters, False)
    artificial_start_activity = exec_utils.get_param_value(
        Parameters.PARAM_ARTIFICIAL_START_ACTIVITY, parameters,
        constants.DEFAULT_ARTIFICIAL_START_ACTIVITY)
    artificial_end_activity = exec_utils.get_param_value(
        Parameters.PARAM_ARTIFICIAL_END_ACTIVITY, parameters,
        constants.DEFAULT_ARTIFICIAL_END_ACTIVITY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters,
                                           constants.DEFAULT_INDEX_KEY)
    # work on a copy; the caller's dataframe is left untouched
    df = df0.copy()
    df = pandas_utils.insert_index(df, index_key)
    df = df.sort_values([case_id_key, timestamp_key, index_key])
    # one artificial start/end row per case, carrying the case's first/last timestamp
    start_df = df[[case_id_key, timestamp_key]].groupby(case_id_key).first().reset_index()
    end_df = df[[case_id_key, timestamp_key]].groupby(case_id_key).last().reset_index()
    # stability trick: remove 1ms from the artificial start activity timestamp, add 1ms to the artificial end activity timestamp
    if use_extremes_timestamp:
        start_df[timestamp_key] = pd.Timestamp.min
        end_df[timestamp_key] = pd.Timestamp.max
        # the extreme timestamps are naive; localize them so they compare
        # with the (tz-aware) event timestamps
        start_df[timestamp_key] = start_df[timestamp_key].dt.tz_localize("utc")
        end_df[timestamp_key] = end_df[timestamp_key].dt.tz_localize("utc")
    else:
        start_df[timestamp_key] = start_df[timestamp_key] - pd.Timedelta(
            "1 ms")
        end_df[timestamp_key] = end_df[timestamp_key] + pd.Timedelta("1 ms")
    start_df[activity_key] = artificial_start_activity
    end_df[activity_key] = artificial_end_activity
    # concatenate, re-index and re-sort so each artificial event lands at the
    # border of its case
    df = pd.concat([start_df, df, end_df])
    df = pandas_utils.insert_index(df, index_key)
    df = df.sort_values([case_id_key, timestamp_key, index_key])
    # preserve the pm4py metadata attached by format_dataframe
    df.attrs = df0.attrs
    return df
def __insert_start_from_previous_event(
        df: pd.DataFrame,
        parameters: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """
    Inserts the start timestamp of an event set to the completion of the
    previous event in the case

    Parameters
    ---------------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.TIMESTAMP_KEY => the (completion) timestamp column
        - Parameters.RESOURCE_KEY => the resource column
        - Parameters.CASE_ID_KEY => the case identifier column
        - Parameters.ACTIVITY_KEY => the activity column
        - Parameters.START_TIMESTAMP_KEY => the start timestamp column to insert

    Returns
    ---------------
    df
        Dataframe with the start timestamp for each event
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                             constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_START_TIMESTAMP_KEY)
    from pm4py.util import pandas_utils
    df = df[[timestamp_key, resource_key, case_id_key, activity_key]]
    df = pandas_utils.insert_index(df)
    df = df.sort_values(
        [case_id_key, timestamp_key, constants.DEFAULT_INDEX_KEY])
    # pair each event with the previous row; pairs crossing a case boundary
    # are removed by the equality filter below
    shifted_df = df[[case_id_key, timestamp_key]].shift(1)
    shifted_df.columns = [x + "_2" for x in shifted_df.columns]
    concat_df = pd.concat([df, shifted_df], axis=1)
    concat_df = concat_df[concat_df[case_id_key] == concat_df[
        case_id_key + "_2"]][[constants.DEFAULT_INDEX_KEY, timestamp_key + "_2"]]
    del shifted_df
    # BUGFIX: to_dict("r") used the deprecated orient abbreviation, which was
    # removed in modern pandas; "records" is the full (always-valid) name
    concat_df = concat_df.to_dict("records")
    # map: event index -> completion timestamp of the previous event in the case
    concat_df = {
        x[constants.DEFAULT_INDEX_KEY]: x[timestamp_key + "_2"]
        for x in concat_df
    }
    df[start_timestamp_key] = df[constants.DEFAULT_INDEX_KEY].map(concat_df)
    # the first event of each case has no predecessor: start = completion
    df[start_timestamp_key] = df[start_timestamp_key].fillna(df[timestamp_key])
    df = df.sort_values(
        [start_timestamp_key, timestamp_key, constants.DEFAULT_INDEX_KEY])
    return df
def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Performs a link analysis between the entries of the current dataframe.
    The link analysis permits advanced filtering based on events connected in
    an output-input relation (e.g., the OUT column of the first is equal to the
    IN column of the second).

    When OUT_COLUMN = IN_COLUMN = CASE ID, it can be equivalent to the
    directly-follows graph (when Parameters.KEEP_FIRST_OCCURRENCE = True), and
    to the eventually-follows graph (when Parameters.KEEP_FIRST_OCCURRENCE = False).

    Parameters
    -----------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.OUT_COLUMN => the output column of the dataframe
        - Parameters.IN_COLUMN => the input column of the dataframe
        - Parameters.SORTING_COLUMN => the column on top of which the sorting is performed
        - Parameters.INDEX_COLUMN => the attribute to use for the indexing
        - Parameters.LOOK_FORWARD => filters the relations in which the second event has
          an index >= than the index of the first event.
        - Parameters.KEEP_FIRST_OCCURRENCE => keep, for every source event, only the
          first-occurring relationship with a target event (OUT=IN).
        - Parameters.PROPAGATE => propagate the relationships between events, in such a
          way that the entire document flow chain can be reconstructed.

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe
    """
    if parameters is None:
        parameters = {}
    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_column = exec_utils.get_param_value(Parameters.INDEX_COLUMN, parameters, constants.DEFAULT_INDEX_KEY)
    look_forward = exec_utils.get_param_value(Parameters.LOOK_FORWARD, parameters, True)
    keep_first_occurrence = exec_utils.get_param_value(Parameters.KEEP_FIRST_OCCURRENCE, parameters, False)
    propagate = exec_utils.get_param_value(Parameters.PROPAGATE, parameters, False)
    # sort and index so "forward" relations can be expressed as index comparisons
    dataframe = dataframe.sort_values(sorting_column)
    dataframe = pandas_utils.insert_index(dataframe, index_column)
    # candidate relations: rows whose OUT value equals another row's IN value
    df_red1 = dataframe[[out_column, index_column]]
    df_red2 = dataframe[[in_column, index_column]]
    df_red = df_red1.merge(df_red2, left_on=out_column, right_on=in_column, suffixes=("_out", "_in"))
    if look_forward:
        # keep only relations pointing to later events
        df_red = df_red[df_red[index_column + "_out"] < df_red[index_column + "_in"]]
    if keep_first_occurrence:
        # one target per source: the first occurring one
        df_red = df_red.groupby(index_column + "_out").first().reset_index()
    # build the source-index -> {target indexes} association map
    stream_red = df_red.to_dict("records")
    associations = {}
    for el in stream_red:
        if not el[index_column + "_out"] in associations:
            associations[el[index_column + "_out"]] = set()
        associations[el[index_column + "_out"]].add(el[index_column + "_in"])
    if propagate:
        # transitively close the associations (whole document flow chain)
        associations = propagate_associations(associations)
    # flatten the association map into a two-column relation dataframe
    out_clmn = []
    in_clmn = []
    for k in associations:
        for v in associations[k]:
            out_clmn.append(k)
            in_clmn.append(v)
    rel = pd.DataFrame({index_column + "_out": out_clmn, index_column + "_in": in_clmn})
    # join the full event payload on both sides of each relation;
    # columns of the source event get "_out", of the target event "_in"
    df_link = dataframe.copy()
    df_link.columns = [x + "_out" for x in df_link.columns]
    df_link = df_link.merge(rel, left_on=index_column + "_out", right_on=index_column + "_out")
    # renaming in place is safe: `dataframe` is a local copy produced by sort_values
    dataframe.columns = [x + "_in" for x in dataframe.columns]
    df_link = df_link.merge(dataframe, left_on=index_column + "_in", right_on=index_column + "_in")
    return df_link
def format_dataframe(
        df: pd.DataFrame,
        case_id: str = constants.CASE_CONCEPT_NAME,
        activity_key: str = xes_constants.DEFAULT_NAME_KEY,
        timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
        start_timestamp_key: str = xes_constants.DEFAULT_START_TIMESTAMP_KEY,
        timest_format: Optional[str] = None) -> pd.DataFrame:
    """
    Give the appropriate format on the dataframe, for process mining purposes

    Parameters
    --------------
    df
        Dataframe
    case_id
        Case identifier column
    activity_key
        Activity column
    timestamp_key
        Timestamp column
    start_timestamp_key
        Start timestamp column
    timest_format
        Timestamp format that is provided to Pandas

    Returns
    --------------
    df
        Dataframe
    """
    if type(df) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")
    from pm4py.objects.log.util import dataframe_utils
    # the three mandatory columns must be present
    if case_id not in df.columns:
        raise Exception(case_id + " column (case ID) is not in the dataframe!")
    if activity_key not in df.columns:
        raise Exception(activity_key +
                        " column (activity) is not in the dataframe!")
    if timestamp_key not in df.columns:
        raise Exception(timestamp_key +
                        " column (timestamp) is not in the dataframe!")
    # duplicate the user columns under the standard pm4py names,
    # dropping any pre-existing column with the standard name
    if case_id != constants.CASE_CONCEPT_NAME:
        if constants.CASE_CONCEPT_NAME in df.columns:
            del df[constants.CASE_CONCEPT_NAME]
        df[constants.CASE_CONCEPT_NAME] = df[case_id]
    if activity_key != xes_constants.DEFAULT_NAME_KEY:
        if xes_constants.DEFAULT_NAME_KEY in df.columns:
            del df[xes_constants.DEFAULT_NAME_KEY]
        df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
    if timestamp_key != xes_constants.DEFAULT_TIMESTAMP_KEY:
        if xes_constants.DEFAULT_TIMESTAMP_KEY in df.columns:
            del df[xes_constants.DEFAULT_TIMESTAMP_KEY]
        df[xes_constants.DEFAULT_TIMESTAMP_KEY] = df[timestamp_key]
    # makes sure that the timestamps column are of timestamp type
    df = dataframe_utils.convert_timestamp_columns_in_df(
        df, timest_format=timest_format)
    # drop NaN(s) in the main columns (case ID, activity, timestamp) to ensure
    # functioning of the algorithms
    df = df.dropna(subset={
        constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_NAME_KEY,
        xes_constants.DEFAULT_TIMESTAMP_KEY
    },
                   how="any")
    # make sure the case ID column is of string type
    df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(
        "string")
    # make sure the activity column is of string type
    df[xes_constants.DEFAULT_NAME_KEY] = df[
        xes_constants.DEFAULT_NAME_KEY].astype("string")
    # set an index column
    df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False)
    # sorts the dataframe
    df = df.sort_values([
        constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_TIMESTAMP_KEY,
        INDEX_COLUMN
    ])
    # re-set the index column (so it reflects the sorted order)
    df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False)
    # sets the properties
    if not hasattr(df, 'attrs'):
        # legacy (Python 3.6) support
        df.attrs = {}
    if start_timestamp_key in df.columns:
        df[xes_constants.DEFAULT_START_TIMESTAMP_KEY] = df[start_timestamp_key]
        df.attrs[
            constants.
            PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = xes_constants.DEFAULT_START_TIMESTAMP_KEY
    df.attrs[constants.
             PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    df.attrs[
        constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    df.attrs[constants.
             PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY
    df.attrs[
        constants.
        PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    df.attrs[
        constants.
        PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    df.attrs[
        constants.PARAMETER_CONSTANT_CASEID_KEY] = constants.CASE_CONCEPT_NAME
    return df
def apply(
        dataframe: pd.DataFrame, list_activities: List[str], sample_size: int,
        parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> Dict[str, Any]:
    """
    Finds the disconnected performance spectrum provided a dataframe
    and a list of activities

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters,
                                              constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    sort_log_required = exec_utils.get_param_value(
        Parameters.SORT_LOG_REQUIRED, parameters, True)
    # restrict to the relevant columns and to the interesting activities
    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe,
                                          constants.DEFAULT_EVENT_INDEX_KEY)
    if sort_log_required:
        dataframe = dataframe.sort_values(
            [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    # timestamps as (float) seconds since the epoch
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(
        np.int64) / 10**9
    # patterns of decreasing length: full pattern first, then shorter suffixes
    all_patterns = [(len(list_activities) - i,
                     gen_patterns(list_activities,
                                  len(list_activities) - i))
                    for i in range(len(list_activities) - 1)]

    def key(k, n):
        # column name of replica n (columns are suffixed with their shift)
        return k + str(n)

    def to_points(match, l):
        return {
            'case_id':
            match[key(case_id_glue, 0)],
            'points': [(match[key(activity_key, i)], match[key(timestamp_key,
                                                               i)])
                       for i in range(l)]
        }

    points = []
    for l, patterns in all_patterns:
        # concat shifted and suffixed dataframes to get a dataframe that allows to check for the patterns
        dfs = [dataframe.add_suffix(str(i)).shift(-i) for i in range(l)]
        df_merged = pd.concat(dfs, axis=1)
        indices = [shift_index(dfs[i].index, i) for i in range(len(dfs))]
        mindex = pd.MultiIndex.from_arrays(indices)
        df_merged = df_merged.set_index(mindex)
        # keep only windows whose events all belong to the same case
        for i in range(l - 1):
            df_merged = df_merged[df_merged[key(case_id_glue, i)] ==
                                  df_merged[key(case_id_glue, i + 1)]]
        column_list = [key(activity_key, i) for i in range(l)]
        matches = df_merged[np.isin(df_merged[column_list].sum(axis=1),
                                    patterns)]
        points.extend([to_points(m, l) for m in matches.to_dict('records')])
        # drop rows of this match to not discover subsets of this match again
        dataframe = dataframe.drop(
            [int(i) for indices in matches.index for i in indices[:-1]])
    # sort points by the timestamp of their earliest event
    points = sorted(points,
                    key=lambda p: min(p['points'], key=lambda pt: pt[1])[1])
    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)
    return points
def apply(dataframe, list_activities, sample_size, parameters=None):
    """
    Finds the performance spectrum provided a dataframe
    and a list of activities

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    # parameters is now optional (None defaults to {}), consistent with the
    # other apply() variants; the body already guarded against None
    if parameters is None:
        parameters = {}
    import pandas as pd
    import numpy as np
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    # restrict to the relevant columns and to the interesting activities
    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe,
                                          constants.DEFAULT_EVENT_INDEX_KEY)
    dataframe = dataframe.sort_values(
        [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    # timestamps as (float) seconds since the epoch
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(
        np.int64) / 10**9
    # build one shifted replica per activity: replica i is shifted up by i
    # rows and its columns suffixed with "_i", so that each row of the
    # concatenation describes a window of consecutive events
    list_replicas = []
    filt_col_names = []
    for i in range(len(list_activities)):
        if i > 0:
            dataframe = dataframe.shift(-1)
        ren = {x: x + "_" + str(i) for x in dataframe.columns}
        list_replicas.append(dataframe.rename(columns=ren))
        filt_col_names.append(timestamp_key + "_" + str(i))
    dataframe = pd.concat(list_replicas, axis=1)
    # keep only windows whose events all belong to the same case
    for i in range(len(list_activities) - 1):
        dataframe = dataframe[dataframe[case_id_glue + "_" + str(i)] ==
                              dataframe[case_id_glue + "_" + str(i + 1)]]
    # concatenate the activities of each window with "@@" as separator.
    # SECURITY/IDIOM FIX: the original built a Python expression string and
    # ran it through eval(); plain element-wise Series concatenation yields
    # the same result without eval.
    merged_activity = dataframe[activity_key + "_0"]
    for i in range(1, len(list_activities)):
        merged_activity = merged_activity + "@@" + dataframe[activity_key +
                                                             "_" + str(i)]
    dataframe["@@merged_activity"] = merged_activity
    desidered_act = "@@".join(list_activities)
    dataframe = dataframe[dataframe["@@merged_activity"] == desidered_act]
    dataframe = dataframe[filt_col_names]
    if len(dataframe) > sample_size:
        dataframe = dataframe.sample(n=sample_size)
    points = pandas_utils.to_dict_records(dataframe)
    points = [[p[tk] for tk in filt_col_names] for p in points]
    points = sorted(points, key=lambda x: x[0])
    return points
def apply(dataframe: pd.DataFrame, parameters=None):
    """
    Returns the variants from a Pandas dataframe (through Numpy)

    Parameters
    ------------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY => the case identifier
        - Parameters.ACTIVITY_KEY => the activity
        - Parameters.TIMESTAMP_KEY => the timestamp
        - Parameters.INDEX_KEY => the index

    Returns
    ------------------
    variants_dict
        Dictionary associating to each variant the number of occurrences in the dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                             parameters,
                                             constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters,
                                           constants.DEFAULT_INDEX_KEY)
    if not (hasattr(dataframe, "attrs") and dataframe.attrs):
        # dataframe has not been initialized through format_dataframe
        dataframe = pandas_utils.insert_index(dataframe, index_key)
        # BUG FIX: sort_values returns a new dataframe (it does not sort in
        # place); the previous code discarded the result, so the variants
        # were computed on an unsorted dataframe in this branch.
        dataframe = dataframe.sort_values(
            [case_id_key, timestamp_key, index_key])
    cases = dataframe[case_id_key].to_numpy()
    activities = dataframe[activity_key].to_numpy()
    # for each distinct case: index of its first event and number of events;
    # the events of a case are contiguous after the sort above
    c_unq, c_ind, c_counts = np.unique(cases,
                                       return_index=True,
                                       return_counts=True)
    variants = Counter()
    for i in range(len(c_ind)):
        si = c_ind[i]
        ei = si + c_counts[i]
        acts = tuple(activities[si:ei])
        variants[acts] += 1
    if variants_util.VARIANT_SPECIFICATION == variants_util.VariantsSpecifications.STRING:
        # string specification: join the activity tuple into a single string
        variants = {
            constants.DEFAULT_VARIANT_SEP.join(x): y
            for x, y in variants.items()
        }
    else:
        # plain dict keyed by the activity tuples
        variants = dict(variants)
    return variants