Example #1
    def __init__(self,
                 dataframe: pd.DataFrame,
                 parameters: Optional[Dict[Any, Any]] = None):
        if parameters is None:
            parameters = {}

        case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                                 parameters,
                                                 constants.CASE_CONCEPT_NAME)
        activity_key = exec_utils.get_param_value(
            Parameters.ACTIVITY_KEY, parameters,
            xes_constants.DEFAULT_NAME_KEY)
        timestamp_key = exec_utils.get_param_value(
            Parameters.TIMESTAMP_KEY, parameters,
            xes_constants.DEFAULT_TIMESTAMP_KEY)
        index_key = exec_utils.get_param_value(Parameters.INDEX_KEY,
                                               parameters,
                                               constants.DEFAULT_INDEX_KEY)

        if not (hasattr(dataframe, "attrs") and dataframe.attrs):
            # dataframe has not been initialized through format_dataframe
            dataframe = pandas_utils.insert_index(dataframe, index_key)
            dataframe = dataframe.sort_values([case_id_key, timestamp_key, index_key])

        cases = dataframe[case_id_key].to_numpy()

        self.activities = dataframe[activity_key].to_numpy()
        self.timestamps = dataframe[timestamp_key].to_numpy()
        self.c_unq, self.c_ind, self.c_counts = np.unique(cases,
                                                          return_index=True,
                                                          return_counts=True)
        self.no_traces = len(self.c_ind)
        self.i = 0
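
The constructor relies on a segmentation trick: once the dataframe is sorted by case ID, np.unique with return_index and return_counts yields, for every case, the offset of its first event and its event count. A minimal, self-contained sketch of that trick (toy data, not part of the original class):

import numpy as np

cases = np.array(["c1", "c1", "c1", "c2", "c2"])   # already sorted by case
activities = np.array(["A", "B", "C", "A", "C"])

c_unq, c_ind, c_counts = np.unique(cases, return_index=True, return_counts=True)
for case, start, count in zip(c_unq, c_ind, c_counts):
    # activities[start:start + count] is the trace of this case
    print(case, activities[start:start + count])
# c1 ['A' 'B' 'C']
# c2 ['A' 'C']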
Example #2
def apply(df, parameters=None):
    """
    Discovers a footprint object from a dataframe
    (the footprints of the dataframe are returned)

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    --------------
    footprints_obj
        Footprints object
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    caseid_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                     None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    sort_required = exec_utils.get_param_value(Parameters.SORT_REQUIRED, parameters, DEFAULT_SORT_REQUIRED)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, DEFAULT_INDEX_KEY)

    df = df[[caseid_key, activity_key, timestamp_key]]
    if sort_required:
        df = pandas_utils.insert_index(df, index_key)
        if start_timestamp_key is not None:
            df = df.sort_values([caseid_key, start_timestamp_key, timestamp_key, index_key])
        else:
            df = df.sort_values([caseid_key, timestamp_key, index_key])

    grouped_df = df.groupby(caseid_key)
    dfg = df_statistics.get_dfg_graph(df, measure="frequency", activity_key=activity_key, case_id_glue=caseid_key,
                                      timestamp_key=timestamp_key, sort_caseid_required=False,
                                      sort_timestamp_along_case_id=False, start_timestamp_key=start_timestamp_key)
    activities = set(df[activity_key].unique())
    start_activities = set(grouped_df.first()[activity_key].unique())
    end_activities = set(grouped_df.last()[activity_key].unique())

    parallel = {(x, y) for (x, y) in dfg if (y, x) in dfg}
    sequence = set(causal_discovery.apply(dfg, causal_discovery.Variants.CAUSAL_ALPHA))

    ret = {}
    ret[Outputs.DFG.value] = dfg
    ret[Outputs.SEQUENCE.value] = sequence
    ret[Outputs.PARALLEL.value] = parallel
    ret[Outputs.ACTIVITIES.value] = activities
    ret[Outputs.START_ACTIVITIES.value] = start_activities
    ret[Outputs.END_ACTIVITIES.value] = end_activities
    ret[Outputs.MIN_TRACE_LENGTH.value] = int(grouped_df.size().min())

    return ret
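
A hedged usage sketch: in pm4py this apply backs the footprints discovery for dataframes, so it should be reachable through the footprints algorithm entry point (the toy log below is illustrative):

import pandas as pd
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["A", "B", "A", "C"],
    "time:timestamp": pd.to_datetime(["2021-01-01", "2021-01-02",
                                      "2021-01-01", "2021-01-03"]),
})
fp = footprints_discovery.apply(df)  # the dict built by the function above
print(fp["sequence"], fp["parallel"])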
Example #3
def format_dataframe(df: pd.DataFrame,
                     case_id: str = constants.CASE_CONCEPT_NAME,
                     activity_key: str = xes_constants.DEFAULT_NAME_KEY,
                     timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
                     timest_format: Optional[str] = None) -> pd.DataFrame:
    """
    Gives the dataframe the appropriate format for process mining purposes

    Parameters
    --------------
    df
        Dataframe
    case_id
        Case identifier column
    activity_key
        Activity column
    timestamp_key
        Timestamp column
    timest_format
        Timestamp format that is provided to Pandas

    Returns
    --------------
    df
        Dataframe
    """
    from pm4py.objects.log.util import dataframe_utils
    if case_id not in df.columns:
        raise Exception(case_id + " column (case ID) is not in the dataframe!")
    if activity_key not in df.columns:
        raise Exception(activity_key +
                        " column (activity) is not in the dataframe!")
    if timestamp_key not in df.columns:
        raise Exception(timestamp_key +
                        " column (timestamp) is not in the dataframe!")
    df = df.rename(
        columns={
            case_id: constants.CASE_CONCEPT_NAME,
            activity_key: xes_constants.DEFAULT_NAME_KEY,
            timestamp_key: xes_constants.DEFAULT_TIMESTAMP_KEY
        })
    df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(
        str)
    # makes sure that the timestamp column is of timestamp type
    df = dataframe_utils.convert_timestamp_columns_in_df(
        df,
        timest_format=timest_format,
        timest_columns=[xes_constants.DEFAULT_TIMESTAMP_KEY])
    # set an index column
    df = pandas_utils.insert_index(df, INDEX_COLUMN)
    # sorts the dataframe
    df = df.sort_values([
        constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_TIMESTAMP_KEY,
        INDEX_COLUMN
    ])
    # logging.warning(
    #    "please convert the dataframe for advanced process mining applications. log = pm4py.convert_to_event_log(df)")
    return df
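
A usage sketch (pm4py also re-exports format_dataframe at the package top level; the raw column names below are illustrative):

import pandas as pd
import pm4py

raw = pd.DataFrame({
    "order": ["o1", "o1", "o2"],
    "event": ["create", "pay", "create"],
    "when": ["2021-01-01 10:00", "2021-01-01 11:00", "2021-01-02 09:00"],
})
log_df = pm4py.format_dataframe(raw, case_id="order",
                                activity_key="event", timestamp_key="when")
# columns are renamed to case:concept:name / concept:name / time:timestamp,
# the timestamp column is parsed, and the rows are sorted per case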
Example #4
def A_next_B_next_C(df0, A, B, C, parameters=None):
    """
    Applies the A next B next C rule

    Parameters
    ------------
    df0
        Dataframe
    A
        A Attribute value
    B
        B Attribute value
    C
        C Attribute value
    parameters
        Parameters of the algorithm, including the attribute key and the positive parameter:
        - If True, returns all the cases containing A, B and C in which A was directly followed by B and B was directly followed by C
        - If False, returns all the cases not containing A or B or C, or in which no instance of A was directly
        followed by an instance of B that was in turn directly followed by an instance of C

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    df = df0.copy()
    df = df[[case_id_glue, attribute_key]]
    df = pandas_utils.insert_index(df)
    df_A = df[df[attribute_key] == A].copy()
    df_B = df[df[attribute_key] == B].copy()
    df_C = df[df[attribute_key] == C].copy()
    df_B["@@conceptname"] = df_B[case_id_glue]
    df_B = df_B.groupby(case_id_glue).last().set_index("@@conceptname")
    df_C["@@conceptname"] = df_C[case_id_glue]
    df_C = df_C.groupby(case_id_glue).last().set_index("@@conceptname")

    df_join = df_A.join(df_B, on=case_id_glue, rsuffix="_2").dropna().join(df_C, on=case_id_glue, rsuffix="_3").dropna()
    df_join["@@diffindex"] = df_join[constants.DEFAULT_INDEX_KEY+"_2"] - df_join[constants.DEFAULT_INDEX_KEY]
    df_join["@@diffindex2"] = df_join[constants.DEFAULT_INDEX_KEY+"_3"] - df_join[constants.DEFAULT_INDEX_KEY+"_2"]
    df_join = df_join[df_join["@@diffindex"] == 1]
    df_join = df_join[df_join["@@diffindex2"] == 1]

    i1 = df.set_index(case_id_glue).index
    i2 = df_join.set_index(case_id_glue).index

    if positive:
        return df0[i1.isin(i2)]
    else:
        return df0[~i1.isin(i2)]
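
A toy check of the intended semantics, assuming the function above is importable from pm4py's pandas LTL checker (the module path is an assumption):

import pandas as pd
from pm4py.algo.filtering.pandas.ltl import ltl_checker

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "1", "2", "2", "2"],
    "concept:name":      ["A", "B", "C", "A", "C", "B"],
})
filtered = ltl_checker.A_next_B_next_C(df, "A", "B", "C")
# keeps case "1" (A directly followed by B, B directly followed by C);
# case "2" is dropped because B does not directly follow A there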
Example #5
def directly_follows_dataframe(dataframe: pd.DataFrame,
                               parameters: Optional[Dict[Any, Any]] = None):
    """
    Calculates the directly-follows dataframe (internal usage)
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                             parameters,
                                             constants.CASE_CONCEPT_NAME)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters,
                                           constants.DEFAULT_INDEX_KEY)

    if not (hasattr(dataframe, "attrs") and dataframe.attrs):
        # dataframe has not been initialized through format_dataframe
        dataframe = pandas_utils.insert_index(dataframe, index_key)
        dataframe = dataframe.sort_values([case_id_key, timestamp_key, index_key])

    dataframe = pandas_utils.insert_index(dataframe, index_key)

    insert_parameters = copy(parameters)
    insert_parameters["use_extremes_timestamp"] = True

    dataframe = dataframe_utils.insert_artificial_start_end(
        dataframe, parameters=insert_parameters)

    df_shifted = dataframe.shift(-1)
    df_shifted.columns = [x + "_2" for x in df_shifted.columns]
    dataframe = pd.concat([dataframe, df_shifted], axis=1)
    dataframe = dataframe[dataframe[case_id_key] == dataframe[case_id_key +
                                                              "_2"]]

    return dataframe
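
The core of the function is a self-join via shift(-1): every row is paired with its successor, and only pairs inside the same case survive. Stripped of the pm4py plumbing, the idea looks like this (toy data):

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["A", "B", "A", "C"],
})
shifted = df.shift(-1)
shifted.columns = [c + "_2" for c in shifted.columns]
pairs = pd.concat([df, shifted], axis=1)
# keep only successor pairs that belong to the same case
pairs = pairs[pairs["case:concept:name"] == pairs["case:concept:name_2"]]
print(pairs[["concept:name", "concept:name_2"]])  # rows A->B and A->C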
Example #6
def occu_suc(dfg, filter_percent):
    '''

    :param dfg: a counter containing all the direct succession relationships with their frequencies
    :param filter_percent: the percentage of direct successions to preserve
    :return: dataframe of direct succession relationships with frequencies
    '''

    df = pd.DataFrame.from_dict(dict(dfg), orient='index', columns=['freq'])
    df = df.sort_values(axis=0, by=['freq'], ascending=False)
    df = df.reset_index().rename(columns={'index': 'suc'})
    # delete duplicated successions
    df = df.drop_duplicates('suc', keep='first')
    # filter out direct successions by percentage
    filter = list(range(0, round(filter_percent * len(df))))
    df = pandas_utils.insert_index(df)
    df = df[df[constants.DEFAULT_INDEX_KEY].isin(filter)].reset_index(drop=True)
    return df
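
A usage sketch for the function above. The pair keys are joined into strings here because tuple keys would land in a MultiIndex under recent pandas, which the reset_index/rename above does not expect:

from collections import Counter

dfg = Counter({"A->B": 10, "B->C": 6, "A->C": 2, "C->D": 1})
top_half = occu_suc(dfg, filter_percent=0.5)
# keeps the two most frequent pairs: "A->B" and "B->C"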
Example #7
def A_eventually_B(df0, A, B, parameters=None):
    """
    Applies the A eventually B rule

    Parameters
    ------------
    df0
        Dataframe
    A
        A Attribute value
    B
        B Attribute value
    parameters
        Parameters of the algorithm, including the attribute key and the positive parameter:
        - If True, returns all the cases containing A and B and in which A was eventually followed by B
        - If False, returns all the cases not containing A or B, or in which an instance of A was not eventually
        followed by an instance of B

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, False)
    timestamp_diff_boundaries = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_BOUNDARIES, parameters, [])

    colset = [case_id_glue, attribute_key]
    if enable_timestamp:
        colset.append(timestamp_key)

    df = df0.copy()
    df = df[colset]
    df = pandas_utils.insert_index(df)
    df_A = df[df[attribute_key] == A].copy()
    df_B = df[df[attribute_key] == B].copy()
    df_B["@@conceptname"] = df_B[case_id_glue]
    df_B = df_B.groupby(case_id_glue).last().set_index("@@conceptname")

    df_join = df_A.join(df_B, on=case_id_glue, rsuffix="_2").dropna()
    df_join["@@diffindex"] = df_join[constants.DEFAULT_INDEX_KEY+"_2"] - df_join[constants.DEFAULT_INDEX_KEY]
    df_join = df_join[df_join["@@diffindex"] > 0]

    if enable_timestamp:
        df_join["@@difftimestamp"] = (df_join[timestamp_key + "_2"] - df_join[timestamp_key]).astype('timedelta64[s]')
        if timestamp_diff_boundaries:
            df_join = df_join[df_join["@@difftimestamp"] >= timestamp_diff_boundaries[0][0]]
            df_join = df_join[df_join["@@difftimestamp"] <= timestamp_diff_boundaries[0][1]]

    i1 = df.set_index(case_id_glue).index
    i2 = df_join.set_index(case_id_glue).index

    if positive:
        return df0[i1.isin(i2)]
    else:
        return df0[~i1.isin(i2)]
Example #8
def eventually_follows(df0, attribute_values, parameters=None):
    """
    Applies the eventually follows rule

    Parameters
    ------------
    df0
        Dataframe
    attribute_values
        A list of attribute values; attribute_values[n] follows attribute_values[n-1], which follows ... attribute_values[0]

    parameters
        Parameters of the algorithm, including the attribute key and the positive parameter:
        - If True, returns all the cases containing all attribute_values and in which attribute_values[i] was eventually followed by attribute_values[i + 1]
        - If False, returns all the cases not containing all attribute_values, or in which an instance of attribute_values[i] was not eventually
        followed by an instance of attribute_values[i + 1]

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, False)
    timestamp_diff_boundaries = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_BOUNDARIES, parameters, [])

    colset = [case_id_glue, attribute_key]
    if enable_timestamp:
        colset.append(timestamp_key)

    df = df0.copy()
    df = df[colset]
    df = pandas_utils.insert_index(df)

    df_a = [df[df[attribute_key] == attribute_value].copy() for attribute_value in attribute_values]

    df_join = df_a[0].merge(df_a[1], on=case_id_glue, suffixes=('', "_1")).dropna()
    df_join["@@diffindex0"] = df_join[constants.DEFAULT_INDEX_KEY+"_1"] - df_join[constants.DEFAULT_INDEX_KEY]
    df_join = df_join[df_join["@@diffindex0"] > 0]

    for i in range(2, len(df_a)):
        df_join = df_join.merge(df_a[i], on=case_id_glue, suffixes=('', f"_{i}")).dropna()
        df_join[f"@@diffindex{i-1}"] = df_join[constants.DEFAULT_INDEX_KEY+f"_{i}"] - df_join[constants.DEFAULT_INDEX_KEY+f"_{i-1}"]
        df_join = df_join[df_join[f"@@diffindex{i-1}"] > 0]

    if enable_timestamp:
        for i in range(len(df_a)):
            df_join[f"@@difftimestamp{i}"] = (df_join[timestamp_key + f"_{i + 1}"] - df_join[timestamp_key + f'_{i}']).astype('timedelta64[s]')

            if timestamp_diff_boundaries:
                df_join = df_join[df_join[f"@@difftimestamp{i}"] >= timestamp_diff_boundaries[i][0]]
                df_join = df_join[df_join[f"@@difftimestamp{i}"] <= timestamp_diff_boundaries[i][1]]

    i1 = df.set_index(case_id_glue).index
    i2 = df_join.set_index(case_id_glue).index
    if positive:
        return df0[i1.isin(i2)]
    else:
        return df0[~i1.isin(i2)]
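
A toy check, again assuming the pandas LTL checker module path (an assumption, as above):

import pandas as pd
from pm4py.algo.filtering.pandas.ltl import ltl_checker

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "1", "2", "2"],
    "concept:name":      ["A", "X", "B", "B", "A"],
})
filtered = ltl_checker.eventually_follows(df, ["A", "B"])
# keeps case "1" (A eventually followed by B); case "2" has B before A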
Example #9
def get_concurrent_events_dataframe(
        df,
        start_timestamp_key=None,
        timestamp_key="time:timestamp",
        case_id_glue="case:concept:name",
        activity_key="concept:name",
        sort_caseid_required=True,
        sort_timestamp_along_case_id=True,
        reduce_dataframe=True,
        max_start_column="@@max_start_column",
        min_complete_column="@@min_complete_column",
        diff_maxs_minc="@@diff_maxs_minc",
        strict=False):
    """
    Gets the concurrent events (of the same case) in a Pandas dataframe

    Parameters
    --------------
    df
        Dataframe
    start_timestamp_key
        Start timestamp key (if not provided, defaulted to the timestamp_key)
    timestamp_key
        Complete timestamp
    case_id_glue
        Column of the dataframe to use as case ID
    activity_key
        Activity key
    sort_caseid_required
        Tells if a sort by case ID is required (default: True)
    sort_timestamp_along_case_id
        Tells if a sort by timestamp is required along the case ID (default: True)
    reduce_dataframe
        To speed up the operation, keep only the essential columns in the dataframe
    strict
        Gets only entries that are strictly concurrent (i.e. the length of the intersection as real interval is > 0)

    Returns
    ---------------
    conc_ev_dataframe
        Concurrent events dataframe (with @@diff_maxs_minc as the size of the intersection of the intervals)
    """
    # if not specified otherwise, set the start timestamp key to the timestamp key
    # to avoid backward-compatibility problems
    if start_timestamp_key is None:
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
        df[start_timestamp_key] = df[timestamp_key]

    # to get rows belonging to same case ID together, we need to sort on case ID
    if sort_caseid_required:
        if sort_timestamp_along_case_id:
            df = df.sort_values(
                [case_id_glue, start_timestamp_key, timestamp_key])
        else:
            df = df.sort_values(case_id_glue)

    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
    # columns
    if reduce_dataframe:
        df = df[[
            case_id_glue, activity_key, start_timestamp_key, timestamp_key
        ]]

    df = pandas_utils.insert_index(df)
    df = df.set_index(case_id_glue)
    df_copy = df.copy()

    df = df.join(df_copy, rsuffix="_2").dropna()
    df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY +
                                                 "_2"]]
    df[max_start_column] = df[[
        start_timestamp_key, start_timestamp_key + '_2'
    ]].max(axis=1)
    df[min_complete_column] = df[[timestamp_key,
                                  timestamp_key + '_2']].min(axis=1)
    df[max_start_column] = df[max_start_column].apply(lambda x: x.timestamp())
    df[min_complete_column] = df[min_complete_column].apply(
        lambda x: x.timestamp())
    df[diff_maxs_minc] = df[min_complete_column] - df[max_start_column]
    if strict:
        df = df[df[diff_maxs_minc] > 0]
    else:
        df = df[df[diff_maxs_minc] >= 0]

    return df
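
Concurrency here is interval overlap: two events of the same case are concurrent when max(start, start_2) <= min(complete, complete_2), and @@diff_maxs_minc stores the overlap in seconds. The computation on a single pair, in plain pandas:

import pandas as pd

a_start, a_end = pd.Timestamp("2021-01-01 10:00"), pd.Timestamp("2021-01-01 12:00")
b_start, b_end = pd.Timestamp("2021-01-01 11:00"), pd.Timestamp("2021-01-01 13:00")
overlap = (min(a_end, b_end) - max(a_start, b_start)).total_seconds()
# overlap > 0: strictly concurrent; overlap == 0: the intervals only touch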
Example #10
def get_partial_order_dataframe(df,
                                start_timestamp_key=None,
                                timestamp_key="time:timestamp",
                                case_id_glue="case:concept:name",
                                activity_key="concept:name",
                                sort_caseid_required=True,
                                sort_timestamp_along_case_id=True,
                                reduce_dataframe=True,
                                keep_first_following=True):
    """
    Gets the partial order between events (of the same case) in a Pandas dataframe

    Parameters
    --------------
    df
        Dataframe
    start_timestamp_key
        Start timestamp key (if not provided, defaulted to the timestamp_key)
    timestamp_key
        Complete timestamp
    case_id_glue
        Column of the dataframe to use as case ID
    activity_key
        Activity key
    sort_caseid_required
        Tells if a sort by case ID is required (default: True)
    sort_timestamp_along_case_id
        Tells if a sort by timestamp is required along the case ID (default: True)
    reduce_dataframe
        To speed up the operation, keep only the essential columns in the dataframe
    keep_first_following
        Keep only the first event following the given event
    Returns
    ---------------
    part_ord_dataframe
        Partial order dataframe (with @@flow_time between events)
    """
    # if not specified otherwise, set the start timestamp key to the timestamp key
    # to avoid backward-compatibility problems
    if start_timestamp_key is None:
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
        df[start_timestamp_key] = df[timestamp_key]

    # to get rows belonging to same case ID together, we need to sort on case ID
    if sort_caseid_required:
        if sort_timestamp_along_case_id:
            df = df.sort_values(
                [case_id_glue, start_timestamp_key, timestamp_key])
        else:
            df = df.sort_values(case_id_glue)

    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
    # columns
    if reduce_dataframe:
        df = df[[
            case_id_glue, activity_key, start_timestamp_key, timestamp_key
        ]]

    df = pandas_utils.insert_index(df)
    df = df.set_index(case_id_glue)
    df_copy = df.copy()

    df = df.join(df_copy, rsuffix="_2").dropna()
    df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY +
                                                 "_2"]]
    df = df[df[timestamp_key] <= df[start_timestamp_key + "_2"]]
    df = df.reset_index()

    df[constants.DEFAULT_FLOW_TIME] = (
        df[start_timestamp_key + "_2"] -
        df[timestamp_key]).astype('timedelta64[s]')

    if keep_first_following:
        df = df.groupby(constants.DEFAULT_INDEX_KEY).first().reset_index()

    return df
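
A hedged usage sketch (in pm4py this helper sits next to get_dfg_graph in df_statistics; the import path is an assumption). Note that the function writes the default start timestamp column into its input when start_timestamp_key is None, hence the .copy():

import pandas as pd
from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics

df = pd.DataFrame({
    "case:concept:name": ["1", "1"],
    "concept:name": ["A", "B"],
    "time:timestamp": pd.to_datetime(["2021-01-01 10:00", "2021-01-01 11:00"]),
})
po = df_statistics.get_partial_order_dataframe(df.copy())
# one row pairing A with B, with @@flow_time equal to 3600 seconds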
Example #11
def insert_artificial_start_end(
        df0: pd.DataFrame,
        parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Inserts the artificial start/end activities in a Pandas dataframe

    Parameters
    ------------------
    df0
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY: the case identifier
        - Parameters.TIMESTAMP_KEY: the timestamp
        - Parameters.ACTIVITY_KEY: the activity

    Returns
    -----------------
    enriched_df
        Dataframe with artificial start/end activities
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                             parameters,
                                             constants.CASE_CONCEPT_NAME)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    use_extremes_timestamp = exec_utils.get_param_value(
        Parameters.USE_EXTREMES_TIMESTAMP, parameters, False)

    artificial_start_activity = exec_utils.get_param_value(
        Parameters.PARAM_ARTIFICIAL_START_ACTIVITY, parameters,
        constants.DEFAULT_ARTIFICIAL_START_ACTIVITY)
    artificial_end_activity = exec_utils.get_param_value(
        Parameters.PARAM_ARTIFICIAL_END_ACTIVITY, parameters,
        constants.DEFAULT_ARTIFICIAL_END_ACTIVITY)

    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters,
                                           constants.DEFAULT_INDEX_KEY)

    df = df0.copy()
    df = pandas_utils.insert_index(df, index_key)
    df = df.sort_values([case_id_key, timestamp_key, index_key])

    start_df = df[[case_id_key,
                   timestamp_key]].groupby(case_id_key).first().reset_index()
    end_df = df[[case_id_key,
                 timestamp_key]].groupby(case_id_key).last().reset_index()
    # stability trick: remove 1ms from the artificial start activity timestamp, add 1ms to the artificial end activity timestamp
    if use_extremes_timestamp:
        start_df[timestamp_key] = pd.Timestamp.min
        end_df[timestamp_key] = pd.Timestamp.max
        start_df[timestamp_key] = start_df[timestamp_key].dt.tz_localize("utc")
        end_df[timestamp_key] = end_df[timestamp_key].dt.tz_localize("utc")
    else:
        start_df[timestamp_key] = start_df[timestamp_key] - pd.Timedelta(
            "1 ms")
        end_df[timestamp_key] = end_df[timestamp_key] + pd.Timedelta("1 ms")

    start_df[activity_key] = artificial_start_activity
    end_df[activity_key] = artificial_end_activity

    df = pd.concat([start_df, df, end_df])
    df = pandas_utils.insert_index(df, index_key)
    df = df.sort_values([case_id_key, timestamp_key, index_key])

    df.attrs = df0.attrs

    return df
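
A usage sketch (this function lives in pm4py's dataframe_utils; the toy log is illustrative):

import pandas as pd
from pm4py.objects.log.util import dataframe_utils

df = pd.DataFrame({
    "case:concept:name": ["1", "1"],
    "concept:name": ["A", "B"],
    "time:timestamp": pd.to_datetime(["2021-01-01", "2021-01-02"]),
})
enriched = dataframe_utils.insert_artificial_start_end(df)
# each case gains a leading artificial start event (1 ms before its first event)
# and a trailing artificial end event (1 ms after its last event)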
Example #12
def __insert_start_from_previous_event(
        df: pd.DataFrame,
        parameters: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """
    Inserts the start timestamp of an event set to the completion of the previous event in the case

    Parameters
    ---------------
    df
        Dataframe

    Returns
    ---------------
    df
        Dataframe with the start timestamp for each event
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters,
        xes_constants.DEFAULT_RESOURCE_KEY)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                             parameters,
                                             constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_START_TIMESTAMP_KEY)

    from pm4py.util import pandas_utils

    df = df[[timestamp_key, resource_key, case_id_key, activity_key]]

    df = pandas_utils.insert_index(df)
    df = df.sort_values(
        [case_id_key, timestamp_key, constants.DEFAULT_INDEX_KEY])

    shifted_df = df[[case_id_key, timestamp_key]].shift(1)
    shifted_df.columns = [x + "_2" for x in shifted_df.columns]

    concat_df = pd.concat([df, shifted_df], axis=1)
    concat_df = concat_df[concat_df[case_id_key] == concat_df[
        case_id_key +
        "_2"]][[constants.DEFAULT_INDEX_KEY, timestamp_key + "_2"]]

    del shifted_df
    concat_df = concat_df.to_dict("records")
    concat_df = {
        x[constants.DEFAULT_INDEX_KEY]: x[timestamp_key + "_2"]
        for x in concat_df
    }

    df[start_timestamp_key] = df[constants.DEFAULT_INDEX_KEY].map(concat_df)
    df[start_timestamp_key] = df[start_timestamp_key].fillna(df[timestamp_key])
    df = df.sort_values(
        [start_timestamp_key, timestamp_key, constants.DEFAULT_INDEX_KEY])

    return df
Example #13
def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Performs a link analysis between the entries of the current dataframe.
    The link analysis permits advanced filtering based on events connected in an
    output-input relation (e.g., the OUT column of the first is equal to the IN column
    of the second).

    When OUT_COLUMN = IN_COLUMN = CASE ID, it can be equivalent to the directly-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = True), and to the eventually-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = False).

    Parameters
    -----------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.OUT_COLUMN => the output column of the dataframe
        - Parameters.IN_COLUMN => the input column of the dataframe
        - Parameters.SORTING_COLUMN => the column on top of which the sorting is performed
        - Parameters.INDEX_COLUMN => the attribute to use for the indexing
        - Parameters.LOOK_FORWARD => filters the relations in which the second event has an index >= than the index
        of the first event.
        - Parameters.KEEP_FIRST_OCCURRENCE => keep, for every source event, only the first-occurring relationship
        with a target event (OUT=IN).
        - Parameters.PROPAGATE => propagate the relationships between events, in such a way that the entire document
        flow chain can be reconstructed.

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe
    """
    if parameters is None:
        parameters = {}

    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_column = exec_utils.get_param_value(Parameters.INDEX_COLUMN, parameters, constants.DEFAULT_INDEX_KEY)
    look_forward = exec_utils.get_param_value(Parameters.LOOK_FORWARD, parameters, True)
    keep_first_occurrence = exec_utils.get_param_value(Parameters.KEEP_FIRST_OCCURRENCE, parameters, False)
    propagate = exec_utils.get_param_value(Parameters.PROPAGATE, parameters, False)

    dataframe = dataframe.sort_values(sorting_column)
    dataframe = pandas_utils.insert_index(dataframe, index_column)

    df_red1 = dataframe[[out_column, index_column]]
    df_red2 = dataframe[[in_column, index_column]]
    df_red = df_red1.merge(df_red2, left_on=out_column, right_on=in_column, suffixes=("_out", "_in"))

    if look_forward:
        df_red = df_red[df_red[index_column + "_out"] < df_red[index_column + "_in"]]

    if keep_first_occurrence:
        df_red = df_red.groupby(index_column + "_out").first().reset_index()

    stream_red = df_red.to_dict("records")
    associations = {}
    for el in stream_red:
        if not el[index_column + "_out"] in associations:
            associations[el[index_column + "_out"]] = set()
        associations[el[index_column + "_out"]].add(el[index_column + "_in"])

    if propagate:
        associations = propagate_associations(associations)

    out_clmn = []
    in_clmn = []
    for k in associations:
        for v in associations[k]:
            out_clmn.append(k)
            in_clmn.append(v)

    rel = pd.DataFrame({index_column + "_out": out_clmn, index_column + "_in": in_clmn})

    df_link = dataframe.copy()
    df_link.columns = [x + "_out" for x in df_link.columns]
    df_link = df_link.merge(rel, left_on=index_column + "_out", right_on=index_column + "_out")
    dataframe.columns = [x + "_in" for x in dataframe.columns]
    df_link = df_link.merge(dataframe, left_on=index_column + "_in", right_on=index_column + "_in")

    return df_link
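
Assuming the surrounding module is importable, a minimal call with the defaults (OUT column = IN column = case ID, KEEP_FIRST_OCCURRENCE = False) pairs every event with the later events of the same case, i.e. the eventually-follows relation described in the docstring:

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2"],
    "concept:name": ["A", "B", "C"],
    "time:timestamp": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
})
linked = apply(df)
# one row joining A (*_out columns) with B (*_in columns) for case "1";
# case "2" has a single event, so it produces no pair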
Example #14
def format_dataframe(
        df: pd.DataFrame,
        case_id: str = constants.CASE_CONCEPT_NAME,
        activity_key: str = xes_constants.DEFAULT_NAME_KEY,
        timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
        start_timestamp_key: str = xes_constants.DEFAULT_START_TIMESTAMP_KEY,
        timest_format: Optional[str] = None) -> pd.DataFrame:
    """
    Gives the dataframe the appropriate format for process mining purposes

    Parameters
    --------------
    df
        Dataframe
    case_id
        Case identifier column
    activity_key
        Activity column
    timestamp_key
        Timestamp column
    start_timestamp_key
        Start timestamp column
    timest_format
        Timestamp format that is provided to Pandas

    Returns
    --------------
    df
        Dataframe
    """
    if type(df) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    from pm4py.objects.log.util import dataframe_utils
    if case_id not in df.columns:
        raise Exception(case_id + " column (case ID) is not in the dataframe!")
    if activity_key not in df.columns:
        raise Exception(activity_key +
                        " column (activity) is not in the dataframe!")
    if timestamp_key not in df.columns:
        raise Exception(timestamp_key +
                        " column (timestamp) is not in the dataframe!")
    if case_id != constants.CASE_CONCEPT_NAME:
        if constants.CASE_CONCEPT_NAME in df.columns:
            del df[constants.CASE_CONCEPT_NAME]
        df[constants.CASE_CONCEPT_NAME] = df[case_id]
    if activity_key != xes_constants.DEFAULT_NAME_KEY:
        if xes_constants.DEFAULT_NAME_KEY in df.columns:
            del df[xes_constants.DEFAULT_NAME_KEY]
        df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
    if timestamp_key != xes_constants.DEFAULT_TIMESTAMP_KEY:
        if xes_constants.DEFAULT_TIMESTAMP_KEY in df.columns:
            del df[xes_constants.DEFAULT_TIMESTAMP_KEY]
        df[xes_constants.DEFAULT_TIMESTAMP_KEY] = df[timestamp_key]
    # makes sure that the timestamp columns are of timestamp type
    df = dataframe_utils.convert_timestamp_columns_in_df(
        df, timest_format=timest_format)
    # drop NaN(s) in the main columns (case ID, activity, timestamp) to ensure functioning of the
    # algorithms
    df = df.dropna(subset={
        constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_NAME_KEY,
        xes_constants.DEFAULT_TIMESTAMP_KEY
    },
                   how="any")
    # make sure the case ID column is of string type
    df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(
        "string")
    # make sure the activity column is of string type
    df[xes_constants.DEFAULT_NAME_KEY] = df[
        xes_constants.DEFAULT_NAME_KEY].astype("string")
    # set an index column
    df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False)
    # sorts the dataframe
    df = df.sort_values([
        constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_TIMESTAMP_KEY,
        INDEX_COLUMN
    ])
    # re-set the index column
    df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False)
    # sets the properties
    if not hasattr(df, 'attrs'):
        # legacy (Python 3.6) support
        df.attrs = {}
    if start_timestamp_key in df.columns:
        df[xes_constants.DEFAULT_START_TIMESTAMP_KEY] = df[start_timestamp_key]
        df.attrs[constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = xes_constants.DEFAULT_START_TIMESTAMP_KEY
    df.attrs[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    df.attrs[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    df.attrs[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY
    df.attrs[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    df.attrs[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    df.attrs[constants.PARAMETER_CONSTANT_CASEID_KEY] = constants.CASE_CONCEPT_NAME
    return df
Example #15
def apply(
    dataframe: pd.DataFrame,
    list_activities: List[str],
    sample_size: int,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> Dict[str, Any]:
    """
    Finds the disconnected performance spectrum provided a dataframe
    and a list of activities

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY
            - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters,
                                              constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)

    sort_log_required = exec_utils.get_param_value(
        Parameters.SORT_LOG_REQUIRED, parameters, True)

    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe,
                                          constants.DEFAULT_EVENT_INDEX_KEY)
    if sort_log_required:
        dataframe = dataframe.sort_values(
            [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(
        np.int64) / 10**9

    all_patterns = [(len(list_activities) - i,
                     gen_patterns(list_activities,
                                  len(list_activities) - i))
                    for i in range(len(list_activities) - 1)]

    def key(k, n):
        return k + str(n)

    def to_points(match, l):
        return {
            'case_id':
            match[key(case_id_glue, 0)],
            'points': [(match[key(activity_key,
                                  i)], match[key(timestamp_key, i)])
                       for i in range(l)]
        }

    points = []
    for l, patterns in all_patterns:
        # concat shifted and suffixed dataframes to get a dataframe that allows checking for the patterns
        dfs = [dataframe.add_suffix(str(i)).shift(-i) for i in range(l)]
        df_merged = pd.concat(dfs, axis=1)

        indices = [shift_index(dfs[i].index, i) for i in range(len(dfs))]
        mindex = pd.MultiIndex.from_arrays(indices)
        df_merged = df_merged.set_index(mindex)

        for i in range(l - 1):
            df_merged = df_merged[df_merged[key(case_id_glue, i)] == df_merged[
                key(case_id_glue, i + 1)]]

        column_list = [key(activity_key, i) for i in range(l)]
        matches = df_merged[np.isin(df_merged[column_list].sum(axis=1),
                                    patterns)]
        points.extend([to_points(m, l) for m in matches.to_dict('records')])
        # drop the rows of this match so that subsets of this match are not discovered again
        dataframe = dataframe.drop(
            [int(i) for indices in matches.index for i in indices[:-1]])

    points = sorted(points,
                    key=lambda x: min(x['points'], key=lambda x: x[1])[1])
    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points
Example #16
def apply(dataframe, list_activities, sample_size, parameters):
    """
    Finds the performance spectrum provided a dataframe
    and a list of activities

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY
            - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    import pandas as pd
    import numpy as np

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)

    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe,
                                          constants.DEFAULT_EVENT_INDEX_KEY)
    dataframe = dataframe.sort_values(
        [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(
        np.int64) / 10**9
    list_replicas = []
    activity_names = []
    filt_col_names = []
    for i in range(len(list_activities)):
        if i > 0:
            dataframe = dataframe.shift(-1)
            activity_names.append("+'@@'+")
        ren = {x: x + "_" + str(i) for x in dataframe.columns}
        list_replicas.append(dataframe.rename(columns=ren))
        filt_col_names.append(timestamp_key + "_" + str(i))

        activity_names.append("dataframe[activity_key+'_" + str(i) + "']")

    dataframe = pd.concat(list_replicas, axis=1)
    for i in range(len(list_activities) - 1):
        dataframe = dataframe[dataframe[case_id_glue + "_" +
                                        str(i)] == dataframe[case_id_glue +
                                                             "_" + str(i + 1)]]
    dataframe["@@merged_activity"] = eval("".join(activity_names))
    desired_act = "@@".join(list_activities)
    dataframe = dataframe[dataframe["@@merged_activity"] == desired_act]
    dataframe = dataframe[filt_col_names]

    if len(dataframe) > sample_size:
        dataframe = dataframe.sample(n=sample_size)

    points = pandas_utils.to_dict_records(dataframe)
    points = [[p[tk] for tk in filt_col_names] for p in points]
    points = sorted(points, key=lambda x: x[0])

    return points
Example #17
def apply(dataframe: pd.DataFrame, parameters=None):
    """
    Returns the variants from a Pandas dataframe (through Numpy)

    Parameters
    ------------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY => the case identifier
        - Parameters.ACTIVITY_KEY => the activity
        - Parameters.TIMESTAMP_KEY => the timestamp
        - Parameters.INDEX_KEY => the index

    Returns
    ------------------
    variants_dict
        Dictionary associating to each variant the number of occurrences in the dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                             parameters,
                                             constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters,
                                           constants.DEFAULT_INDEX_KEY)

    if not (hasattr(dataframe, "attrs") and dataframe.attrs):
        # dataframe has not been initialized through format_dataframe
        dataframe = pandas_utils.insert_index(dataframe, index_key)
        dataframe = dataframe.sort_values([case_id_key, timestamp_key, index_key])

    cases = dataframe[case_id_key].to_numpy()
    activities = dataframe[activity_key].to_numpy()

    c_unq, c_ind, c_counts = np.unique(cases,
                                       return_index=True,
                                       return_counts=True)
    variants = Counter()

    for i in range(len(c_ind)):
        si = c_ind[i]
        ei = si + c_counts[i]
        acts = tuple(activities[si:ei])
        variants[acts] += 1

    if variants_util.VARIANT_SPECIFICATION == variants_util.VariantsSpecifications.STRING:
        variants = {
            constants.DEFAULT_VARIANT_SEP.join(x): y
            for x, y in variants.items()
        }
    else:
        variants = {x: y for x, y in variants.items()}

    return variants
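
A usage sketch, assuming the apply above is in scope (the toy log is illustrative; the key type of the result depends on variants_util.VARIANT_SPECIFICATION):

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["A", "B", "A", "B"],
    "time:timestamp": pd.to_datetime(["2021-01-01", "2021-01-02",
                                      "2021-01-01", "2021-01-02"]),
})
variants = apply(df)
# {'A,B': 2} with the STRING specification, {('A', 'B'): 2} otherwise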