def filter_event_attribute_values( log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]], level: str = "case", retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log object on the values of some event attribute

    Parameters
    --------------
    log
        Log object
    attribute_key
        Attribute to filter
    values
        Admitted (or forbidden) values
    level
        Specifies how the filter should be applied ('case' filters the cases where at least one occurrence happens,
        'event' filter the events eventually trimming the cases)
    retain
        Specified if the values should be kept or removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    props = get_properties(log)
    props[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = attribute_key

    # pick the dataframe-based or log-based filtering backend; both expose the
    # same Parameters enum and apply/apply_events entry points
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter

    props[attributes_filter.Parameters.POSITIVE] = retain
    if level == "event":
        return attributes_filter.apply_events(log, values, parameters=props)
    elif level == "case":
        return attributes_filter.apply(log, values, parameters=props)
def filter_trace_attribute(log, attribute, values, positive=True):
    """
    Filter a log_skeleton on the values of a trace attribute

    Parameters
    --------------
    log
        Event log_skeleton
    attribute
        Attribute to filter
    values
        Values to filter (list of)
    positive
        Boolean value (keep/discard cases)

    Returns
    --------------
    filtered_log
        Filtered event log_skeleton
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        filter_params = {
            attributes_filter.Parameters.ATTRIBUTE_KEY: attribute,
            attributes_filter.Parameters.POSITIVE: positive,
        }
        return attributes_filter.apply(log, values, parameters=filter_params)

    # event-log backend: trace-level attribute filtering has its own entry point
    from pm4py.algo.filtering.log.attributes import attributes_filter
    filter_params = {
        attributes_filter.Parameters.ATTRIBUTE_KEY: attribute,
        attributes_filter.Parameters.POSITIVE: positive,
    }
    return attributes_filter.apply_trace_attribute(log, values, parameters=filter_params)
def apply(dataframe, filter, parameters=None):
    """
    Apply a filter to the current log (attributes filter)

    Parameters
    ------------
    dataframe
        Pandas dataframe
    filter
        Filter to apply; ``filter[1][0]`` is the attribute key and
        ``filter[1][1]`` the list of forbidden values (the filter is
        applied negatively).  NOTE: the parameter name shadows the
        builtin ``filter``; kept for backward compatibility with callers.
    parameters
        Parameters of the algorithm

    Returns
    ------------
    dataframe
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}
    # copy before mutating: the original implementation wrote the attribute
    # key and "positive" flag into the caller's dict as a side effect
    parameters = dict(parameters)
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = filter[1][0]
    parameters["positive"] = False
    return attributes_filter.apply(dataframe, filter[1][1], parameters=parameters)
def social_position( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> float:
    """
    The fraction of resources involved in the same cases with a given resource during a given time slot with respect
    to the total number of resources active during the time slot.

    Metric RBI 5.2 in Pika, Anastasiia, et al. "Mining resource profiles from event logs."
    ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    df
        Dataframe
    t1
        Left interval
    t2
        Right interval
    r
        Resource

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    t1 = get_dt_from_string(t1)
    t2 = get_dt_from_string(t2)

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    case_id_key = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)

    # restrict to the columns of interest inside the [t1, t2) time slot
    window = df[[timestamp_key, resource_key, case_id_key]]
    window = window[(window[timestamp_key] >= t1) & (window[timestamp_key] < t2)]

    from pm4py.algo.filtering.pandas.attributes import attributes_filter
    resource_cases = attributes_filter.apply(
        window, [r],
        parameters={attributes_filter.Parameters.ATTRIBUTE_KEY: resource_key})

    involved = float(resource_cases[case_id_key].nunique())
    total = float(window[case_id_key].nunique())
    # guard against an empty time slot
    return involved / total if total > 0 else 0
def filter_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: List[str], retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on the values of a trace attribute

    Parameters
    --------------
    log
        Event log
    attribute_key
        Attribute to filter
    values
        Values to filter (list of)
    retain
        Boolean value (keep/discard matching traces)

    Returns
    --------------
    filtered_log
        Filtered event log
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        filter_params = {
            attributes_filter.Parameters.ATTRIBUTE_KEY: attribute_key,
            attributes_filter.Parameters.POSITIVE: retain,
        }
        return attributes_filter.apply(log, values, parameters=filter_params)

    # event-log backend: use the dedicated trace-attribute entry point
    from pm4py.algo.filtering.log.attributes import attributes_filter
    filter_params = {
        attributes_filter.Parameters.ATTRIBUTE_KEY: attribute_key,
        attributes_filter.Parameters.POSITIVE: retain,
    }
    return attributes_filter.apply_trace_attribute(log, values, parameters=filter_params)
def filter_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: List[str], level: str = "case", retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log object on the values of some event attribute

    Parameters
    --------------
    log
        Log object
    attribute_key
        Attribute to filter
    values
        Admitted (or forbidden) values
    level
        Specifies how the filter should be applied ('case' filters the cases where at least one occurrence happens,
        'event' filter the events eventually trimming the cases)
    retain
        Specified if the values should be kept or removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    # both backends expose the same Parameters enum and apply/apply_events API
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter

    filter_params = {
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: attribute_key,
        attributes_filter.Parameters.POSITIVE: retain,
    }
    if level == "event":
        return attributes_filter.apply_events(log, values, parameters=filter_params)
    elif level == "case":
        return attributes_filter.apply(log, values, parameters=filter_params)
def filter_attribute_values(log, attribute, values, how="cases", positive=True):
    """
    Filter a log_skeleton object on the values of some attribute

    Parameters
    --------------
    log
        Log object
    attribute
        Attribute to filter
    values
        Admitted (or forbidden) values
    how
        Specifies how the filter should be applied (cases filters the cases where at least one occurrence happens,
        events filter the events eventually trimming the cases); any other value falls back to case-level filtering
    positive
        Specified if the values should be kept or removed

    Returns
    --------------
    filtered_log
        Filtered log_skeleton object
    """
    # both backends expose the same Parameters enum and apply/apply_events API
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter

    parameters = {constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: attribute,
                  attributes_filter.Parameters.POSITIVE: positive}
    if how == "events":
        return attributes_filter.apply_events(log, values, parameters=parameters)
    # default to case-level filtering for any other value of `how`: previously
    # the dataframe path silently returned None for unrecognized values while
    # the event-log path defaulted to the case filter — now both are consistent
    return attributes_filter.apply(log, values, parameters=parameters)
def average_case_duration( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> float:
    """
    The average duration of cases completed during a given time slot in which a given resource was involved.

    Metric RBI 4.4 in Pika, Anastasiia, et al. "Mining resource profiles from event logs."
    ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    df
        Dataframe
    t1
        Left interval
    t2
        Right interval
    r
        Resource

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)

    # keep only the cases in which the given resource was involved
    from pm4py.algo.filtering.pandas.attributes import attributes_filter
    df = attributes_filter.apply(
        df, [r],
        parameters={attributes_filter.Parameters.ATTRIBUTE_KEY: resource_key})

    # keep only the cases intersecting the [t1, t2] time slot
    from pm4py.algo.filtering.pandas.timestamp import timestamp_filter
    df = timestamp_filter.filter_traces_intersecting(df, t1, t2, parameters=parameters)

    from pm4py.statistics.traces.generic.pandas import case_statistics
    descriptions = case_statistics.get_cases_description(df, parameters=parameters)
    durations = [case["caseDuration"] for case in descriptions.values()]
    return mean(durations)
def filter_trace_attribute_values( log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]], retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on the values of a trace attribute

    Parameters
    --------------
    log
        Event log
    attribute_key
        Attribute to filter
    values
        Values to filter (list of)
    retain
        Boolean value (keep/discard matching traces)

    Returns
    --------------
    filtered_log
        Filtered event log
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    props = get_properties(log)
    props[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = attribute_key

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        props[attributes_filter.Parameters.POSITIVE] = retain
        return attributes_filter.apply(log, values, parameters=props)

    # event-log backend: trace-level attribute filtering has its own entry point
    from pm4py.algo.filtering.log.attributes import attributes_filter
    props[attributes_filter.Parameters.POSITIVE] = retain
    return attributes_filter.apply_trace_attribute(log, values, parameters=props)
def filter_related_entries(frame: DataFrame, session_field: str, attribute_field: str, values: List[str], keep: bool = True) -> DataFrame:
    """
    Filters related entries from a DataFrame based on the values of one of the entries fields.

    :param frame: the DataFrame
    :param session_field: the field that marks related entries
    :param attribute_field: the field that should be checked for desired values
    :param values: the desired values
    :param keep: whether the matching entries or non-matching entries should be kept
    :return: a filtered representation of the initial DataFrame
    """
    # delegate to pm4py's attribute filter, mapping our field names onto its
    # case-id / attribute parameter keys
    return attributes_filter.apply(frame, values, parameters={
        attributes_filter.Parameters.CASE_ID_KEY: session_field,
        attributes_filter.Parameters.ATTRIBUTE_KEY: attribute_field,
        attributes_filter.Parameters.POSITIVE: keep,
    })
def execute_script():
    """Benchmark script: import a CSV event log into a dataframe, then apply a
    sequence of filters (case performance, attribute values, start/end
    activities), rendering frequency/performance process schemas for each step
    and printing elapsed times."""
    aa = time.time()
    # import the CSV without timestamp conversion, then normalize case ids and
    # convert the timestamp columns explicitly
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_format=TIMEST_FORMAT, timest_columns=TIMEST_COLUMNS)
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    # keep only the most frequent activities to simplify the discovered schema
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))
    # case statistics: top 1000 cases sorted by descending duration
    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(
        dataframe, parameters=parameters_cde)
    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))
    # baseline schema without any filtering
    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg",
                                     "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    # optionally free intermediate dataframes to keep peak memory down
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print(
        "saving initial Inductive Miner process schema along with frequency metrics=",
        (cc - bb2))
    # filter on case performance (throughput time between the given bounds)
    dataframe_cp = case_filter.filter_on_case_performance(
        dataframe, case_id_glue=CASEID_GLUE, timestamp_key=TIMEST_KEY,
        min_case_performance=100000, max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe_cp, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    dataframe_cp = None
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa, "FILTER_CP_FREQUENCY.svg",
                                     "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=",
          (dd - cc))
    if ENABLE_ATTRIBUTE_FILTER:
        # keep cases containing at least one of the configured attribute values
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe,
                                                ATTRIBUTE_VALUES_TO_FILTER,
                                                parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print(
            "all the activities in the log",
            attributes_filter.get_attribute_values(dataframe_att, ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_att, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa,
                                         "FILTER_ATT_FREQUENCY.svg",
                                         "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
    ee = time.time()
    print("filtering on attribute values and generating process schema=",
          (ee - dd))
    ee = time.time()
    parameters_sa = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    parameters_ea = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    start_act = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(
        dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))
    if ENABLE_STARTACT_FILTER:
        # keep only cases beginning with the configured start activities
        dataframe_sa = start_activities_filter.apply(dataframe,
                                                     STARTACT_TO_FILTER,
                                                     parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(
            dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_sa, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa,
                                         "FILTER_SA_FREQUENCY.svg",
                                         "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))
    if ENABLE_ENDACT_FILTER:
        # keep only cases ending with the configured end activities
        dataframe_ea = end_activities_filter.apply(dataframe, ENDACT_TO_FILTER,
                                                   parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(
            dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_ea, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa,
                                         "FILTER_EA_FREQUENCY.svg",
                                         "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))
    # optionally clean up every SVG generated by the run
    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
def interaction_two_resources( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r1: str, r2: str, parameters: Optional[Dict[str, Any]] = None) -> float:
    """
    The number of cases completed during a given time slot in which two given resources were involved.

    Metric RBI 5.1 in Pika, Anastasiia, et al. "Mining resource profiles from event logs."
    ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    df
        Dataframe
    t1
        Left interval
    t2
        Right interval
    r1
        Resource 1
    r2
        Resource 2

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    t1 = get_dt_from_string(t1)
    t2 = get_dt_from_string(t2)

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    case_id_key = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)

    df = df[[timestamp_key, resource_key, case_id_key, activity_key]]

    # keep cases involving r1, then among those the cases also involving r2
    from pm4py.algo.filtering.pandas.attributes import attributes_filter
    filter_params = {attributes_filter.Parameters.ATTRIBUTE_KEY: resource_key}
    for resource in (r1, r2):
        df = attributes_filter.apply(df, [resource], parameters=filter_params)

    # a case is "completed in the slot" when its last event falls in [t1, t2)
    completions = df.groupby(case_id_key).last().reset_index()
    completions = completions[(completions[timestamp_key] >= t1)
                              & (completions[timestamp_key] < t2)]
    completed_cases = set(completions[case_id_key].unique())

    df = df[df[case_id_key].isin(completed_cases)]
    return df[case_id_key].nunique()