def filter_event_attribute_values( log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]], level: str = "case", retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log object on the values of some event attribute

    Parameters
    --------------
    log
        Log object
    attribute_key
        Attribute to filter
    values
        Admitted (or forbidden) values
    level
        Specifies how the filter should be applied ('case' filters the cases where at least one occurrence happens,
        'event' filter the events eventually trimming the cases)
    retain
        Specified if the values should be kept or removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    props = get_properties(log)
    props[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = attribute_key

    # pick the dataframe-based or log-based filtering backend; both expose the
    # same Parameters enum and apply/apply_events entry points
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter

    props[attributes_filter.Parameters.POSITIVE] = retain
    if level == "event":
        return attributes_filter.apply_events(log, values, parameters=props)
    elif level == "case":
        return attributes_filter.apply(log, values, parameters=props)
def filter_trace_attribute(log, attribute, values, positive=True):
    """
    Filter a log_skeleton on the values of a trace attribute

    Parameters
    --------------
    log
        Event log_skeleton
    attribute
        Attribute to filter
    values
        Values to filter (list of)
    positive
        Boolean value (keep/discard cases)

    Returns
    --------------
    filtered_log
        Filtered event log_skeleton
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        filter_params = {
            attributes_filter.Parameters.ATTRIBUTE_KEY: attribute,
            attributes_filter.Parameters.POSITIVE: positive,
        }
        return attributes_filter.apply(log, values, parameters=filter_params)

    # event-log backend: trace-level attribute filtering has its own entry point
    from pm4py.algo.filtering.log.attributes import attributes_filter
    filter_params = {
        attributes_filter.Parameters.ATTRIBUTE_KEY: attribute,
        attributes_filter.Parameters.POSITIVE: positive,
    }
    return attributes_filter.apply_trace_attribute(log, values, parameters=filter_params)
def apply(dataframe, filter, parameters=None):
    """
    Apply a filter to the current log (attributes filter)

    Parameters
    ------------
    dataframe
        Pandas dataframe
    filter
        Filter to apply; ``filter[1][0]`` is the attribute key and
        ``filter[1][1]`` the list of forbidden values (the filter is
        applied negatively).  NOTE: the parameter name shadows the
        builtin ``filter``; kept for backward compatibility with callers.
    parameters
        Parameters of the algorithm

    Returns
    ------------
    dataframe
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}
    # copy before mutating: the original implementation wrote the attribute
    # key and "positive" flag into the caller's dict as a side effect
    parameters = dict(parameters)
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = filter[1][0]
    parameters["positive"] = False
    return attributes_filter.apply(dataframe, filter[1][1], parameters=parameters)
def social_position( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> float:
    """
    The fraction of resources involved in the same cases with a given resource during a given time slot with respect
    to the total number of resources active during the time slot.

    Metric RBI 5.2 in Pika, Anastasiia, et al. "Mining resource profiles from event logs."
    ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    df
        Dataframe
    t1
        Left interval
    t2
        Right interval
    r
        Resource

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    t1 = get_dt_from_string(t1)
    t2 = get_dt_from_string(t2)

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    case_id_key = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)

    # restrict to the columns of interest inside the [t1, t2) time slot
    window = df[[timestamp_key, resource_key, case_id_key]]
    window = window[(window[timestamp_key] >= t1) & (window[timestamp_key] < t2)]

    from pm4py.algo.filtering.pandas.attributes import attributes_filter
    resource_cases = attributes_filter.apply(
        window, [r],
        parameters={attributes_filter.Parameters.ATTRIBUTE_KEY: resource_key})

    involved = float(resource_cases[case_id_key].nunique())
    total = float(window[case_id_key].nunique())
    # guard against an empty time slot
    return involved / total if total > 0 else 0
def filter_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: List[str], retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on the values of a trace attribute

    Parameters
    --------------
    log
        Event log
    attribute_key
        Attribute to filter
    values
        Values to filter (list of)
    retain
        Boolean value (keep/discard matching traces)

    Returns
    --------------
    filtered_log
        Filtered event log
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        filter_params = {
            attributes_filter.Parameters.ATTRIBUTE_KEY: attribute_key,
            attributes_filter.Parameters.POSITIVE: retain,
        }
        return attributes_filter.apply(log, values, parameters=filter_params)

    # event-log backend: use the dedicated trace-attribute entry point
    from pm4py.algo.filtering.log.attributes import attributes_filter
    filter_params = {
        attributes_filter.Parameters.ATTRIBUTE_KEY: attribute_key,
        attributes_filter.Parameters.POSITIVE: retain,
    }
    return attributes_filter.apply_trace_attribute(log, values, parameters=filter_params)
def filter_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: List[str], level: str = "case", retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log object on the values of some event attribute

    Parameters
    --------------
    log
        Log object
    attribute_key
        Attribute to filter
    values
        Admitted (or forbidden) values
    level
        Specifies how the filter should be applied ('case' filters the cases where at least one occurrence happens,
        'event' filter the events eventually trimming the cases)
    retain
        Specified if the values should be kept or removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    # both backends expose the same Parameters enum and apply/apply_events API
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter

    filter_params = {
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: attribute_key,
        attributes_filter.Parameters.POSITIVE: retain,
    }
    if level == "event":
        return attributes_filter.apply_events(log, values, parameters=filter_params)
    elif level == "case":
        return attributes_filter.apply(log, values, parameters=filter_params)
def filter_attribute_values(log, attribute, values, how="cases", positive=True):
    """
    Filter a log_skeleton object on the values of some attribute

    Parameters
    --------------
    log
        Log object
    attribute
        Attribute to filter
    values
        Admitted (or forbidden) values
    how
        Specifies how the filter should be applied (cases filters the cases where at least one occurrence happens,
        events filter the events eventually trimming the cases); any other value falls back to case-level filtering
    positive
        Specified if the values should be kept or removed

    Returns
    --------------
    filtered_log
        Filtered log_skeleton object
    """
    # both backends expose the same Parameters enum and apply/apply_events API
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter

    parameters = {constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: attribute,
                  attributes_filter.Parameters.POSITIVE: positive}
    if how == "events":
        return attributes_filter.apply_events(log, values, parameters=parameters)
    # default to case-level filtering for any other value of `how`: previously
    # the dataframe path silently returned None for unrecognized values while
    # the event-log path defaulted to the case filter — now both are consistent
    return attributes_filter.apply(log, values, parameters=parameters)
def average_case_duration( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> float:
    """
    The average duration of cases completed during a given time slot in which a given resource was involved.

    Metric RBI 4.4 in Pika, Anastasiia, et al. "Mining resource profiles from event logs."
    ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    df
        Dataframe
    t1
        Left interval
    t2
        Right interval
    r
        Resource

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)

    # keep only the cases in which the given resource was involved
    from pm4py.algo.filtering.pandas.attributes import attributes_filter
    df = attributes_filter.apply(
        df, [r],
        parameters={attributes_filter.Parameters.ATTRIBUTE_KEY: resource_key})

    # keep only the cases intersecting the [t1, t2] time slot
    from pm4py.algo.filtering.pandas.timestamp import timestamp_filter
    df = timestamp_filter.filter_traces_intersecting(df, t1, t2, parameters=parameters)

    from pm4py.statistics.traces.generic.pandas import case_statistics
    descriptions = case_statistics.get_cases_description(df, parameters=parameters)
    durations = [case["caseDuration"] for case in descriptions.values()]
    return mean(durations)
def filter_trace_attribute_values( log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]], retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on the values of a trace attribute

    Parameters
    --------------
    log
        Event log
    attribute_key
        Attribute to filter
    values
        Values to filter (list of)
    retain
        Boolean value (keep/discard matching traces)

    Returns
    --------------
    filtered_log
        Filtered event log
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    props = get_properties(log)
    props[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = attribute_key

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        props[attributes_filter.Parameters.POSITIVE] = retain
        return attributes_filter.apply(log, values, parameters=props)

    # event-log backend: trace-level attribute filtering has its own entry point
    from pm4py.algo.filtering.log.attributes import attributes_filter
    props[attributes_filter.Parameters.POSITIVE] = retain
    return attributes_filter.apply_trace_attribute(log, values, parameters=props)
def filter_related_entries(frame: DataFrame, session_field: str, attribute_field: str, values: List[str], keep: bool = True) -> DataFrame:
    """
    Filters related entries from a DataFrame based on the values of one of the entries fields.

    :param frame: the DataFrame
    :param session_field: the field that marks related entries
    :param attribute_field: the field that should be checked for desired values
    :param values: the desired values
    :param keep: whether the matching entries or non-matching entries should be kept
    :return: a filtered representation of the initial DataFrame
    """
    # delegate to pm4py's attribute filter, mapping our field names onto its
    # case-id / attribute parameter keys
    return attributes_filter.apply(frame, values, parameters={
        attributes_filter.Parameters.CASE_ID_KEY: session_field,
        attributes_filter.Parameters.ATTRIBUTE_KEY: attribute_field,
        attributes_filter.Parameters.POSITIVE: keep,
    })
def execute_script():
    """Benchmark script: import a CSV event log into a dataframe, then apply a
    sequence of filters (case performance, attribute values, start/end
    activities), rendering frequency/performance process schemas for each step
    and printing elapsed times."""
    aa = time.time()
    # import the CSV without timestamp conversion, then normalize case ids and
    # convert the timestamp columns explicitly
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_format=TIMEST_FORMAT, timest_columns=TIMEST_COLUMNS)
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    # keep only the most frequent activities to simplify the discovered schema
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))
    # case statistics: top 1000 cases sorted by descending duration
    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(
        dataframe, parameters=parameters_cde)
    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))
    # baseline schema without any filtering
    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg",
                                     "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    # optionally free intermediate dataframes to keep peak memory down
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print(
        "saving initial Inductive Miner process schema along with frequency metrics=",
        (cc - bb2))
    # filter on case performance (throughput time between the given bounds)
    dataframe_cp = case_filter.filter_on_case_performance(
        dataframe, case_id_glue=CASEID_GLUE, timestamp_key=TIMEST_KEY,
        min_case_performance=100000, max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe_cp, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    dataframe_cp = None
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa, "FILTER_CP_FREQUENCY.svg",
                                     "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=",
          (dd - cc))
    if ENABLE_ATTRIBUTE_FILTER:
        # keep cases containing at least one of the configured attribute values
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe,
                                                ATTRIBUTE_VALUES_TO_FILTER,
                                                parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print(
            "all the activities in the log",
            attributes_filter.get_attribute_values(dataframe_att, ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_att, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa,
                                         "FILTER_ATT_FREQUENCY.svg",
                                         "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
    ee = time.time()
    print("filtering on attribute values and generating process schema=",
          (ee - dd))
    ee = time.time()
    parameters_sa = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    parameters_ea = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    start_act = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(
        dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))
    if ENABLE_STARTACT_FILTER:
        # keep only cases beginning with the configured start activities
        dataframe_sa = start_activities_filter.apply(dataframe,
                                                     STARTACT_TO_FILTER,
                                                     parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(
            dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_sa, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa,
                                         "FILTER_SA_FREQUENCY.svg",
                                         "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))
    if ENABLE_ENDACT_FILTER:
        # keep only cases ending with the configured end activities
        dataframe_ea = end_activities_filter.apply(dataframe, ENDACT_TO_FILTER,
                                                   parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(
            dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_ea, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa,
                                         "FILTER_EA_FREQUENCY.svg",
                                         "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))
    # optionally clean up every SVG generated by the run
    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
def interaction_two_resources( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r1: str, r2: str, parameters: Optional[Dict[str, Any]] = None) -> float:
    """
    The number of cases completed during a given time slot in which two given resources were involved.

    Metric RBI 5.1 in Pika, Anastasiia, et al. "Mining resource profiles from event logs."
    ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    df
        Dataframe
    t1
        Left interval
    t2
        Right interval
    r1
        Resource 1
    r2
        Resource 2

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    t1 = get_dt_from_string(t1)
    t2 = get_dt_from_string(t2)

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    case_id_key = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)

    df = df[[timestamp_key, resource_key, case_id_key, activity_key]]

    # keep cases involving r1, then among those the cases also involving r2
    from pm4py.algo.filtering.pandas.attributes import attributes_filter
    filter_params = {attributes_filter.Parameters.ATTRIBUTE_KEY: resource_key}
    for resource in (r1, r2):
        df = attributes_filter.apply(df, [resource], parameters=filter_params)

    # a case is "completed in the slot" when its last event falls in [t1, t2)
    completions = df.groupby(case_id_key).last().reset_index()
    completions = completions[(completions[timestamp_key] >= t1)
                              & (completions[timestamp_key] < t2)]
    completed_cases = set(completions[case_id_key].unique())

    df = df[df[case_id_key].isin(completed_cases)]
    return df[case_id_key].nunique()