def filter_end_activities(log, activities, retain=True):
    """
    Keep or drop the cases of a log depending on their end activity

    Parameters
    ---------------
    log
        Log object
    activities
        List of admitted end activities
    retain
        If True, traces ending with one of the given activities are kept;
        if False, those traces are dropped

    Returns
    ---------------
    filtered_log
        Filtered log object
    """
    # select the pandas- or log-based implementation depending on the input
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.end_activities import end_activities_filter
    else:
        from pm4py.algo.filtering.log.end_activities import end_activities_filter
    return end_activities_filter.apply(
        log, activities,
        parameters={end_activities_filter.Parameters.POSITIVE: retain})
def filter_end_activities(
        log: Union[EventLog, pd.DataFrame],
        activities: Union[Set[str], List[str]],
        retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Keep or drop the cases of a log depending on their end activity

    Parameters
    ---------------
    log
        Log object
    activities
        List of admitted end activities
    retain
        If True, traces ending with one of the given activities are kept;
        if False, those traces are dropped

    Returns
    ---------------
    filtered_log
        Filtered log object
    """
    # only traditional event-log representations are supported
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")
    parameters = get_properties(log)
    # select the pandas- or log-based implementation depending on the input
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.end_activities import end_activities_filter
    else:
        from pm4py.algo.filtering.log.end_activities import end_activities_filter
    parameters[end_activities_filter.Parameters.POSITIVE] = retain
    return end_activities_filter.apply(log, activities, parameters=parameters)
def filter_end_activities(log, admitted_end_activities):
    """
    Filter cases having an end activity in the provided list

    Parameters
    ---------------
    log
        Log object
    admitted_end_activities
        List of admitted end activities

    Returns
    ---------------
    filtered_log
        Filtered log object
    """
    # dispatch to the pandas- or log-based variant of the filter
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.end_activities import end_activities_filter as ea_filter
    else:
        from pm4py.algo.filtering.log.end_activities import end_activities_filter as ea_filter
    return ea_filter.apply(log, admitted_end_activities)
def apply(dataframe, filter, parameters=None):
    """
    Apply an end-activities filter to the given dataframe.

    Parameters
    ------------
    dataframe
        Pandas dataframe
    filter
        Filter to apply; its element at index 1 holds the admitted
        end activities
        (NOTE: the parameter name shadows the builtin ``filter`` but is
        part of the established interface and is kept for compatibility)
    parameters
        Parameters of the algorithm

    Returns
    ------------
    dataframe
        Pandas dataframe
    """
    params = {} if parameters is None else parameters
    return end_activities_filter.apply(dataframe, filter[1], parameters=params)
def execute_script():
    """
    Demo pipeline: import a CSV event log into a pandas dataframe, compute
    case statistics, and render frequency/performance process schemas for the
    unfiltered log and after several filters (case performance, attribute
    values, start activities, end activities).

    Relies on module-level configuration defined elsewhere in this file:
    inputLog, CASEID_GLUE, TIMEST_KEY, TIMEST_FORMAT, TIMEST_COLUMNS,
    ACTIVITY_KEY, MAX_NO_ACTIVITIES, ATTRIBUTE_TO_FILTER,
    ATTRIBUTE_VALUES_TO_FILTER, STARTACT_TO_FILTER, ENDACT_TO_FILTER,
    the ENABLE_*/DELETE_VARIABLES/REMOVE_GENERATED_IMAGES flags and the
    GENERATED_IMAGES list. Phase timings are printed to stdout.

    NOTE(review): this function was reconstructed from whitespace-collapsed
    source; block boundaries of the `if` sections were inferred — confirm
    against the original formatting.
    """
    aa = time.time()
    # --- import phase: load the CSV, normalize case ids and timestamps ---
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_format=TIMEST_FORMAT, timest_columns=TIMEST_COLUMNS)
    # sort by case id then timestamp so events are ordered within each case
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    # keep only the MAX_NO_ACTIVITIES most frequent activities for rendering
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))
    # --- case statistics: longest-duration cases first, capped at 1000 ---
    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(
        dataframe, parameters=parameters_cde)
    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))
    # --- baseline process schemas (no filters applied) ---
    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg",
                                     "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    # free large intermediates when memory-saving mode is on
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print(
        "saving initial Inductive Miner process schema along with frequency metrics=",
        (cc - bb2))
    # --- filter on case performance (duration window in seconds) ---
    dataframe_cp = case_filter.filter_on_case_performance(
        dataframe, case_id_glue=CASEID_GLUE, timestamp_key=TIMEST_KEY,
        min_case_performance=100000, max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe_cp, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    # rebound to None before the conditional del (as in the original)
    dataframe_cp = None
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa, "FILTER_CP_FREQUENCY.svg",
                                     "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=",
          (dd - cc))
    # --- optional filter on a specific attribute's values ---
    if ENABLE_ATTRIBUTE_FILTER:
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe,
                                                ATTRIBUTE_VALUES_TO_FILTER,
                                                parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print(
            "all the activities in the log",
            attributes_filter.get_attribute_values(dataframe_att, ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_att, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa,
                                         "FILTER_ATT_FREQUENCY.svg",
                                         "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
        ee = time.time()
        print("filtering on attribute values and generating process schema=",
              (ee - dd))
    # re-taken unconditionally so `ee` exists even when the filter is disabled
    ee = time.time()
    # --- count start/end activities on the (unfiltered) dataframe ---
    parameters_sa = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    parameters_ea = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    start_act = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(
        dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))
    # --- optional filter on admitted start activities ---
    if ENABLE_STARTACT_FILTER:
        dataframe_sa = start_activities_filter.apply(dataframe,
                                                     STARTACT_TO_FILTER,
                                                     parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(
            dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_sa, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa,
                                         "FILTER_SA_FREQUENCY.svg",
                                         "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    # `gg` taken unconditionally: it is referenced by (hh - gg) below
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))
    # --- optional filter on admitted end activities ---
    if ENABLE_ENDACT_FILTER:
        dataframe_ea = end_activities_filter.apply(dataframe, ENDACT_TO_FILTER,
                                                   parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(
            dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_ea, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa,
                                         "FILTER_EA_FREQUENCY.svg",
                                         "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))
    # --- cleanup: optionally remove every image produced above ---
    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
# --- Build an event log from SAP BKPF document headers and export it ---
# `tstct` (transaction-code text table), `activities` (dict) and `dir`
# are defined earlier in the script, outside this fragment.
tstct = tstct[tstct["SPRSL"] == "E"]  # keep only rows where SPRSL == "E"
tstct = tstct[["TCODE", "TTEXT"]]
# FIX: to_dict("r") was a deprecated alias of "records", removed in
# pandas 1.0+; "records" yields the identical list-of-dicts output.
stream = tstct.to_dict("records")
for row in stream:
    # map each transaction code to its descriptive text
    activities[row["TCODE"]] = row["TTEXT"]
bkpf = pd.read_csv(os.path.join(dir, "bkpf_old.tsv"),
                   sep="\t",
                   dtype={
                       "BELNR": str,
                       "AWKEY": str,
                       "XBLNR": str,
                       "BUKRS": str
                   })
# combine the CPUDT (date) and CPUTM (time) columns into one timestamp
# NOTE(review): format assumes day-first dates ("%d.%m.%Y") — confirm
# against the source extract.
bkpf["time:timestamp"] = bkpf["CPUDT"] + " " + bkpf["CPUTM"]
bkpf["time:timestamp"] = pd.to_datetime(bkpf["time:timestamp"],
                                        format="%d.%m.%Y %H:%M:%S")
bkpf["case:concept:name"] = "C_" + bkpf["BELNR"]
# translate the transaction code into a readable activity name; rows whose
# code has no mapping become NaN and are dropped just below
bkpf["concept:name"] = bkpf["TCODE"].map(activities)
bkpf["org:resource"] = bkpf["USNAM"]
bkpf = bkpf.dropna(subset=["concept:name"])
bkpf = bkpf.dropna(subset=["org:resource"])
bkpf = bkpf.sort_values("time:timestamp")
bkpf = bkpf.reset_index()
# drop helper columns containing "named:" in their name
bkpf = bkpf[[x for x in bkpf.columns if "named:" not in x]]
#print(start_activities_filter.get_start_activities(bkpf))
bkpf = start_activities_filter.apply(bkpf, ["Create Billing Document"])
#print(end_activities_filter.get_end_activities(bkpf))
bkpf = end_activities_filter.apply(bkpf, ["Post Document"])
parquet_exporter.apply(bkpf, "bkpf.parquet")
bkpf.to_csv("bkpf.csv", index=False)