def filter_start_activities(log, activities, retain=True):
    """
    Keep (or drop) the cases whose first event carries one of the given
    activities.

    Parameters
    --------------
    log
        Event log / Pandas dataframe
    activities
        Collection of start activities to match
    retain
        When True the matching traces are kept; when False they are dropped

    Returns
    --------------
    filtered_log
        Log object of the same kind as the input, filtered
    """
    # Both back-ends expose the same apply() signature, so only the import
    # differs between the dataframe and the event-log case.
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.start_activities import start_activities_filter
    else:
        from pm4py.algo.filtering.log.start_activities import start_activities_filter
    return start_activities_filter.apply(
        log, activities,
        parameters={start_activities_filter.Parameters.POSITIVE: retain})
def filter_start_activities(log: Union[EventLog, pd.DataFrame], activities: Union[Set[str], List[str]], retain: bool = True) -> \
        Union[EventLog, pd.DataFrame]:
    """
    Keep (or drop) the cases whose first event carries one of the given
    activities.

    Parameters
    --------------
    log
        Event log / Pandas dataframe
    activities
        Collection of start activities to match
    retain
        When True the matching traces are kept; when False they are dropped

    Returns
    --------------
    filtered_log
        Log object of the same kind as the input, filtered
    """
    # Only traditional (flat) logs are supported here.
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")
    parameters = get_properties(log)
    # Select the back-end; both expose an identical apply() interface, so the
    # parameter setup and the call itself are shared below.
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.start_activities import start_activities_filter
    else:
        from pm4py.algo.filtering.log.start_activities import start_activities_filter
    parameters[start_activities_filter.Parameters.POSITIVE] = retain
    return start_activities_filter.apply(log, activities, parameters=parameters)
def filter_start_activities(log, admitted_start_activities):
    """
    Keep only the cases whose first event carries one of the admitted start
    activities.

    Parameters
    --------------
    log
        Event log / Pandas dataframe
    admitted_start_activities
        Collection of admitted start activities

    Returns
    --------------
    filtered_log
        Filtered log_skeleton object
    """
    # Choose the dataframe or event-log back-end; the final call is shared
    # since both modules expose the same apply() signature.
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.start_activities import start_activities_filter
    else:
        from pm4py.algo.filtering.log.start_activities import start_activities_filter
    return start_activities_filter.apply(log, admitted_start_activities)
def apply(dataframe, filter, parameters=None):
    """
    Apply a start-activities filter to a dataframe.

    Parameters
    ------------
    dataframe
        Pandas dataframe
    filter
        Filter description; only its second element (``filter[1]``, the list
        of admitted start activities) is forwarded to the underlying filter
    parameters
        Parameters of the algorithm (optional)

    Returns
    ------------
    dataframe
        Filtered Pandas dataframe
    """
    parameters = {} if parameters is None else parameters
    return start_activities_filter.apply(dataframe, filter[1], parameters=parameters)
def execute_script():
    """
    End-to-end dataframe demo: imports a CSV event log, prints case
    statistics, then renders frequency/performance process schemas for the
    unfiltered log and for several filtered variants (case performance,
    attribute values, start activities, end activities), timing each phase.

    Relies on module-level configuration (inputLog, CASEID_GLUE, TIMEST_KEY,
    ACTIVITY_KEY, the ENABLE_* / DELETE_VARIABLES flags, GENERATED_IMAGES)
    and on the sibling helper calculate_process_schema_from_df.

    NOTE(review): the original indentation was lost in extraction; the block
    structure below was reconstructed from context — confirm against the
    upstream script.
    """
    aa = time.time()
    # --- import and normalize the CSV log ---
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_format=TIMEST_FORMAT, timest_columns=TIMEST_COLUMNS)
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    # keep only the MAX_NO_ACTIVITIES most frequent activities for rendering
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))

    # --- case statistics: top 1000 cases by descending duration ---
    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(
        dataframe, parameters=parameters_cde)
    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))

    # --- schema of the unfiltered log ---
    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg",
                                     "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print(
        "saving initial Inductive Miner process schema along with frequency metrics=",
        (cc - bb2))

    # --- filter on case performance (duration between 1e5 and 1e7) ---
    dataframe_cp = case_filter.filter_on_case_performance(
        dataframe, case_id_glue=CASEID_GLUE, timestamp_key=TIMEST_KEY,
        min_case_performance=100000, max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe_cp, activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    dataframe_cp = None
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa, "FILTER_CP_FREQUENCY.svg",
                                     "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=",
          (dd - cc))

    # --- optional filter on attribute values ---
    if ENABLE_ATTRIBUTE_FILTER:
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe,
                                                ATTRIBUTE_VALUES_TO_FILTER,
                                                parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print(
            "all the activities in the log",
            attributes_filter.get_attribute_values(dataframe_att, ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_att, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa,
                                         "FILTER_ATT_FREQUENCY.svg",
                                         "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
        ee = time.time()
        print("filtering on attribute values and generating process schema=",
              (ee - dd))
    # re-assign the timer unconditionally so (ff - ee) below is defined even
    # when the attribute filter is disabled
    ee = time.time()

    # --- start/end activities of the full log ---
    parameters_sa = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    parameters_ea = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    start_act = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(
        dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))

    # --- optional filter on start activities ---
    if ENABLE_STARTACT_FILTER:
        dataframe_sa = start_activities_filter.apply(dataframe,
                                                     STARTACT_TO_FILTER,
                                                     parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(
            dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_sa, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa,
                                         "FILTER_SA_FREQUENCY.svg",
                                         "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))

    # --- optional filter on end activities ---
    if ENABLE_ENDACT_FILTER:
        dataframe_ea = end_activities_filter.apply(dataframe, ENDACT_TO_FILTER,
                                                   parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(
            dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_ea, activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa,
                                         "FILTER_EA_FREQUENCY.svg",
                                         "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))

    # --- remove the rendered SVGs when requested ---
    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
# NOTE(review): fragment of a larger trace-building routine — its start lies
# outside this chunk and the original indentation was lost, so the code is
# left byte-identical rather than reformatted. From what is visible it:
# completes per-object event dicts (obj_parent/obj_type), walks G.neighbors
# to enqueue children, sorts each trace by Shared.timestamp_column, appends
# traces named after `o` to `log`, converts the log to a dataframe, keeps
# only cases starting with target_type, remaps single-letter activity codes
# to readable names (plus Shared.tcodes), drops rows with unmapped
# activities and columns containing "named:". Verify against the full file.
"obj_parent": parents[el] if el in parents else "", "obj_type": nodes[el] } fill_event(e) trace.append(Event(e)) for s in G.neighbors(el): curr_nodes.append(s) parents[s] = el i = i + 1 trace = insert_missing_events(trace) trace = sorted(trace, key=lambda x: x[Shared.timestamp_column]) trace1 = Trace(trace) trace1.attributes["concept:name"] = o log.append(trace1) df = log_conv_factory.apply(log, variant=log_conv_factory.TO_DATAFRAME) df = start_activities_filter.apply(df, [target_type]) unique_values = set(df[Shared.activity_column].unique()) activities = {x: x for x in unique_values} activities["C"] = "Create Order" activities["J"] = "Create Delivery" activities["Q"] = "WMS Transfer Order" activities["R"] = "Goods Movement" activities["M"] = "Create Invoice" activities["L"] = "Create Debit Memo Request" activities["P"] = "Create Debit Memo" activities["U"] = "Create Pro Forma Invoice" activities["H"] = "Create Returns Document" activities.update(Shared.tcodes) df[Shared.activity_column] = df[Shared.activity_column].map(activities) df = df.dropna(subset=[Shared.activity_column]) df = df[[x for x in df.columns if "named:" not in x]]
# NOTE(review): fragment of a larger SAP-log preparation script — its start
# lies outside this chunk and the original indentation was lost, so the code
# is left byte-identical rather than reformatted. From what is visible it:
# builds a TCODE -> text mapping from the English rows of tstct, reads
# bkpf_old.tsv, assembles a timestamp from CPUDT + CPUTM (format
# "%d.%m.%Y %H:%M:%S"), derives case id ("C_" + BELNR), activity (mapped
# TCODE) and resource (USNAM), drops incomplete rows, sorts by timestamp,
# filters to cases starting with "Create Billing Document" and ending with
# "Post Document", then exports to parquet and CSV. Verify against the full
# file; `dir` here shadows the builtin of the same name.
tstct = tstct[tstct["SPRSL"] == "E"] tstct = tstct[["TCODE", "TTEXT"]] stream = tstct.to_dict("r") for row in stream: activities[row["TCODE"]] = row["TTEXT"] bkpf = pd.read_csv(os.path.join(dir, "bkpf_old.tsv"), sep="\t", dtype={ "BELNR": str, "AWKEY": str, "XBLNR": str, "BUKRS": str }) bkpf["time:timestamp"] = bkpf["CPUDT"] + " " + bkpf["CPUTM"] bkpf["time:timestamp"] = pd.to_datetime(bkpf["time:timestamp"], format="%d.%m.%Y %H:%M:%S") bkpf["case:concept:name"] = "C_" + bkpf["BELNR"] bkpf["concept:name"] = bkpf["TCODE"].map(activities) bkpf["org:resource"] = bkpf["USNAM"] bkpf = bkpf.dropna(subset=["concept:name"]) bkpf = bkpf.dropna(subset=["org:resource"]) bkpf = bkpf.sort_values("time:timestamp") bkpf = bkpf.reset_index() bkpf = bkpf[[x for x in bkpf.columns if not "named:" in x]] #print(start_activities_filter.get_start_activities(bkpf)) bkpf = start_activities_filter.apply(bkpf, ["Create Billing Document"]) #print(end_activities_filter.get_end_activities(bkpf)) bkpf = end_activities_filter.apply(bkpf, ["Post Document"]) parquet_exporter.apply(bkpf, "bkpf.parquet") bkpf.to_csv("bkpf.csv", index=False)