def filter_variants(log, variants, retain=True):
    """
    Filter a log on a specified set of variants

    Parameters
    ---------------
    log
        Event log
    variants
        collection of variants to filter; A variant should be specified as a list of activity names,
        e.g., ['a','b','c']
    retain
        boolean; if True all traces conforming to the specified variants are retained; if False,
        all those traces are removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    # Both backends expose the same apply() signature and Parameters enum, so
    # select the module once instead of duplicating the join/apply call in
    # each branch (the original repeated it verbatim).
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.variants import variants_filter
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
    # Variants are passed to the backend as comma-joined activity strings.
    return variants_filter.apply(
        log, [",".join(v) for v in variants],
        parameters={variants_filter.Parameters.POSITIVE: retain})
def test_filtering_variants(self):
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    # Import the example CSV without timestamp conversion, pick the most
    # frequent variant, and check that filtering on it runs end-to-end.
    log_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    df = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(log_path, sep=',')
    variant_stats = case_statistics.get_variant_statistics(df)
    selected_variants = [variant_stats[0]["variant"]]
    df = variants_filter.apply(df, selected_variants)
    del df
def test_filtering_variants(self):
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    # Load the example CSV with pandas, normalize its timestamp columns,
    # then filter the dataframe on its most frequent variant.
    log_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    df = pd.read_csv(log_path)
    df = dataframe_utils.convert_timestamp_columns_in_df(df)
    variant_stats = case_statistics.get_variant_statistics(df)
    selected_variants = [variant_stats[0]["variant"]]
    df = variants_filter.apply(df, selected_variants)
    del df
def get_case_statistics(self, parameters=None):
    """
    Gets the statistics on cases

    Parameters
    -------------
    parameters
        Possible parameters of the algorithm (optional dict; recognized keys
        include "sort_by_column", "sort_ascending" and "variant")

    Returns
    -------------
    list_cases
        List of cases, followed by the log summary dictionary
    """
    # Work on a shallow copy: the original mutated the caller-supplied dict
    # in place, leaking activity-key / sorting defaults back to the caller.
    parameters = {} if parameters is None else dict(parameters)
    parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = self.activity_key
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = self.activity_key
    if self.reduced_grouped_dataframe is not None:
        parameters[constants.GROUPED_DATAFRAME] = self.reduced_grouped_dataframe
    # parameters["max_ret_cases"] = ws_constants.MAX_NO_CASES_TO_RETURN
    # Default sorting: by case duration, descending.
    parameters["sort_by_column"] = parameters.get("sort_by_column", "caseDuration")
    parameters["sort_ascending"] = parameters.get("sort_ascending", False)
    if "variant" in parameters:
        var_to_filter = parameters["variant"]
        # TODO: TECHNICAL DEBT
        # quick turnaround for bug: restore '+' lifecycle separators that were
        # turned into spaces in transport
        for broken, fixed in ((" start", "+start"), (" START", "+START"),
                              (" complete", "+complete"), (" COMPLETE", "+COMPLETE")):
            var_to_filter = var_to_filter.replace(broken, fixed)
        source_df = variants_filter.apply(
            self.get_reduced_dataframe(), [var_to_filter], parameters=parameters)
    else:
        source_df = self.get_reduced_dataframe()
    # Single return path: the original duplicated this expression in both branches.
    return [
        casestats.include_key_in_value_list(
            case_statistics.get_cases_description(source_df, parameters=parameters))
    ] + [self.get_log_summary_dictio()]
def filter_variants(log, admitted_variants):
    """
    Filter a log_skeleton on a specified set of variants

    Parameters
    ---------------
    log
        Event log_skeleton
    admitted_variants
        List of variants to filter

    Returns
    --------------
    filtered_log
        Filtered log_skeleton object
    """
    # Both backends expose the same apply() signature; pick the module once
    # instead of repeating the identical call in each branch.
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.variants import variants_filter
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
    return variants_filter.apply(log, admitted_variants)
def filter_variants(log: Union[EventLog, pd.DataFrame], variants: Union[Set[str], List[str]], retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on a specified set of variants

    Parameters
    ---------------
    log
        Event log
    variants
        collection of variants to filter; A variant should be specified as a list of activity names,
        e.g., ['a','b','c']
    retain
        boolean; if True all traces conforming to the specified variants are retained; if False,
        all those traces are removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    # Only traditional event-log representations are supported.
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    from pm4py.util import variants_util

    parameters = get_properties(log)
    # When variants are expected as strings, join each activity list with the
    # configured separator.
    if variants_util.VARIANT_SPECIFICATION == variants_util.VariantsSpecifications.STRING:
        variants = [constants.DEFAULT_VARIANT_SEP.join(v) for v in variants]

    # Dispatch to the pandas or event-log backend; both share the same
    # apply() signature and Parameters enum.
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.variants import variants_filter
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
    parameters[variants_filter.Parameters.POSITIVE] = retain
    return variants_filter.apply(log, variants, parameters=parameters)
def apply(dataframe, filter, parameters=None):
    """
    Apply a filter to the current log (variants filter)

    Parameters
    ------------
    dataframe
        Pandas dataframe
    filter
        Filter to apply
    parameters
        Parameters of the algorithm

    Returns
    ------------
    dataframe
        Pandas dataframe
    """
    parameters = {} if parameters is None else parameters
    # Position 1 of the filter payload carries the admitted variants.
    admitted_variants = filter[1]
    return variants_filter.apply(dataframe, admitted_variants, parameters=parameters)
def apply(df, parameters=None):
    """
    Returns a Pandas dataframe from which a sound workflow net could be extracted taking
    into account a discovery algorithm returning models only with visible transitions

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm, including:
            max_no_variants -> Maximum number of variants to consider to return a Petri net

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    # Fill in default column keys (case id, activity, timestamp, attribute)
    # only where the caller did not provide them.
    if PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_CASEID_KEY] = CASE_CONCEPT_NAME
    if PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ACTIVITY_KEY] = DEFAULT_NAME_KEY
    if PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY] = DEFAULT_TIMESTAMP_KEY
    if PARAMETER_CONSTANT_ATTRIBUTE_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            PARAMETER_CONSTANT_ACTIVITY_KEY]
    caseid_glue = parameters[PARAMETER_CONSTANT_CASEID_KEY]
    activity_key = parameters[PARAMETER_CONSTANT_ACTIVITY_KEY]
    timest_key = parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY]
    # Cap on how many candidate variants are examined (default 20).
    max_no_variants = parameters[
        "max_no_variants"] if "max_no_variants" in parameters else 20
    # Precompute the variants dataframe once and cache it in parameters so the
    # repeated variants_filter.apply calls below can reuse it.
    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variant_stats = case_statistics.get_variant_statistics(
        df, parameters=parameters)
    # Build [variant, case-count] pairs and sort so the most frequent variants
    # are tried first (ties broken by variant string, descending).
    all_variants_list = []
    for var in variant_stats:
        all_variants_list.append([var["variant"], var[caseid_glue]])
    all_variants_list = sorted(all_variants_list, key=lambda x: (x[1], x[0]),
                               reverse=True)
    considered_variants = []
    considered_traces = []
    i = 0
    # Greedily grow the set of admitted variants: a candidate is kept only if
    # the Alpha model mined from the enlarged set stays a sound WF-net AND the
    # replay fitness stays (essentially) perfect; otherwise it is backtracked.
    while i < min(len(all_variants_list), max_no_variants):
        variant = all_variants_list[i][0]
        considered_variants.append(variant)
        filtered_df = variants_filter.apply(df, considered_variants,
                                            parameters=parameters)
        dfg_frequency = dfg_util.get_dfg_graph(filtered_df, measure="frequency",
                                               perf_aggregation_key="median",
                                               case_id_glue=caseid_glue,
                                               activity_key=activity_key,
                                               timestamp_key=timest_key)
        net, initial_marking, final_marking = alpha_miner.apply_dfg(
            dfg_frequency, parameters=parameters)
        is_sound = check_soundness.check_petri_wfnet_and_soundness(net)
        if not is_sound:
            # Unsound model: drop the variant just added and move on.
            del considered_variants[-1]
        else:
            # Take one representative trace of this variant (first case id of
            # the group) to extend the replay log.
            traces_of_this_variant = variants_filter.apply(
                df, [variant], parameters=parameters).groupby(caseid_glue)
            traces_of_this_variant_keys = list(
                traces_of_this_variant.groups.keys())
            trace_of_this_variant = traces_of_this_variant.get_group(
                traces_of_this_variant_keys[0])
            this_trace = transform.transform_event_log_to_trace_log(
                pandas_df_imp.convert_dataframe_to_event_log(
                    trace_of_this_variant), case_glue=caseid_glue)[0]
            # If a non-default activity key is used, mirror it into the
            # default name key so replay/alignment sees the right labels.
            if not activity_key == DEFAULT_NAME_KEY:
                for j in range(len(this_trace)):
                    this_trace[j][DEFAULT_NAME_KEY] = this_trace[j][
                        activity_key]
            considered_traces.append(this_trace)
            filtered_log = TraceLog(considered_traces)
            try:
                # Alignments are computed only as a feasibility probe; the
                # result itself is discarded.
                alignments = alignment_factory.apply(filtered_log, net,
                                                     initial_marking,
                                                     final_marking)
                del alignments
                fitness = replay_fitness_factory.apply(filtered_log, net,
                                                       initial_marking,
                                                       final_marking,
                                                       parameters=parameters)
                # Require (near-)perfect fitness; threshold guards against
                # floating-point noise around 1.0.
                if fitness["log_fitness"] < 0.99999:
                    del considered_variants[-1]
                    del considered_traces[-1]
            except TypeError:
                # Alignment/fitness computation failed for this model:
                # backtrack the candidate variant and its trace.
                del considered_variants[-1]
                del considered_traces[-1]
        i = i + 1
    # Return the dataframe restricted to the accepted variants.
    return variants_filter.apply(df, considered_variants, parameters=parameters)