def test_case_statistics(self):
    """Smoke-test the pandas-based case statistics API on the test dataframe.

    Each call is only checked for not raising; return values are discarded.
    """
    from pm4py.statistics.traces.generic.pandas import case_statistics
    df = self.get_dataframe()
    case_statistics.get_cases_description(df)
    case_statistics.get_variants_df(df)
    case_statistics.get_variant_statistics(df)
    # case_statistics.get_variant_statistics_with_case_duration(df)
    case_statistics.get_events(df, "N77802")
    case_statistics.get_variants_df_with_case_duration(df)
    case_statistics.get_variants_df_and_list(df)
    case_statistics.get_kde_caseduration(df)
def get_variants_list(log, parameters=None):
    """
    Gets the list of variants (along with their count) from the particular log type

    Parameters
    ------------
    log
        Log
    parameters
        Parameters of the algorithm

    Returns
    -------------
    variants_list
        List of variants of the log (along with their count)
    """
    from pm4py.statistics.traces.generic.pandas import case_statistics as pd_case_statistics
    from pm4py.statistics.traces.generic.log import case_statistics as log_case_statistics

    # Dispatch on the log type; both backends return the same dict structure,
    # so the extraction loop below is shared (the original duplicated it).
    if type(log) is pd.DataFrame:
        variants = pd_case_statistics.get_variant_statistics(log, parameters=parameters)
    else:
        variants = log_case_statistics.get_variant_statistics(log, parameters=parameters)

    variants_list = []
    for var in variants:
        # each entry is a dict holding the "variant" key plus exactly one
        # backend-dependent count key; grab whichever key is not "variant"
        varkeys = list(var.keys())
        del varkeys[varkeys.index("variant")]
        variants_list.append((var["variant"], var[varkeys[0]]))
    return variants_list
def test_filtering_variants(self):
    """Filter the running-example dataframe on its most frequent variant (smoke test)."""
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    log_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    df = pd.read_csv(log_path)
    df = dataframe_utils.convert_timestamp_columns_in_df(df)
    stats = case_statistics.get_variant_statistics(df)
    selected = [stats[0]["variant"]]
    df = variants_filter.apply(df, selected)
    del df
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing N case per variant (only
    control-flow perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.generic.pandas import case_statistics
    if parameters is None:
        parameters = {}

    return_variants = parameters.get(RETURN_VARIANTS, False)
    case_glue = parameters.get(pm4_constants.PARAMETER_CONSTANT_CASEID_KEY,
                               pm4_constants.CASE_CONCEPT_NAME)
    activity_key = parameters.get(pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)

    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)

    log = EventLog()
    all_variants_log = {}
    for stat in variant_stats:
        variant_str = stat['variant']
        count = stat[case_glue]
        # one trace object per variant, shared across its N appended cases
        trace = Trace()
        for act in variant_str.split(pm4_constants.DEFAULT_VARIANT_SEP):
            ev = Event()
            ev[activity_key] = act
            trace.append(ev)
        positions = []
        for _ in range(count):
            log.append(trace)
            positions.append(len(log) - 1)
        all_variants_log[variant_str] = positions

    if return_variants:
        return log, all_variants_log
    return log
def apply(log, parameters=None):
    """
    Calculates the Working Together metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list. Moreover,
        last boolean indicates that the metric is not directed.
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes.DEFAULT_RESOURCE_KEY)

    # compute "variants" over the resource attribute instead of the activity
    variant_params = {case_statistics.Parameters.ACTIVITY_KEY: resource_key,
                      case_statistics.Parameters.ATTRIBUTE_KEY: resource_key}
    stats = case_statistics.get_variant_statistics(log, parameters=variant_params)
    variants_occ = {entry["variant"]: entry["case:concept:name"] for entry in stats}
    variants_resources = list(variants_occ.keys())
    resources = [variants_util.get_activities_from_variant(v) for v in variants_resources]

    flat_list = sorted({res for seq in resources for res in seq})
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    total_cases = float(len(log))
    for idx, seq in enumerate(resources):
        # weight = fraction of cases following this resource sequence
        weight = float(variants_occ[variants_resources[idx]]) / total_cases
        distinct = sorted(set(seq))
        for i in range(len(distinct) - 1):
            a = flat_list.index(distinct[i])
            for j in range(i + 1, len(distinct)):
                b = flat_list.index(distinct[j])
                # undirected: accumulate symmetrically
                metric_matrix[a, b] += weight
                metric_matrix[b, a] += weight

    return [metric_matrix, flat_list, False]
def apply_auto_filter(df, parameters=None):
    """
    Apply an automatic filter on variants

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            variants_df -> If provided, avoid recalculation of the variants dataframe
            Parameters.DECREASING_FACTOR -> Decreasing factor that should be passed to the algorithm

    Returns
    -----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    # bug fix: work on a shallow copy so the caller's parameters dict is not
    # mutated by the "variants_df" cache entry added below
    parameters = dict(parameters)

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                              CASE_CONCEPT_NAME)
    decreasing_factor = exec_utils.get_param_value(Parameters.DECREASING_FACTOR, parameters,
                                                   filtering_constants.DECREASING_FACTOR)

    # cache the variants dataframe so downstream calls do not recompute it
    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variants = case_statistics.get_variant_statistics(df, parameters=parameters)

    # admit variants while each one's count is at least decreasing_factor
    # times the previously admitted variant's count; stop at the first drop
    admitted_variants = []
    if len(variants) > 0:
        current_variant_count = variants[0][case_id_glue]
        for i in range(len(variants)):
            if variants[i][case_id_glue] >= decreasing_factor * current_variant_count:
                admitted_variants.append(variants[i]["variant"])
            else:
                break
            current_variant_count = variants[i][case_id_glue]

    return apply(df, admitted_variants, parameters=parameters)
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing 1 case per variant (only
    control-flow perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.generic.pandas import case_statistics
    if parameters is None:
        parameters = {}

    activity_key = parameters.get(pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)
    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)

    log = EventLog()
    for entry in variant_stats:
        # rebuild one trace from the variant's activity sequence
        trace = Trace()
        for act in entry['variant'].split(pm4_constants.DEFAULT_VARIANT_SEP):
            ev = Event()
            ev[activity_key] = act
            trace.append(ev)
        log.append(trace)
    return log
def apply(log, parameters=None):
    """
    Calculates the Subcontracting metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.N -> n of the algorithm proposed in the Wil SNA paper

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes.DEFAULT_RESOURCE_KEY)
    n = exec_utils.get_param_value(Parameters.N, parameters, 2)

    # compute "variants" over the resource attribute instead of the activity
    variant_params = {case_statistics.Parameters.ACTIVITY_KEY: resource_key,
                      case_statistics.Parameters.ATTRIBUTE_KEY: resource_key}
    variants_occ = {entry["variant"]: entry["case:concept:name"]
                    for entry in case_statistics.get_variant_statistics(log, parameters=variant_params)}
    variants_resources = list(variants_occ.keys())
    resources = [variants_util.get_activities_from_variant(v) for v in variants_resources]

    flat_list = sorted({item for seq in resources for item in seq})
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    # sum_i_to_j[a][b]: weighted count of b occurring strictly between two
    # occurrences of a that are exactly n positions apart
    sum_i_to_j = {}
    for idx, seq in enumerate(resources):
        occ = variants_occ[variants_resources[idx]]
        for i in range(len(seq) - n):
            a = flat_list.index(seq[i])
            if a != flat_list.index(seq[i + n]):
                continue
            inner = sum_i_to_j.setdefault(a, {})
            for j in range(i + 1, i + n):
                b = flat_list.index(seq[j])
                inner[b] = inner.get(b, 0) + occ

    # normalization: total weighted number of adjacent resource pairs
    dividend = 0
    for idx, seq in enumerate(resources):
        dividend = dividend + variants_occ[variants_resources[idx]] * (len(seq) - 1)

    for a in sum_i_to_j:
        for b in sum_i_to_j[a]:
            metric_matrix[a][b] = sum_i_to_j[a][b] / dividend

    return [metric_matrix, flat_list, True]
def apply(log, parameters=None):
    """
    Calculates the HW metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.BETA -> beta value as described in the Wil SNA paper

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list. Moreover,
        last boolean indicates that the metric is directed.
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes.DEFAULT_RESOURCE_KEY)
    beta = exec_utils.get_param_value(Parameters.BETA, parameters, 0)

    # compute "variants" over the resource attribute instead of the activity
    parameters_variants = {case_statistics.Parameters.ACTIVITY_KEY: resource_key,
                           case_statistics.Parameters.ATTRIBUTE_KEY: resource_key}
    variants_occ = {x["variant"]: x["case:concept:name"]
                    for x in case_statistics.get_variant_statistics(log, parameters=parameters_variants)}
    variants_resources = list(variants_occ.keys())
    resources = [variants_util.get_activities_from_variant(y) for y in variants_resources]

    flat_list = sorted(list(set([item for sublist in resources for item in sublist])))
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    for idx, rv in enumerate(resources):
        rvj = variants_resources[idx]
        for i in range(len(rv) - 1):
            res_i = flat_list.index(rv[i])
            if res_i not in sum_i_to_j:
                sum_i_to_j[res_i] = {}
            for j in range(i + 1, len(rv)):
                res_j = flat_list.index(rv[j])
                if res_j not in sum_i_to_j[res_i]:
                    sum_i_to_j[res_i][res_j] = 0
                if beta == 0:
                    # beta == 0: only the direct successor counts
                    sum_i_to_j[res_i][res_j] += variants_occ[rvj]
                    break
                else:
                    # beta > 0: every successor counts, damped by distance
                    sum_i_to_j[res_i][res_j] += variants_occ[rvj] * (beta ** (j - i - 1))

    # normalization factor: total weighted number of handover positions
    # (the original branched on beta here, but both branches were identical)
    dividend = 0
    for idx, rv in enumerate(resources):
        rvj = variants_resources[idx]
        dividend = dividend + variants_occ[rvj] * (len(rv) - 1)

    for key1 in sum_i_to_j:
        for key2 in sum_i_to_j[key1]:
            metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend

    # bug fix: the value must sit on the same statement as ``return`` —
    # a bare ``return`` yields None and leaves the list expression unreachable
    return [metric_matrix, flat_list, True]