Esempio n. 1
0
 def test_case_statistics(self):
     """
     Smoke test for the pandas case statistics helpers

     Every helper is only invoked on the dataframe returned by
     self.get_dataframe(); the returned values are not inspected.
     """
     from pm4py.statistics.traces.generic.pandas import case_statistics as pandas_case_stats

     dataframe = self.get_dataframe()

     # case / variant level summaries
     pandas_case_stats.get_cases_description(dataframe)
     pandas_case_stats.get_variants_df(dataframe)
     pandas_case_stats.get_variant_statistics(dataframe)
     # pandas_case_stats.get_variant_statistics_with_case_duration(dataframe)

     # events of a single case, looked up by its identifier
     pandas_case_stats.get_events(dataframe, "N77802")

     # duration-aware helpers
     pandas_case_stats.get_variants_df_with_case_duration(dataframe)
     pandas_case_stats.get_variants_df_and_list(dataframe)
     pandas_case_stats.get_kde_caseduration(dataframe)
Esempio n. 2
0
def get_variants_list(log, parameters=None):
    """
    Gets the list of variants (along with their count) from the particular log type

    Parameters
    ------------
    log
        Log (a pandas dataframe, or any other log object accepted by the
        non-pandas case statistics module)
    parameters
        Parameters of the algorithm, forwarded unchanged to get_variant_statistics

    Returns
    -------------
    variants_list
        List of (variant, count) tuples, in the order returned by
        get_variant_statistics
    """
    from pm4py.statistics.traces.generic.pandas import case_statistics as pd_case_statistics
    from pm4py.statistics.traces.generic.log import case_statistics as log_case_statistics

    # isinstance (instead of an exact type comparison) so that subclasses of
    # DataFrame are also routed to the pandas implementation
    if isinstance(log, pd.DataFrame):
        stats_module = pd_case_statistics
    else:
        stats_module = log_case_statistics

    variants_list = []
    for var in stats_module.get_variant_statistics(log, parameters=parameters):
        # each entry holds the "variant" key plus the count stored under
        # another key; the first key different from "variant" is taken as the count
        count_keys = [key for key in var if key != "variant"]
        variants_list.append((var["variant"], var[count_keys[0]]))
    return variants_list
Esempio n. 3
0
 def test_filtering_variants(self):
     """
     Smoke test: filter the running-example CSV on the first variant
     reported by get_variant_statistics. Nothing is asserted on the result.
     """
     # to avoid static method warnings in tests,
     # that by construction of the unittest package have to be expressed in such way
     self.dummy_variable = "dummy_value"

     csv_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
     events_df = dataframe_utils.convert_timestamp_columns_in_df(pd.read_csv(csv_path))

     variant_stats = case_statistics.get_variant_statistics(events_df)
     first_variant = variant_stats[0]["variant"]

     filtered_df = variants_filter.apply(events_df, [first_variant])
     del filtered_df
Esempio n. 4
0
def apply(df, parameters=None):
    """
    Build an event log out of a dataframe, with as many traces per variant as
    the count reported for that variant (only the control-flow perspective is kept)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm; the case ID key, the activity key and
        RETURN_VARIANTS are read here, and the whole dictionary is forwarded
        to get_variant_statistics

    Returns
    -------------
    log
        Event log. When RETURN_VARIANTS is set, a tuple (log, dictionary) is
        returned instead, where the dictionary associates each variant to the
        positions of its traces inside the log
    """
    from pm4py.statistics.traces.generic.pandas import case_statistics

    parameters = {} if parameters is None else parameters

    return_variants = parameters.get(RETURN_VARIANTS, False)
    case_glue = parameters.get(pm4_constants.PARAMETER_CONSTANT_CASEID_KEY,
                               pm4_constants.CASE_CONCEPT_NAME)
    activity_key = parameters.get(pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)

    log = EventLog()
    all_variants_log = {}
    for stat in case_statistics.get_variant_statistics(df, parameters=parameters):
        variant_key = stat['variant']
        activities = variant_key.split(pm4_constants.DEFAULT_VARIANT_SEP)
        occurrences = stat[case_glue]

        # one event per activity, carrying only the activity name
        trace = Trace()
        for activity in activities:
            event = Event()
            event[activity_key] = activity
            trace.append(event)

        # the very same Trace object is appended once per occurrence
        positions = []
        all_variants_log[variant_key] = positions
        for _ in range(occurrences):
            log.append(trace)
            positions.append(len(log) - 1)

    return (log, all_variants_log) if return_variants else log
Esempio n. 5
0
def apply(log, parameters=None):
    """
    Calculates the Working Together metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource
            (defaults to xes.DEFAULT_RESOURCE_KEY)

    Returns
    -----------
    list
        List containing the metric matrix, the sorted resources list and, as
        last element, the boolean False indicating that the metric is not directed.
    """

    if parameters is None:
        parameters = {}

    from itertools import combinations

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY)

    # variants are computed on the resource attribute instead of the activity
    parameters_variants = {case_statistics.Parameters.ACTIVITY_KEY: resource_key,
                           case_statistics.Parameters.ATTRIBUTE_KEY: resource_key}
    variants_occ = {x["variant"]: x["case:concept:name"] for x in
                    case_statistics.get_variant_statistics(log, parameters=parameters_variants)}

    # (resources of the variant, number of occurrences of the variant)
    resources_occ = [(variants_util.get_activities_from_variant(variant), occ)
                     for variant, occ in variants_occ.items()]

    flat_list = sorted({item for resources, _ in resources_occ for item in resources})
    # position of every resource in flat_list, built once so the loops below
    # do not need a linear flat_list.index() scan per lookup
    position = {resource: idx for idx, resource in enumerate(flat_list)}

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    for resources, occ in resources_occ:
        ord_res_list = sorted(set(resources))
        if len(ord_res_list) < 2:
            # a single resource cannot work together with anybody
            continue

        # NOTE(review): the weight is normalised by len(log); if `log` is a
        # pandas dataframe this is the number of rows, not the number of
        # cases - confirm that this is the intended normalisation.
        weight = float(occ) / float(len(log))

        # every unordered pair of distinct resources of the variant,
        # written symmetrically into the matrix
        for first, second in combinations(ord_res_list, 2):
            res_i = position[first]
            res_j = position[second]
            metric_matrix[res_i, res_j] += weight
            metric_matrix[res_j, res_i] += weight

    return [metric_matrix, flat_list, False]
Esempio n. 6
0
def apply_auto_filter(df, parameters=None):
    """
    Apply an automatic filter on variants

    Variants are scanned in the order returned by get_variant_statistics and
    are admitted as long as their count is at least decreasing_factor times
    the count of the previously admitted variant; the scan stops at the first
    variant that does not satisfy the condition.

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            variants_df -> If provided, avoid recalculation of the variants dataframe
            Parameters.DECREASING_FACTOR -> Decreasing factor that should be passed to the algorithm
        The dictionary passed by the caller is not modified.

    Returns
    -----------
    df
        Filtered dataframe
    """
    # Work on a shallow copy: the variants dataframe is stored in the
    # parameters below, and writing it into the caller's dictionary would
    # leave a stale "variants_df" behind if the caller reuses the dictionary
    # with another dataframe.
    parameters = {} if parameters is None else dict(parameters)

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    decreasing_factor = exec_utils.get_param_value(
        Parameters.DECREASING_FACTOR, parameters,
        filtering_constants.DECREASING_FACTOR)

    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variants = case_statistics.get_variant_statistics(df,
                                                      parameters=parameters)

    admitted_variants = []
    if len(variants) > 0:
        # the first variant is compared against its own count
        current_variant_count = variants[0][case_id_glue]

        for variant in variants:
            variant_count = variant[case_id_glue]
            if variant_count >= decreasing_factor * current_variant_count:
                admitted_variants.append(variant["variant"])
            else:
                break
            current_variant_count = variant_count

    return apply(df, admitted_variants, parameters=parameters)
Esempio n. 7
0
def apply(df, parameters=None):
    """
    Build an event log out of a dataframe, with exactly one trace for each
    variant (only the control-flow perspective is kept)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm; the activity key is read here, and the
        whole dictionary is forwarded to get_variant_statistics

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.generic.pandas import case_statistics

    parameters = {} if parameters is None else parameters

    variant_stats = case_statistics.get_variant_statistics(
        df, parameters=parameters)
    activity_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)

    log = EventLog()
    for stat in variant_stats:
        activities = stat['variant'].split(pm4_constants.DEFAULT_VARIANT_SEP)
        # one event per activity, carrying only the activity name
        trace = Trace()
        for activity in activities:
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        log.append(trace)
    return log
Esempio n. 8
0
def apply(log, parameters=None):
    """
    Calculates the Subcontracting metric

    For every position i of a variant where the same resource appears at
    positions i and i + n, each resource found strictly in between is counted
    (weighted by the number of occurrences of the variant) as a subcontractor
    of the resource at position i.

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource
            (defaults to xes.DEFAULT_RESOURCE_KEY)
            Parameters.N -> n of the algorithm proposed in the Wil SNA paper
            (defaults to 2)

    Returns
    -----------
    list
        List containing the metric matrix, the sorted resources list and, as
        last element, the boolean True
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY,
                                              parameters,
                                              xes.DEFAULT_RESOURCE_KEY)
    n = exec_utils.get_param_value(Parameters.N, parameters, 2)

    # variants are computed on the resource attribute instead of the activity
    parameters_variants = {
        case_statistics.Parameters.ACTIVITY_KEY: resource_key,
        case_statistics.Parameters.ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {
        x["variant"]: x["case:concept:name"]
        for x in case_statistics.get_variant_statistics(
            log, parameters=parameters_variants)
    }

    # (resources of the variant, number of occurrences of the variant)
    resources_occ = [
        (variants_util.get_activities_from_variant(variant), occ)
        for variant, occ in variants_occ.items()
    ]

    flat_list = sorted(
        {item for resources, _ in resources_occ for item in resources})
    # position of every resource in flat_list, built once so the loops below
    # do not need a linear flat_list.index() scan per lookup
    position = {resource: idx for idx, resource in enumerate(flat_list)}

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}

    for rv, occ in resources_occ:
        for i in range(len(rv) - n):
            res_i = position[rv[i]]
            if res_i != position[rv[i + n]]:
                continue
            # The in-between resources are accumulated for EVERY bracketing
            # occurrence of res_i. Accumulating only when res_i is met for
            # the first time would silently drop all later occurrences (in
            # the same variant and in the following ones).
            subcontractors = sum_i_to_j.setdefault(res_i, {})
            for j in range(i + 1, i + n):
                res_j = position[rv[j]]
                subcontractors[res_j] = subcontractors.get(res_j, 0) + occ

    # normalisation: number of consecutive pairs over all the cases
    dividend = 0
    for rv, occ in resources_occ:
        dividend = dividend + occ * (len(rv) - 1)

    for key1, subcontractors in sum_i_to_j.items():
        for key2, value in subcontractors.items():
            metric_matrix[key1][key2] = value / dividend

    return [metric_matrix, flat_list, True]
Esempio n. 9
0
def apply(log, parameters=None):
    """
    Calculates the HW metric

    With beta equal to 0 only the direct succession between two resources is
    counted; otherwise every later resource of the variant is counted with a
    weight of beta ** (distance - 1).

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource
            (defaults to xes.DEFAULT_RESOURCE_KEY)
            Parameters.BETA -> beta value as described in the Wil SNA paper
            (defaults to 0)

    Returns
    -----------
    list
        List containing the metric matrix and the sorted resources list.
        Moreover, last boolean (True) indicates that the metric is directed.
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY)
    beta = exec_utils.get_param_value(Parameters.BETA, parameters, 0)

    # variants are computed on the resource attribute instead of the activity
    parameters_variants = {case_statistics.Parameters.ACTIVITY_KEY: resource_key,
                           case_statistics.Parameters.ATTRIBUTE_KEY: resource_key}

    variants_occ = {x["variant"]: x["case:concept:name"] for x in
                    case_statistics.get_variant_statistics(log, parameters=parameters_variants)}

    # (resources of the variant, number of occurrences of the variant)
    resources_occ = [(variants_util.get_activities_from_variant(variant), occ)
                     for variant, occ in variants_occ.items()]

    flat_list = sorted({item for resources, _ in resources_occ for item in resources})
    # position of every resource in flat_list, built once so the loops below
    # do not need a linear flat_list.index() scan per lookup
    position = {resource: idx for idx, resource in enumerate(flat_list)}

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}

    for rv, occ in resources_occ:
        for i in range(len(rv) - 1):
            successors = sum_i_to_j.setdefault(position[rv[i]], {})

            if beta == 0:
                # only the immediate successor is considered
                res_j = position[rv[i + 1]]
                successors[res_j] = successors.get(res_j, 0) + occ
                continue

            # every later resource, damped by its distance from position i
            for j in range(i + 1, len(rv)):
                res_j = position[rv[j]]
                successors[res_j] = successors.get(res_j, 0) + occ * (beta ** (j - i - 1))

    # NOTE(review): the normalisation does not take beta into account (it is
    # always the number of consecutive pairs over all the cases) - confirm
    # against the reference definition of the metric whether a beta-weighted
    # divisor is intended when beta != 0.
    dividend = 0
    for rv, occ in resources_occ:
        dividend = dividend + occ * (len(rv) - 1)

    for key1, successors in sum_i_to_j.items():
        for key2, value in successors.items():
            metric_matrix[key1][key2] = value / dividend

    return [metric_matrix, flat_list, True]