Ejemplo n.º 1
0
def filter_variants(log, variants, retain=True):
    """
    Filter a log on a specified set of variants

    Parameters
    ---------------
    log
        Event log
    variants
        collection of variants to filter; A variant should be specified as a list of activity names, e.g., ['a','b','c']
    retain
        boolean; if True all traces conforming to the specified variants are retained; if False, all those traces are removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.variants import variants_filter
        return variants_filter.apply(
            log, [",".join(v) for v in variants],
            parameters={variants_filter.Parameters.POSITIVE: retain})
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
        return variants_filter.apply(
            log, [",".join(v) for v in variants],
            parameters={variants_filter.Parameters.POSITIVE: retain})
Ejemplo n.º 2
0
 def test_filtering_variants(self):
     # to avoid static method warnings in tests,
     # that by construction of the unittest package have to be expressed in such way
     self.dummy_variable = "dummy_value"
     input_log = os.path.join(INPUT_DATA_DIR, "running-example.csv")
     dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(input_log, sep=',')
     variants = case_statistics.get_variant_statistics(dataframe)
     chosen_variants = [variants[0]["variant"]]
     dataframe = variants_filter.apply(dataframe, chosen_variants)
     del dataframe
Ejemplo n.º 3
0
 def test_filtering_variants(self):
     # to avoid static method warnings in tests,
     # that by construction of the unittest package have to be expressed in such way
     self.dummy_variable = "dummy_value"
     input_log = os.path.join(INPUT_DATA_DIR, "running-example.csv")
     dataframe = pd.read_csv(input_log)
     dataframe = dataframe_utils.convert_timestamp_columns_in_df(dataframe)
     variants = case_statistics.get_variant_statistics(dataframe)
     chosen_variants = [variants[0]["variant"]]
     dataframe = variants_filter.apply(dataframe, chosen_variants)
     del dataframe
Ejemplo n.º 4
0
    def get_case_statistics(self, parameters=None):
        """
        Gets the statistics on cases

        Parameters
        -------------
        parameters
            Possible parameters of the algorithm

        Returns
        -------------
        list_cases
            List of cases
        """
        if parameters is None:
            parameters = {}
        parameters[
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = self.activity_key
        parameters[
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = self.activity_key
        if self.reduced_grouped_dataframe is not None:
            parameters[
                constants.GROUPED_DATAFRAME] = self.reduced_grouped_dataframe
        # parameters["max_ret_cases"] = ws_constants.MAX_NO_CASES_TO_RETURN
        parameters["sort_by_column"] = parameters[
            "sort_by_column"] if "sort_by_column" in parameters else "caseDuration"
        parameters["sort_ascending"] = parameters[
            "sort_ascending"] if "sort_ascending" in parameters else False

        if "variant" in parameters:
            var_to_filter = parameters["variant"]
            # TODO: TECHNICAL DEBT
            # quick turnaround for bug
            var_to_filter = var_to_filter.replace(" start", "+start")
            var_to_filter = var_to_filter.replace(" START", "+START")
            var_to_filter = var_to_filter.replace(" complete", "+complete")
            var_to_filter = var_to_filter.replace(" COMPLETE", "+COMPLETE")

            filtered_dataframe = variants_filter.apply(
                self.get_reduced_dataframe(), [var_to_filter],
                parameters=parameters)
            return [
                casestats.include_key_in_value_list(
                    case_statistics.get_cases_description(
                        filtered_dataframe, parameters=parameters))
            ] + [self.get_log_summary_dictio()]
        else:
            return [
                casestats.include_key_in_value_list(
                    case_statistics.get_cases_description(
                        self.get_reduced_dataframe(), parameters=parameters))
            ] + [self.get_log_summary_dictio()]
Ejemplo n.º 5
0
def filter_variants(log, admitted_variants):
    """
    Filter a log_skeleton on a specified set of variants

    Parameters
    ---------------
    log
        Event log_skeleton
    admitted_variants
        List of variants to filter

    Returns
    --------------
    filtered_log
        Filtered log_skeleton object
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.variants import variants_filter
        return variants_filter.apply(log, admitted_variants)
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
        return variants_filter.apply(log, admitted_variants)
Ejemplo n.º 6
0
def filter_variants(log: Union[EventLog, pd.DataFrame],
                    variants: Union[Set[str], List[str]],
                    retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on a specified set of variants

    Parameters
    ---------------
    log
        Event log
    variants
        collection of variants to filter; A variant should be specified as a list of activity names, e.g., ['a','b','c']
    retain
        boolean; if True all traces conforming to the specified variants are retained; if False, all those traces are removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    from pm4py.util import variants_util
    parameters = get_properties(log)
    if variants_util.VARIANT_SPECIFICATION == variants_util.VariantsSpecifications.STRING:
        variants = [constants.DEFAULT_VARIANT_SEP.join(v) for v in variants]
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.variants import variants_filter
        parameters[variants_filter.Parameters.POSITIVE] = retain
        return variants_filter.apply(log, variants, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
        parameters[variants_filter.Parameters.POSITIVE] = retain
        return variants_filter.apply(log, variants, parameters=parameters)
Ejemplo n.º 7
0
def apply(dataframe, filter, parameters=None):
    """
    Apply a filter to the current log (variants filter)

    Parameters
    ------------
    dataframe
        Pandas dataframe
    filter
        Filter to apply
    parameters
        Parameters of the algorithm

    Returns
    ------------
    dataframe
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}

    return variants_filter.apply(dataframe, filter[1], parameters=parameters)
def apply(df, parameters=None):
    """
    Returns a Pandas dataframe from which a sound workflow net could be extracted taking into account
    a discovery algorithm returning models only with visible transitions

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm, including:
            max_no_variants -> Maximum number of variants to consider to return a Petri net

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    if PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_CASEID_KEY] = CASE_CONCEPT_NAME
    if PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ACTIVITY_KEY] = DEFAULT_NAME_KEY
    if PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY] = DEFAULT_TIMESTAMP_KEY
    if PARAMETER_CONSTANT_ATTRIBUTE_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            PARAMETER_CONSTANT_ACTIVITY_KEY]

    caseid_glue = parameters[PARAMETER_CONSTANT_CASEID_KEY]
    activity_key = parameters[PARAMETER_CONSTANT_ACTIVITY_KEY]
    timest_key = parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY]

    max_no_variants = parameters[
        "max_no_variants"] if "max_no_variants" in parameters else 20

    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df

    variant_stats = case_statistics.get_variant_statistics(
        df, parameters=parameters)

    all_variants_list = []
    for var in variant_stats:
        all_variants_list.append([var["variant"], var[caseid_glue]])

    all_variants_list = sorted(all_variants_list,
                               key=lambda x: (x[1], x[0]),
                               reverse=True)

    considered_variants = []
    considered_traces = []

    i = 0
    while i < min(len(all_variants_list), max_no_variants):
        variant = all_variants_list[i][0]

        considered_variants.append(variant)

        filtered_df = variants_filter.apply(df,
                                            considered_variants,
                                            parameters=parameters)

        dfg_frequency = dfg_util.get_dfg_graph(filtered_df,
                                               measure="frequency",
                                               perf_aggregation_key="median",
                                               case_id_glue=caseid_glue,
                                               activity_key=activity_key,
                                               timestamp_key=timest_key)

        net, initial_marking, final_marking = alpha_miner.apply_dfg(
            dfg_frequency, parameters=parameters)

        is_sound = check_soundness.check_petri_wfnet_and_soundness(net)
        if not is_sound:
            del considered_variants[-1]
        else:
            traces_of_this_variant = variants_filter.apply(
                df, [variant], parameters=parameters).groupby(caseid_glue)
            traces_of_this_variant_keys = list(
                traces_of_this_variant.groups.keys())
            trace_of_this_variant = traces_of_this_variant.get_group(
                traces_of_this_variant_keys[0])

            this_trace = transform.transform_event_log_to_trace_log(
                pandas_df_imp.convert_dataframe_to_event_log(
                    trace_of_this_variant),
                case_glue=caseid_glue)[0]
            if not activity_key == DEFAULT_NAME_KEY:
                for j in range(len(this_trace)):
                    this_trace[j][DEFAULT_NAME_KEY] = this_trace[j][
                        activity_key]
            considered_traces.append(this_trace)
            filtered_log = TraceLog(considered_traces)

            try:
                alignments = alignment_factory.apply(filtered_log, net,
                                                     initial_marking,
                                                     final_marking)
                del alignments
                fitness = replay_fitness_factory.apply(filtered_log,
                                                       net,
                                                       initial_marking,
                                                       final_marking,
                                                       parameters=parameters)
                if fitness["log_fitness"] < 0.99999:
                    del considered_variants[-1]
                    del considered_traces[-1]
            except TypeError:
                del considered_variants[-1]
                del considered_traces[-1]

        i = i + 1

    return variants_filter.apply(df,
                                 considered_variants,
                                 parameters=parameters)