Ejemplo n.º 1
0
def discover_dfg(log: Union[EventLog, pd.DataFrame]) -> Tuple[dict, dict, dict]:
    """
    Discovers a directly-follows graph (DFG) from a log.

    Parameters
    --------------
    log
        Event log (EventLog object or pandas dataframe)

    Returns
    --------------
    dfg
        DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if check_is_dataframe(log):
        # pandas route: use the dataframe-specific retrieval/statistics modules
        check_dataframe_columns(log)
        from pm4py.objects.dfg.retrieval.pandas import get_dfg_graph
        from pm4py.statistics.start_activities.pandas import get as sa_module
        from pm4py.statistics.end_activities.pandas import get as ea_module
        dfg = get_dfg_graph(log)
    else:
        # EventLog route: use the generic discovery algorithm
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        from pm4py.statistics.start_activities.log import get as sa_module
        from pm4py.statistics.end_activities.log import get as ea_module
        dfg = dfg_discovery.apply(log)
    start_activities = sa_module.get_start_activities(log)
    end_activities = ea_module.get_end_activities(log)
    return dfg, start_activities, end_activities
Ejemplo n.º 2
0
def discover_performance_dfg(log: Union[EventLog, pd.DataFrame], business_hours: bool = False, worktiming: Optional[List[int]] = None, weekends: Optional[List[int]] = None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR) -> Tuple[dict, dict, dict]:
    """
    Discovers a performance directly-follows graph from an event log

    Parameters
    ---------------
    log
        Event log
    business_hours
        Enables/disables the computation based on the business hours (default: False)
    worktiming
        (If the business hours are enabled) The hour range in which the resources of the log are working (default: 7 to 17)
    weekends
        (If the business hours are enabled) The weekends days (default: Saturday (6), Sunday (7))

    Returns
    ---------------
    performance_dfg
        Performance DFG
    start_activities
        Start activities
    end_activities
        End activities

    Raises
    ---------------
    Exception
        If the input is not a traditional event log / dataframe
    """
    # Fix: the defaults were mutable lists shared across calls; resolve the
    # None sentinels here instead (backward compatible — same effective values).
    if worktiming is None:
        worktiming = [7, 17]
    if weekends is None:
        weekends = [6, 7]

    if type(log) not in [pd.DataFrame, EventLog, EventStream]: raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        properties = get_properties(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        # fall back to the standard XES keys when not configured in the properties
        activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_glue=case_id_key, measure="performance", perf_aggregation_key="all",
                            business_hours=business_hours, worktiming=worktiming, weekends=weekends, workcalendar=workcalendar)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    else:
        from pm4py.algo.discovery.dfg.variants import performance as dfg_discovery
        properties = get_properties(log)
        # request all aggregations and forward the business-hours configuration
        properties[dfg_discovery.Parameters.AGGREGATION_MEASURE] = "all"
        properties[dfg_discovery.Parameters.BUSINESS_HOURS] = business_hours
        properties[dfg_discovery.Parameters.WORKTIMING] = worktiming
        properties[dfg_discovery.Parameters.WEEKENDS] = weekends
        dfg = dfg_discovery.apply(log, parameters=properties)
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    return dfg, start_activities, end_activities
Ejemplo n.º 3
0
def execute_script():
    """
    Correlation-miner demo on the interval event log: computes the frequency
    and performance DFGs and displays both visualizations.
    """
    df = pd.read_csv("../tests/input_data/interval_event_log.csv")
    df = dataframe_utils.convert_timestamp_columns_in_df(df)
    activity_counts = dict(df["concept:name"].value_counts())
    # configure the interval-log columns and the output format
    params = {
        constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY: "start_timestamp",
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: "time:timestamp",
        "format": "svg",
    }
    start_acts = sa_get.get_start_activities(df, parameters=params)
    end_acts = ea_get.get_end_activities(df, parameters=params)
    params["start_activities"] = start_acts
    params["end_activities"] = end_acts
    soj_time = soj_time_get.apply(df, parameters=params)
    dfg, performance_dfg = correlation_miner.apply(
        df, variant=correlation_miner.Variants.CLASSIC, parameters=params)
    # render the frequency view first, then the performance view
    for graph, variant in ((dfg, dfg_vis.Variants.FREQUENCY),
                           (performance_dfg, dfg_vis.Variants.PERFORMANCE)):
        gviz = dfg_vis.apply(graph,
                             activities_count=activity_counts,
                             soj_time=soj_time,
                             variant=variant,
                             parameters=params)
        dfg_vis.view(gviz)
Ejemplo n.º 4
0
def apply_auto_filter(df, parameters=None):
    """
    Automatically filters the dataframe on its start activities, keeping the
    cases whose start activity is frequent enough.

    Parameters
    -----------
    df
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Case ID column in the dataframe
            Parameters.ACTIVITY_KEY -> Column that represents the activity
            Parameters.DECREASING_FACTOR -> Decreasing factor used to derive the threshold

    Returns
    -----------
    df
        Filtered dataframe
    """
    parameters = {} if parameters is None else parameters

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY)
    grouped_df = exec_utils.get_param_value(Parameters.GROUP_DATAFRAME, parameters, None)
    decreasing_factor = exec_utils.get_param_value(
        Parameters.DECREASING_FACTOR, parameters, filtering_constants.DECREASING_FACTOR)

    # derive the occurrence threshold from the sorted start-activity frequencies
    sa_counts = get_start_activities(df, parameters=parameters)
    sorted_sa = start_activities_common.get_sorted_start_activities_list(sa_counts)
    threshold = start_activities_common.get_start_activities_threshold(sorted_sa, decreasing_factor)

    return filter_df_on_start_activities_nocc(df, threshold, sa_count0=sa_counts,
                                              case_id_glue=case_id_glue,
                                              activity_key=activity_key,
                                              grouped_df=grouped_df)
Ejemplo n.º 5
0
def discover_abstraction_dataframe(df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Tuple[
    Any, Any, Any, Any, Any, Any, Any]:
    """
    Discovers from a dataframe the abstraction used by the Heuristics Miner ++ algorithm.

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.START_TIMESTAMP_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.CASE_ID_KEY

    Returns
    --------------
    start_activities
        Start activities
    end_activities
        End activities
    activities_occurrences
        Activities along with their number of occurrences
    dfg
        Directly-follows graph
    performance_dfg
        (Performance) Directly-follows graph
    sojourn_time
        Sojourn time for each activity
    concurrent_activities
        Concurrent activities
    """
    parameters = {} if parameters is None else parameters
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    if start_timestamp_key is None:
        # default the start-timestamp column and propagate it to downstream calls
        # (copy the dict so the caller's parameters are not modified)
        start_timestamp_key = xes.DEFAULT_START_TIMESTAMP_KEY
        parameters = copy(parameters)
        parameters[Parameters.START_TIMESTAMP_KEY] = start_timestamp_key
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)

    start_activities = pd_sa.get_start_activities(df, parameters=parameters)
    end_activities = pd_ea.get_end_activities(df, parameters=parameters)
    activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)

    # the DFG is computed as an EFG restricted to the first following event
    efg_parameters = copy(parameters)
    efg_parameters[pd_efg.Parameters.KEEP_FIRST_FOLLOWING] = True
    dfg = pd_efg.apply(df, parameters=efg_parameters)

    performance_dfg = df_statistics.get_dfg_graph(
        df, case_id_glue=case_id_glue, activity_key=activity_key,
        timestamp_key=timestamp_key, start_timestamp_key=start_timestamp_key,
        measure="performance")
    sojourn_time = pd_soj_time.apply(df, parameters=parameters)
    concurrent_activities = pd_conc_act.apply(df, parameters=parameters)
    return (start_activities, end_activities, activities_occurrences, dfg,
            performance_dfg, sojourn_time, concurrent_activities)
Ejemplo n.º 6
0
def discover_dfg(log: Union[EventLog, pd.DataFrame]) -> Tuple[dict, dict, dict]:
    """
    Discovers a DFG from a log

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    dfg
        DFG
    start_activities
        Start activities
    end_activities
        End activities

    Raises
    --------------
    Exception
        If the input is not a traditional event log / dataframe
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    # compute the shared properties exactly once; the previous version
    # recomputed get_properties(log) three times in the event-log branch
    properties = get_properties(log)

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        # fall back to the standard XES keys when not configured in the properties
        activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key,
                            timestamp_key=timestamp_key,
                            case_id_glue=case_id_key)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
    else:
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        dfg = dfg_discovery.apply(log, parameters=properties)
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
    start_activities = start_activities_module.get_start_activities(log, parameters=properties)
    end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    return dfg, start_activities, end_activities
Ejemplo n.º 7
0
def get_start_activities(log):
    """
    Returns the start activities of a log along with their frequencies.

    Parameters
    ---------------
    log
        Log object (EventLog or pandas dataframe)

    Returns
    ---------------
    start_activities
        Dictionary of start activities along with their count
    """
    # pick the implementation matching the log representation
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.statistics.start_activities.pandas import get
    else:
        from pm4py.statistics.start_activities.log import get
    return get.get_start_activities(log)
Ejemplo n.º 8
0
def apply(log, parameters=None):
    """
    Apply the IMDF algorithm to a log, obtaining a Petri net together with an
    initial and a final marking.

    Parameters
    -----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    -----------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    """
    parameters = {} if parameters is None else parameters
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                              pmutil.constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              pmutil.xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               pmutil.xes_constants.DEFAULT_TIMESTAMP_KEY)

    if isinstance(log, pandas.core.frame.DataFrame):
        # dataframe route: build DFG and statistics directly from pandas
        dfg = df_statistics.get_dfg_graph(log, case_id_glue=case_id_glue,
                                          activity_key=activity_key,
                                          timestamp_key=timestamp_key)
        start_activities = pd_start_act_stats.get_start_activities(log, parameters=parameters)
        end_activities = pd_end_act_stats.get_end_activities(log, parameters=parameters)
        activities = pd_attributes_stats.get_attribute_values(log, activity_key, parameters=parameters)
        return apply_dfg(dfg, activities=activities,
                         start_activities=start_activities,
                         end_activities=end_activities,
                         parameters=parameters)

    # event-log route: convert, discover a process tree, translate to a Petri net
    event_log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    tree = apply_tree(event_log, parameters=parameters)
    return tree_to_petri.apply(tree)
Ejemplo n.º 9
0
def get_start_activities(log: Union[EventLog, pd.DataFrame]) -> Dict[str, int]:
    """
    Returns the start activities of a log along with their frequencies.

    Parameters
    ---------------
    log
        Log object (EventLog or pandas dataframe)

    Returns
    ---------------
    start_activities
        Dictionary of start activities along with their count

    Raises
    ---------------
    Exception
        If the input is not a traditional event log / dataframe
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    # pick the implementation matching the log representation
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.start_activities.pandas import get
    else:
        from pm4py.statistics.start_activities.log import get
    return get.get_start_activities(log, parameters=get_properties(log))
Ejemplo n.º 10
0
def filter_df_on_start_activities_nocc(df,
                                       nocc,
                                       sa_count0=None,
                                       case_id_glue=CASE_CONCEPT_NAME,
                                       activity_key=DEFAULT_NAME_KEY,
                                       grouped_df=None):
    """
    Keeps only the cases whose start activity occurs, as a start activity,
    at least ``nocc`` times in the dataframe.

    Parameters
    -----------
    df
        Dataframe
    nocc
        Minimum number of occurrences of the start activity
    sa_count0
        (if provided) Dictionary that associates each start activity with its count
    case_id_glue
        Column that contains the Case ID
    activity_key
        Column that contains the activity
    grouped_df
        Grouped dataframe

    Returns
    ------------
    df
        Filtered dataframe
    """
    if grouped_df is None:
        grouped_df = df.groupby(case_id_glue)
    first_events = grouped_df.first()
    if sa_count0 is None:
        # counts not supplied: compute them, reusing the same grouping
        sa_count0 = get_start_activities(df, parameters={
            Parameters.CASE_ID_KEY: case_id_glue,
            Parameters.ACTIVITY_KEY: activity_key,
            Parameters.GROUP_DATAFRAME: grouped_df
        })
    frequent_sa = [act for act, count in sa_count0.items() if count >= nocc]
    if len(frequent_sa) >= len(sa_count0):
        # every start activity passes the threshold: nothing to filter
        ret = df
    else:
        first_events = first_events[first_events[activity_key].isin(frequent_sa)]
        case_index = df.set_index(case_id_glue).index
        ret = df[case_index.isin(first_events.index)]

    # preserve the dataframe-level attributes on the filtered result
    ret.attrs = copy(df.attrs) if hasattr(df, 'attrs') else {}
    return ret
Ejemplo n.º 11
0
def execute_script():
    """
    DFG demo on the reviewing log: computes start/end activities, attribute
    counts, sojourn times, concurrent activities, then renders the frequency
    and performance DFGs and the Petri net converted from the frequency DFG.
    """
    # Fix: the previous version first read interval_event_log.csv into
    # `dataframe`, but that value was immediately overwritten below by
    # convert_to_dataframe(log) — the dead read has been removed.
    log_path = os.path.join("..", "tests", "input_data", "reviewing.xes")
    log = pm4py.read_xes(log_path)
    dataframe = pm4py.convert_to_dataframe(log)
    parameters = {}
    #parameters[constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = "start_timestamp"
    parameters[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = "time:timestamp"
    parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = "concept:name"
    parameters[constants.PARAMETER_CONSTANT_CASEID_KEY] = "case:concept:name"
    parameters["strict"] = True
    parameters["format"] = "svg"
    start_activities = sa_get.get_start_activities(dataframe,
                                                   parameters=parameters)
    end_activities = ea_get.get_end_activities(dataframe,
                                               parameters=parameters)
    att_count = att_get.get_attribute_values(dataframe,
                                             "concept:name",
                                             parameters=parameters)
    parameters["start_activities"] = start_activities
    parameters["end_activities"] = end_activities
    soj_time = soj_time_get.apply(dataframe, parameters=parameters)
    print("soj_time")
    print(soj_time)
    conc_act = conc_act_get.apply(dataframe, parameters=parameters)
    print("conc_act")
    print(conc_act)
    efg = efg_get.apply(dataframe, parameters=parameters)
    print("efg")
    print(efg)
    dfg_freq, dfg_perf = df_statistics.get_dfg_graph(
        dataframe, measure="both", start_timestamp_key="start_timestamp")
    # frequency view
    dfg_gv_freq = dfg_vis_fact.apply(dfg_freq,
                                     activities_count=att_count,
                                     variant=dfg_vis_fact.Variants.FREQUENCY,
                                     soj_time=soj_time,
                                     parameters=parameters)
    dfg_vis_fact.view(dfg_gv_freq)
    # performance view
    dfg_gv_perf = dfg_vis_fact.apply(dfg_perf,
                                     activities_count=att_count,
                                     variant=dfg_vis_fact.Variants.PERFORMANCE,
                                     soj_time=soj_time,
                                     parameters=parameters)
    dfg_vis_fact.view(dfg_gv_perf)
    # convert the frequency DFG to a Petri net and visualize it
    net, im, fm = dfg_conv.apply(dfg_freq)
    gviz = pn_vis.apply(net, im, fm, parameters=parameters)
    pn_vis.view(gviz)
Ejemplo n.º 12
0
def get_process_svg():
    """
    Builds the process model for the event log selected through the request
    parameters and returns its SVG serialization as a string.
    """
    parameters = __process_parameters(request.args.get("parameters"))
    log = __prepare_event_log(parameters)
    ext_type = parameters["ext_type"] if "ext_type" in parameters else "document_flow_log"
    log_type = __get_log_type_from_ext_type(ext_type)

    if log_type == 0:
        # MVP generic-framework discovery
        log.type = "succint"
        from pm4pymdl.algo.mvp.gen_framework import algorithm as discovery
        from pm4pymdl.visualization.mvp.gen_framework import visualizer as vis_factory
        model = discovery.apply(log,
                                model_type_variant="model3",
                                node_freq_variant="type31",
                                edge_freq_variant="type11")
        gviz = vis_factory.apply(model, parameters={"format": "svg"})
    elif log_type in (1, 2):
        import pandas as pd
        if type(log) is pd.DataFrame:
            # dataframe: compute DFG and endpoints with the pandas statistics
            from pm4py.objects.dfg.retrieval.pandas import get_dfg_graph
            from pm4py.statistics.start_activities.pandas import get as pd_sa_get
            from pm4py.statistics.end_activities.pandas import get as pd_ea_get
            dfg = get_dfg_graph(log)
            sa = pd_sa_get.get_start_activities(log)
            ea = pd_ea_get.get_end_activities(log)
        else:
            dfg, sa, ea = pm4py.discover_dfg(log)
        act_count = pm4py.get_attribute_values(log, "concept:name")
        dfg, sa, ea, act_count = dfg_filtering.filter_dfg_on_paths_percentage(
            dfg, sa, ea, act_count, 0.2, keep_all_activities=True)
        gviz = pm4py.visualization.dfg.visualizer.apply(
            dfg,
            activities_count=act_count,
            parameters={
                "format": "svg",
                "start_activities": sa,
                "end_activities": ea
            })

    # NOTE(review): gviz is unbound when log_type is neither 0, 1 nor 2 —
    # presumably unreachable given __get_log_type_from_ext_type; confirm upstream.
    return pm4py.visualization.dfg.visualizer.serialize(gviz).decode("utf-8")
Ejemplo n.º 13
0
def apply(log, parameters=None):
    """
    Apply the IMDF algorithm to a log obtaining a Petri net along with an initial and final marking

    Parameters
    -----------
    log
        Log (pandas dataframe, or anything convertible to an EventLog)
    parameters
        Parameters of the algorithm, including:
            pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    -----------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    """
    # Fix: the previous version filled the defaults into the caller's dict,
    # mutating a shared argument; work on a shallow copy instead.
    parameters = {} if parameters is None else dict(parameters)
    parameters.setdefault(pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes_util.DEFAULT_NAME_KEY)
    parameters.setdefault(pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes_util.DEFAULT_TIMESTAMP_KEY)
    parameters.setdefault(pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY, pmutil.constants.CASE_ATTRIBUTE_GLUE)

    if isinstance(log, pandas.core.frame.DataFrame):
        # dataframe route: build the DFG and statistics directly from pandas
        dfg = df_statistics.get_dfg_graph(log, case_id_glue=parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY],
                                          activity_key=parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
                                          timestamp_key=parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY])
        start_activities = pd_start_act_stats.get_start_activities(log, parameters=parameters)
        end_activities = pd_end_act_stats.get_end_activities(log, parameters=parameters)
        activities = pd_attributes_stats.get_attribute_values(log, parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY], parameters=parameters)
        return apply_dfg(dfg, activities=activities, start_activities=start_activities, end_activities=end_activities, parameters=parameters)

    # event-log route: convert, discover a process tree, translate to a Petri net
    log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    tree = apply_tree(log, parameters=parameters)
    net, initial_marking, final_marking = tree_to_petri.apply(tree)
    return net, initial_marking, final_marking
Ejemplo n.º 14
0
def apply_auto_filter(df, parameters=None):
    """
    Automatically filters the dataframe on its start activities, keeping the
    cases whose start activity is frequent enough.

    Parameters
    -----------
    df
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
            case_id_glue -> Case ID column in the dataframe
            activity_key -> Column that represents the activity
            decreasingFactor -> Decreasing factor that should be passed to the algorithm

    Returns
    -----------
    df
        Filtered dataframe
    """
    parameters = {} if parameters is None else parameters

    # dict.get replaces the membership-test ternaries of the original style
    case_id_glue = parameters.get(PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)
    activity_key = parameters.get(PARAMETER_CONSTANT_ACTIVITY_KEY, DEFAULT_NAME_KEY)
    decreasing_factor = parameters.get("decreasingFactor", filtering_constants.DECREASING_FACTOR)
    grouped_df = parameters.get(GROUPED_DATAFRAME, None)

    # derive the occurrence threshold from the sorted start-activity frequencies
    sa_counts = get_start_activities(df, parameters=parameters)
    sorted_sa = start_activities_common.get_sorted_start_activities_list(sa_counts)
    threshold = start_activities_common.get_start_activities_threshold(sorted_sa, decreasing_factor)

    return filter_df_on_start_activities_nocc(df, threshold, sa_count0=sa_counts,
                                              case_id_glue=case_id_glue,
                                              activity_key=activity_key,
                                              grouped_df=grouped_df)
Ejemplo n.º 15
0
def apply_pandas(df, parameters=None):
    """
    Discovers a Petri net using Heuristics Miner

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm,
        including: activity_key, case_id_glue, timestamp_key,
        dependency_thresh, and_measure_thresh, min_act_count, min_dfg_occurrences, dfg_pre_cleaning_noise_thresh,
        loops_length_two_thresh

    Returns
    ------------
    net
        Petri net
    im
        Initial marking
    fm
        Final marking

    Note
    ------------
    If the pandas loader check below fails, the function falls through and
    implicitly returns None instead of a (net, im, fm) triple.
    """
    if parameters is None:
        parameters = {}

    # guard: only proceed when the pandas package is importable
    if pkgutil.find_loader("pandas"):
        # resolve column names from the parameters, with XES defaults
        activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
        case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
        start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                         None)
        timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)

        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics, freq_triples as get_freq_triples
        from pm4py.statistics.attributes.pandas import get as pd_attributes
        from pm4py.statistics.start_activities.pandas import get as pd_sa_filter
        from pm4py.statistics.end_activities.pandas import get as pd_ea_filter

        # log statistics needed by the heuristics net construction
        start_activities = pd_sa_filter.get_start_activities(df, parameters=parameters)
        end_activities = pd_ea_filter.get_end_activities(df, parameters=parameters)
        activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)
        activities = list(activities_occurrences.keys())
        heu_net_decoration = exec_utils.get_param_value(Parameters.HEU_NET_DECORATION, parameters, "frequency")

        if timestamp_key in df:
            # timestamp column present: order events by timestamp within each case
            dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                              activity_key=activity_key, timestamp_key=timestamp_key,
                                              start_timestamp_key=start_timestamp_key)
            # window=2 DFG captures eventually-follows at distance two (length-two loops)
            dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                       activity_key=activity_key, timestamp_key=timestamp_key, window=2,
                                                       start_timestamp_key=start_timestamp_key)
            frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                                  activity_key=activity_key,
                                                                  timestamp_key=timestamp_key)

        else:
            # no timestamp column: rely on the original row order within each case
            dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                              activity_key=activity_key, sort_timestamp_along_case_id=False)
            dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                       activity_key=activity_key, sort_timestamp_along_case_id=False,
                                                       window=2)
            frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                                  activity_key=activity_key,
                                                                  timestamp_key=timestamp_key,
                                                                  sort_timestamp_along_case_id=False)

        # performance DFG is only needed when the heuristics net is decorated with performance
        performance_dfg = None
        if heu_net_decoration == "performance":
            performance_dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                          activity_key=activity_key, timestamp_key=timestamp_key,
                                                          start_timestamp_key=start_timestamp_key,
                                                          measure="performance")

        # build the heuristics net, then convert it into a Petri net
        heu_net = apply_heu_dfg(dfg, activities=activities, activities_occurrences=activities_occurrences,
                                start_activities=start_activities, end_activities=end_activities,
                                dfg_window_2=dfg_window_2,
                                freq_triples=frequency_triples, performance_dfg=performance_dfg, parameters=parameters)
        net, im, fm = hn_conv_alg.apply(heu_net, parameters=parameters)

        return net, im, fm
Ejemplo n.º 16
0
import pandas as pd

from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
from pm4py.statistics.attributes.pandas import get as attributes_get
from pm4py.statistics.start_activities.pandas import get as sa_get
from pm4py.statistics.end_activities.pandas import get as ea_get
from pm4py.visualization.dfg import visualizer

# this part is required because the dataframe provided by PowerBI has strings
dataset["time:timestamp"] = pd.to_datetime(dataset["time:timestamp"])

# frequency DFG plus the statistics needed to decorate the visualization
dfg = df_statistics.get_dfg_graph(dataset, measure="frequency")
activities_count = attributes_get.get_attribute_values(dataset, "concept:name")
start_activities = sa_get.get_start_activities(dataset)
end_activities = ea_get.get_end_activities(dataset)

gviz = visualizer.apply(dfg,
                        activities_count=activities_count,
                        variant=visualizer.Variants.FREQUENCY,
                        parameters={
                            "start_activities": start_activities,
                            "end_activities": end_activities
                        })
visualizer.matplotlib_view(gviz)
Ejemplo n.º 17
0
def apply_pandas(df, parameters=None):
    """
    Discovers a Petri net from a pandas dataframe using the Heuristics Miner.

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm,
        including: activity_key, case_id_glue, timestamp_key,
        dependency_thresh, and_measure_thresh, min_act_count, min_dfg_occurrences, dfg_pre_cleaning_noise_thresh,
        loops_length_two_thresh

    Returns
    ------------
    net
        Petri net
    im
        Initial marking
    fm
        Final marking
    """
    parameters = {} if parameters is None else parameters

    # resolve column names from the parameters, with XES defaults
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)

    # log statistics feeding the heuristics net construction
    start_activities = pd_sa_filter.get_start_activities(df, parameters=parameters)
    end_activities = pd_ea_filter.get_end_activities(df, parameters=parameters)
    activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)
    activities = list(activities_occurrences.keys())

    if timestamp_key in df:
        # timestamp column present: order events by timestamp within each case
        dfg = df_statistics.get_dfg_graph(
            df, case_id_glue=case_id_glue, activity_key=activity_key,
            timestamp_key=timestamp_key)
        dfg_window_2 = df_statistics.get_dfg_graph(
            df, case_id_glue=case_id_glue, activity_key=activity_key,
            timestamp_key=timestamp_key, window=2)
        frequency_triples = get_freq_triples.get_freq_triples(
            df, case_id_glue=case_id_glue, activity_key=activity_key,
            timestamp_key=timestamp_key)
    else:
        # no timestamp column: rely on the original row order within each case
        dfg = df_statistics.get_dfg_graph(
            df, case_id_glue=case_id_glue, activity_key=activity_key,
            sort_timestamp_along_case_id=False)
        dfg_window_2 = df_statistics.get_dfg_graph(
            df, case_id_glue=case_id_glue, activity_key=activity_key,
            sort_timestamp_along_case_id=False, window=2)
        frequency_triples = get_freq_triples.get_freq_triples(
            df, case_id_glue=case_id_glue, activity_key=activity_key,
            timestamp_key=timestamp_key, sort_timestamp_along_case_id=False)

    # build the heuristics net, then convert it into a Petri net
    heu_net = apply_heu_dfg(
        dfg, activities=activities,
        activities_occurrences=activities_occurrences,
        start_activities=start_activities,
        end_activities=end_activities,
        dfg_window_2=dfg_window_2,
        freq_triples=frequency_triples,
        parameters=parameters)
    return hn_conv_alg.apply(heu_net, parameters=parameters)