def apply(df, min_freq=0):
    if min_freq > 0:
        # Perspectives are the non-"event_" columns (one per object type).
        persps = [x for x in df.columns if not x.startswith("event_")]
        collation = []
        for persp in persps:
            red_df = df.dropna(subset=[persp])
            prevlen = len(red_df)
            while True:
                dfg = df_statistics.get_dfg_graph(
                    red_df,
                    activity_key="event_activity",
                    timestamp_key="event_timestamp",
                    case_id_glue=persp)
                dfg = [x for x in dfg if dfg[x] >= min_freq]
                param = {}
                param[PARAMETER_CONSTANT_CASEID_KEY] = persp
                param[PARAMETER_CONSTANT_ATTRIBUTE_KEY] = "event_activity"
                red_df = filter_paths(red_df, dfg, parameters=param)
                thislen = len(red_df)
                dfg = df_statistics.get_dfg_graph(
                    red_df,
                    activity_key="event_activity",
                    timestamp_key="event_timestamp",
                    case_id_glue=persp)
                if len(dfg) == 0 or min(
                        dfg.values()) >= min_freq or prevlen == thislen:
                    collation.append(red_df)
                    break
                prevlen = thislen
        return pd.concat(collation)
    return df
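A minimal way to exercise the function above, assuming the pm4py imports it relies on (df_statistics, filter_paths and the parameter constants) are in scope. The column layout follows the snippet's convention: event_* columns for the event attributes plus one column per object perspective (the "order" column here is a made-up example):

import pandas as pd

df = pd.DataFrame({
    "order": ["o1", "o1", "o1", "o2", "o2"],
    "event_id": ["e1", "e2", "e3", "e4", "e5"],
    "event_activity": ["create", "pay", "ship", "create", "pay"],
    "event_timestamp": pd.to_datetime([
        "2021-01-01 08:00", "2021-01-01 09:00", "2021-01-01 10:00",
        "2021-01-02 08:00", "2021-01-02 09:00"]),
})

# Iteratively filters until every surviving directly-follows edge occurs
# at least min_freq times (or the dataframe stops shrinking).
filtered = apply(df, min_freq=2)
print(filtered["event_activity"].tolist())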
Example 2
def execute_script():
    try:
        from pm4py.objects.log.importer.parquet import factory as parquet_importer
        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics

        log_path = os.path.join("..", "tests", "input_data", log_name)
        time1 = time.time()
        dataframe = parquet_importer.apply(
            log_path, parameters={"columns": allowed_columns})
        time2 = time.time()
        print(dataframe.columns)
        print(
            "time elapsed importing " + log_name + " on columns " +
            str(allowed_columns) + ": ", (time2 - time1))
        dfg1 = df_statistics.get_dfg_graph(dataframe,
                                           sort_timestamp_along_case_id=False)
        time3 = time.time()
        print(
            "time elapsed calculating the DFG on columns " +
            str(allowed_columns) + " : ", (time3 - time2))
        del dataframe
        time4 = time.time()
        dataframe = parquet_importer.apply(log_path)
        print(dataframe.columns)
        time5 = time.time()
        print("time interlapsed importing " + log_name + " (all columns): ",
              (time5 - time4))
        dfg2 = df_statistics.get_dfg_graph(dataframe,
                                           sort_timestamp_along_case_id=False)
        time6 = time.time()
        print("time interlapsed calculating the DFG on all columns : ",
              (time6 - time5))
    except Exception:
        traceback.print_exc()
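The script references the module-level names log_name and allowed_columns; a hedged driver with illustrative values (the file name and column list are assumptions, not taken from the original module):

import os
import time
import traceback

# Illustrative inputs; any parquet log with XES-style columns would do.
log_name = "receipt.parquet"
allowed_columns = ["case:concept:name", "concept:name", "time:timestamp"]

if __name__ == "__main__":
    execute_script()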
Example 3
def apply(df, activity, parameters=None):
    """
    Gets the time passed from each preceding activity

    Parameters
    -------------
    df
        Dataframe
    activity
        Activity that we are considering
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    dictio
        Dictionary containing a 'pre' key with the
        list of aggregated times from each preceding activity to the given activity
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)

    [dfg_frequency, dfg_performance
     ] = pandas.get_dfg_graph(df,
                              measure="both",
                              activity_key=activity_key,
                              case_id_glue=case_id_glue,
                              timestamp_key=timestamp_key,
                              start_timestamp_key=start_timestamp_key)

    pre = []
    sum_perf_pre = 0.0
    sum_acti_pre = 0.0

    for entry in dfg_performance.keys():
        if entry[1] == activity:
            pre.append([
                entry[0],
                float(dfg_performance[entry]),
                int(dfg_frequency[entry])
            ])
            sum_perf_pre = sum_perf_pre + float(
                dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_pre = sum_acti_pre + float(dfg_frequency[entry])

    perf_acti_pre = 0.0
    if sum_acti_pre > 0:
        perf_acti_pre = sum_perf_pre / sum_acti_pre

    return {"pre": pre, "pre_avg_perf": perf_acti_pre}
Example 4
def apply(log, parameters=None, variant=DEFAULT_VARIANT):
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.
                   PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_CASEID_KEY] = log_util.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame):
        dfg = df_statistics.get_dfg_graph(
            log,
            case_id_glue=parameters[
                pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY],
            activity_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY])
        return VERSIONS_DFG[variant](dfg, parameters=parameters)
    return VERSIONS[variant](log_conversion.apply(log, parameters,
                                                  log_conversion.TO_EVENT_LOG),
                             parameters)
Example 5
def transient_analysis_from_dataframe(df, delay, parameters=None):
    """
    Gets the transient analysis from a dataframe and a delay

    Parameters
    -------------
    df
        Pandas dataframe
    delay
        Time delay
    parameters
        Parameters of the algorithm

    Returns
    -------------
    transient_result
        Transient analysis result
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters[
        constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    case_id_glue = parameters[
        constants.
        PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    timestamp_key = parameters[
        constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY

    log = log_conv_factory.apply(df,
                                 variant=log_conv_factory.DF_TO_EVENT_LOG_1V,
                                 parameters=parameters)
    # gets the simple Petri net through simple miner
    net, im, fm = simple_factory.apply(log,
                                       parameters=parameters,
                                       classic_output=True)

    activities_count = dict(df.groupby(activity_key).size())
    dfg_performance = df_statistics.get_dfg_graph(df,
                                                  measure="performance",
                                                  perf_aggregation_key="mean",
                                                  case_id_glue=case_id_glue,
                                                  activity_key=activity_key,
                                                  timestamp_key=timestamp_key)

    spaths = get_shortest_paths(net)
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(
        net, dfg_performance, spaths, activities_count, variant="performance")

    # gets the stochastic map out of the dataframe and the Petri net
    s_map = smap_builder.get_map_exponential_from_aggstatistics(
        aggregated_statistics, parameters=parameters)

    return transient_analysis_from_petri_net_and_smap(net,
                                                      im,
                                                      s_map,
                                                      delay,
                                                      parameters=parameters)
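Given an event dataframe df with the XES default columns (as in the earlier sketches), the transient probabilities some delay ahead can be requested directly; 86400 seconds matches the one-day horizon used elsewhere in this collection:

transient_result = transient_analysis_from_dataframe(df, 86400)
print(transient_result)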
Example 6
def calculate_process_schema_from_df(dataframe, path_frequency,
                                     path_performance):
    activities_count = attributes_filter.get_attribute_values(
        dataframe, attribute_key=ACTIVITY_KEY)
    [dfg_frequency, dfg_performance
     ] = df_statistics.get_dfg_graph(dataframe,
                                     measure="both",
                                     perf_aggregation_key="median",
                                     case_id_glue=CASEID_GLUE,
                                     activity_key=ACTIVITY_KEY,
                                     timestamp_key=TIMEST_KEY,
                                     sort_caseid_required=False)
    net, initial_marking, final_marking = inductive_factory.apply_dfg(
        dfg_frequency)
    spaths = vis_trans_shortest_paths.get_shortest_paths(net)
    aggregated_statistics = vis_trans_shortest_paths.get_decorations_from_dfg_spaths_acticount(
        net, dfg_frequency, spaths, activities_count, variant="frequency")
    parameters_viz = {"format": "svg"}
    gviz = pn_vis_factory.apply(net,
                                initial_marking,
                                final_marking,
                                variant="frequency",
                                aggregated_statistics=aggregated_statistics,
                                parameters=parameters_viz)
    pn_vis_factory.save(gviz, path_frequency)
    aggregated_statistics = vis_trans_shortest_paths.get_decorations_from_dfg_spaths_acticount(
        net, dfg_performance, spaths, activities_count, variant="performance")
    parameters_viz = {"format": "svg"}
    gviz = pn_vis_factory.apply(net,
                                initial_marking,
                                final_marking,
                                variant="performance",
                                aggregated_statistics=aggregated_statistics,
                                parameters=parameters_viz)
    pn_vis_factory.save(gviz, path_performance)
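CASEID_GLUE, ACTIVITY_KEY and TIMEST_KEY are module-level constants in the original source; a hedged driver, with the XES defaults standing in for whatever the module actually defines:

# Assumed constants (the original module defines its own values).
CASEID_GLUE = "case:concept:name"
ACTIVITY_KEY = "concept:name"
TIMEST_KEY = "time:timestamp"

calculate_process_schema_from_df(dataframe, "schema_frequency.svg",
                                 "schema_performance.svg")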
Example 7
def discover_dfg(
        log: Union[EventLog, pd.DataFrame]) -> Tuple[dict, dict, dict]:
    """
    Discovers a DFG from a log

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    dfg
        DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        dfg = get_dfg_graph(log)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log)
        end_activities = end_activities_module.get_end_activities(log)
    else:
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        dfg = dfg_discovery.apply(log)
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log)
        end_activities = end_activities_module.get_end_activities(log)
    return dfg, start_activities, end_activities
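A usage sketch (the log path is illustrative):

import pm4py

log = pm4py.read_xes("tests/input_data/running-example.xes")
dfg, start_activities, end_activities = discover_dfg(log)
# The DFG maps (source activity, target activity) pairs to frequencies.
print(sorted(dfg.items(), key=lambda kv: kv[1], reverse=True)[:5])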
Example 8
def discover_abstraction_dataframe(df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Tuple[
    Any, Any, Any, Any, Any, Any, Any]:
    """
    Discovers an abstraction from a dataframe that is useful for the Heuristics Miner ++ algorithm

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.START_TIMESTAMP_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.CASE_ID_KEY

    Returns
    --------------
    start_activities
        Start activities
    end_activities
        End activities
    activities_occurrences
        Activities along with their number of occurrences
    dfg
        Directly-follows graph
    performance_dfg
        (Performance) Directly-follows graph
    sojourn_time
        Sojourn time for each activity
    concurrent_activities
        Concurrent activities
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                     None)
    if start_timestamp_key is None:
        start_timestamp_key = xes.DEFAULT_START_TIMESTAMP_KEY
        parameters = copy(parameters)
        parameters[Parameters.START_TIMESTAMP_KEY] = start_timestamp_key
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    start_activities = pd_sa.get_start_activities(df, parameters=parameters)
    end_activities = pd_ea.get_end_activities(df, parameters=parameters)
    activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)
    efg_parameters = copy(parameters)
    efg_parameters[pd_efg.Parameters.KEEP_FIRST_FOLLOWING] = True
    dfg = pd_efg.apply(df, parameters=efg_parameters)
    performance_dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                  activity_key=activity_key, timestamp_key=timestamp_key,
                                                  start_timestamp_key=start_timestamp_key, measure="performance")
    sojourn_time = pd_soj_time.apply(df, parameters=parameters)
    concurrent_activities = pd_conc_act.apply(df, parameters=parameters)
    return (
        start_activities, end_activities, activities_occurrences, dfg, performance_dfg, sojourn_time,
        concurrent_activities)
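The seven-element result unpacks in the documented order; a minimal sketch on a dataframe df with XES-style columns:

(start_activities, end_activities, activities_occurrences, dfg,
 performance_dfg, sojourn_time, concurrent_activities) = \
    discover_abstraction_dataframe(df)
print(len(dfg), "DFG edges,", len(concurrent_activities), "concurrent pairs")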
Example 9
def apply(df, parameters=None):
    """
    Discovers a footprint object from a dataframe
    (the footprints of the dataframe are returned)

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    --------------
    footprints_obj
        Footprints object
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    caseid_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                     None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    sort_required = exec_utils.get_param_value(Parameters.SORT_REQUIRED, parameters, DEFAULT_SORT_REQUIRED)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, DEFAULT_INDEX_KEY)

    columns = [caseid_key, activity_key, timestamp_key]
    if start_timestamp_key is not None:
        columns.append(start_timestamp_key)
    df = df[columns]
    if sort_required:
        df = pandas_utils.insert_index(df, index_key)
        if start_timestamp_key is not None:
            df = df.sort_values([caseid_key, start_timestamp_key, timestamp_key, index_key])
        else:
            df = df.sort_values([caseid_key, timestamp_key, index_key])

    grouped_df = df.groupby(caseid_key)
    dfg = df_statistics.get_dfg_graph(df, measure="frequency", activity_key=activity_key, case_id_glue=caseid_key,
                                      timestamp_key=timestamp_key, sort_caseid_required=False,
                                      sort_timestamp_along_case_id=False, start_timestamp_key=start_timestamp_key)
    activities = set(df[activity_key].unique())
    start_activities = set(grouped_df.first()[activity_key].unique())
    end_activities = set(grouped_df.last()[activity_key].unique())

    parallel = {(x, y) for (x, y) in dfg if (y, x) in dfg}
    sequence = set(causal_discovery.apply(dfg, causal_discovery.Variants.CAUSAL_ALPHA))

    ret = {}
    ret[Outputs.DFG.value] = dfg
    ret[Outputs.SEQUENCE.value] = sequence
    ret[Outputs.PARALLEL.value] = parallel
    ret[Outputs.ACTIVITIES.value] = activities
    ret[Outputs.START_ACTIVITIES.value] = start_activities
    ret[Outputs.END_ACTIVITIES.value] = end_activities
    ret[Outputs.MIN_TRACE_LENGTH.value] = int(grouped_df.size().min())

    return ret
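Assuming the usual string values behind the Outputs enum ("dfg", "sequence", "parallel", ...), the footprints can be inspected as follows:

fp = apply(df)
print(fp["dfg"])       # directly-follows counts
print(fp["sequence"])  # causal relations from the alpha causal discovery
print(fp["parallel"])  # pairs observed in both directions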
Example 10
def apply(log, parameters=None, variant=DFG_NATIVE):
    """
    Calculates DFG graph (frequency or performance) starting from a log

    Parameters
    ----------
    log
        Log
    parameters
        Possible parameters passed to the algorithms:
            aggregationMeasure -> performance aggregation measure (min, max, mean, median)
            activity_key -> Attribute to use as activity
            timestamp_key -> Attribute to use as timestamp
    variant
        Variant of the algorithm to use, possible values:
            native, frequency, performance, frequency_greedy, performance_greedy

    Returns
    -------
    dfg
        DFG graph
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.
                   PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_CASEID_KEY] = pmutil.constants.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame):
        log = csv_import_adapter.convert_timestamp_columns_in_df(
            log,
            timest_columns=[
                parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY]
            ])
        dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
            log,
            measure="both",
            activity_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY],
            case_id_glue=parameters[
                pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY])
        if 'native' in variant or 'frequency' in variant:
            return dfg_frequency
        else:
            return dfg_performance
    return VERSIONS[variant](log_conversion.apply(log, parameters,
                                                  log_conversion.TO_EVENT_LOG),
                             parameters=parameters)
Example 11
def apply(df, activity, parameters=None):
    """
    Gets the time passed from each preceding activity

    Parameters
    -------------
    df
        Dataframe
    activity
        Activity that we are considering
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    dictio
        Dictionary containing a 'pre' key with the
        list of aggregated times from each preceding activity to the given activity
    """
    if parameters is None:
        parameters = {}

    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY

    [dfg_frequency, dfg_performance
     ] = df_statistics.get_dfg_graph(df,
                                     measure="both",
                                     activity_key=activity_key,
                                     case_id_glue=case_id_glue,
                                     timestamp_key=timestamp_key)

    pre = []
    sum_perf_pre = 0.0
    sum_acti_pre = 0.0

    for entry in dfg_performance.keys():
        if entry[1] == activity:
            pre.append([
                entry[0],
                float(dfg_performance[entry]),
                int(dfg_frequency[entry])
            ])
            sum_perf_pre = sum_perf_pre + float(
                dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_pre = sum_acti_pre + float(dfg_frequency[entry])

    perf_acti_pre = 0.0
    if sum_acti_pre > 0:
        perf_acti_pre = sum_perf_pre / sum_acti_pre

    return {"pre": pre, "pre_avg_perf": perf_acti_pre}
Example 12
def execute_script():
    log_path = os.path.join("..", "tests", "input_data", log_name)
    time1 = time.time()
    dataframe = parquet_importer.apply(log_path, parameters={"columns": allowed_columns})
    time2 = time.time()
    print(dataframe.columns)
    print("time interlapsed importing "+log_name+" on columns "+str(allowed_columns)+": ",(time2-time1))
    dfg1 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
    time3 = time.time()
    print("time interlapsed calculating the DFG on columns "+str(allowed_columns)+" : ",(time3-time2))
    del dataframe
    time4 = time.time()
    dataframe = parquet_importer.apply(log_path)
    print(dataframe.columns)
    time5 = time.time()
    print("time interlapsed importing "+log_name+" (all columns): ",(time5-time4))
    dfg2 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
    time6 = time.time()
    print("time interlapsed calculating the DFG on all columns : ",(time6-time5))
Example 13
def apply(log, parameters=None, variant=DEFAULT_VARIANT):
    """
    Apply the Alpha Miner on top of a log

    Parameters
    -----------
    log
        Log
    variant
        Variant of the algorithm to use:
            - Variants.ALPHA_VERSION_CLASSIC
            - Variants.ALPHA_VERSION_PLUS
    parameters
        Possible parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Name of the attribute that contains the activity

    Returns
    -----------
    net
        Petri net
    marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, pmutil.constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_util.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes_util.DEFAULT_TIMESTAMP_KEY)

    if pkgutil.find_loader("pandas"):
        import pandas
        if isinstance(log, pandas.core.frame.DataFrame
                      ) and variant == ALPHA_VERSION_CLASSIC:
            dfg = df_statistics.get_dfg_graph(
                log,
                case_id_glue=case_id_glue,
                activity_key=activity_key,
                timestamp_key=timestamp_key,
                start_timestamp_key=start_timestamp_key)
            return exec_utils.get_variant(variant).apply_dfg(
                dfg, parameters=parameters)
    return exec_utils.get_variant(variant).apply(
        log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG),
        parameters)
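On the dataframe fast path the classic alpha variant is applied straight to the DFG; a short usage sketch (view_petri_net assumes a recent pm4py release with the simplified interface):

net, im, fm = apply(df)

import pm4py
pm4py.view_petri_net(net, im, fm)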
Example 14
def apply(log, parameters=None, variant=DEFAULT_VARIANT):
    """
    Calculates DFG graph (frequency or performance) starting from a log

    Parameters
    ----------
    log
        Log
    parameters
        Possible parameters passed to the algorithms:
            Parameters.AGGREGATION_MEASURE -> performance aggregation measure (min, max, mean, median)
            Parameters.ACTIVITY_KEY -> Attribute to use as activity
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp
    variant
        Variant of the algorithm to use, possible values:
            - Variants.NATIVE
            - Variants.FREQUENCY
            - Variants.FREQUENCY_GREEDY
            - Variants.PERFORMANCE
            - Variants.PERFORMANCE_GREEDY
            - Variants.FREQ_TRIPLES

    Returns
    -------
    dfg
        DFG graph
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_util.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_util.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, pmutil.constants.CASE_CONCEPT_NAME)

    if pkgutil.find_loader("pandas"):
        import pandas
        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
        from pm4py.objects.log.util import dataframe_utils

        if isinstance(log, pandas.core.frame.DataFrame) and not variant == Variants.FREQ_TRIPLES:
            log = dataframe_utils.convert_timestamp_columns_in_df(log, timest_columns=[
                timestamp_key])
            dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(log, measure="both",
                                                                         activity_key=activity_key,
                                                                         timestamp_key=timestamp_key,
                                                                         case_id_glue=case_id_glue,
                                                                         start_timestamp_key=start_timestamp_key)
            if variant in [Variants.PERFORMANCE, Variants.PERFORMANCE_GREEDY]:
                return dfg_performance
            else:
                return dfg_frequency

    return exec_utils.get_variant(variant).apply(log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG), parameters=parameters)
Example 15
def discover_performance_dfg(log: Union[EventLog, pd.DataFrame], business_hours: bool = False, worktiming: List[int] = [7, 17], weekends: List[int] = [6, 7], workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR) -> Tuple[dict, dict, dict]:
    """
    Discovers a performance directly-follows graph from an event log

    Parameters
    ---------------
    log
        Event log
    business_hours
        Enables/disables the computation based on the business hours (default: False)
    worktiming
        (If the business hours are enabled) The hour range in which the resources of the log are working (default: 7 to 17)
    weekends
        (If the business hours are enabled) The weekends days (default: Saturday (6), Sunday (7))

    Returns
    ---------------
    performance_dfg
        Performance DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]: raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        properties = get_properties(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_glue=case_id_key, measure="performance", perf_aggregation_key="all",
                            business_hours=business_hours, worktiming=worktiming, weekends=weekends, workcalendar=workcalendar)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    else:
        from pm4py.algo.discovery.dfg.variants import performance as dfg_discovery
        properties = get_properties(log)
        properties[dfg_discovery.Parameters.AGGREGATION_MEASURE] = "all"
        properties[dfg_discovery.Parameters.BUSINESS_HOURS] = business_hours
        properties[dfg_discovery.Parameters.WORKTIMING] = worktiming
        properties[dfg_discovery.Parameters.WEEKENDS] = weekends
        dfg = dfg_discovery.apply(log, parameters=properties)
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    return dfg, start_activities, end_activities
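Since the dataframe path requests perf_aggregation_key="all", each edge is expected to map to a dictionary of aggregations (typically mean, median, min, max, sum, stdev; an assumption about the adapter's output) rather than a single number. A hedged sketch:

dfg, sa, ea = discover_performance_dfg(log, business_hours=True,
                                       worktiming=[8, 16], weekends=[6, 7])
for edge, stats in list(dfg.items())[:3]:
    print(edge, stats.get("mean"), stats.get("median"))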
Example 16
def apply(log, parameters=None):
    """
    Apply the IMDF algorithm to a log obtaining a Petri net along with an initial and final marking

    Parameters
    -----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    -----------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, pmutil.constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters,
        pmutil.xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        pmutil.xes_constants.DEFAULT_TIMESTAMP_KEY)
    if isinstance(log, pandas.core.frame.DataFrame):
        dfg = df_statistics.get_dfg_graph(log,
                                          case_id_glue=case_id_glue,
                                          activity_key=activity_key,
                                          timestamp_key=timestamp_key)
        start_activities = pd_start_act_stats.get_start_activities(
            log, parameters=parameters)
        end_activities = pd_end_act_stats.get_end_activities(
            log, parameters=parameters)
        activities = pd_attributes_stats.get_attribute_values(
            log, activity_key, parameters=parameters)
        return apply_dfg(dfg,
                         activities=activities,
                         start_activities=start_activities,
                         end_activities=end_activities,
                         parameters=parameters)
    log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    tree = apply_tree(log, parameters=parameters)
    net, initial_marking, final_marking = tree_to_petri.apply(tree)
    return net, initial_marking, final_marking
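A sketch of the dataframe entry point, with the parameter keys taken from the function itself and illustrative XES column names:

parameters = {Parameters.CASE_ID_KEY: "case:concept:name",
              Parameters.ACTIVITY_KEY: "concept:name",
              Parameters.TIMESTAMP_KEY: "time:timestamp"}
net, initial_marking, final_marking = apply(df, parameters=parameters)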
Example 17
def calculate_dfg(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters[
        PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    columns = get_columns_to_import(filters,
                                    [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY],
                                    use_transition=use_transition)

    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(
            parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        activity_key, parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
                pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)

    parquet_list = parquet_importer.get_list_parquet(folder)
    overall_dfg = Counter()
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq,
                                      columns,
                                      filters,
                                      use_transition=use_transition,
                                      parameters=parameters)
            dfg = Counter(
                df_statistics.get_dfg_graph(df,
                                            activity_key=activity_key,
                                            sort_timestamp_along_case_id=False,
                                            sort_caseid_required=False))
            overall_dfg = overall_dfg + dfg
            if count >= no_samples:
                break

    returned_dict = {}
    for el in overall_dfg:
        returned_dict[el[0] + "@@" + el[1]] = overall_dfg[el]
    return returned_dict
Example 18
def apply(log, parameters=None, variant=ALPHA_VERSION_CLASSIC):
    """
    Apply the Alpha Miner on top of a log

    Parameters
    -----------
    log
        Log
    variant
        Variant of the algorithm to use (classic)
    parameters
        Possible parameters of the algorithm, including:
            activity key -> Name of the attribute that contains the activity

    Returns
    -----------
    net
        Petri net
    marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.
                   PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_CASEID_KEY] = log_util.CASE_ATTRIBUTE_GLUE
    if isinstance(
            log,
            pandas.core.frame.DataFrame) and variant == ALPHA_VERSION_CLASSIC:
        dfg = df_statistics.get_dfg_graph(
            log,
            case_id_glue=parameters[
                pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY],
            activity_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY])
        return VERSIONS_DFG[variant](dfg, parameters=parameters)
    return VERSIONS[variant](log_conversion.apply(log, parameters,
                                                  log_conversion.TO_EVENT_LOG),
                             parameters)
Example 19
def calculate_performance_dfg(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else PARAMETER_PM4PYWS_CLASSIFIER
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY],
                                    use_transition=use_transition)

    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        activity_key, parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
                                                                                         pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)

    parquet_list = parquet_importer.get_list_parquet(folder)
    frequency_dfg = Counter()
    performance_dfg = Counter()
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)

            f_dfg, p_dfg = df_statistics.get_dfg_graph(df, activity_key=activity_key,
                                                       sort_timestamp_along_case_id=False, sort_caseid_required=False,
                                                       measure="both")
            f_dfg = Counter(f_dfg)

            for k in p_dfg:
                if k not in performance_dfg:
                    performance_dfg[k] = p_dfg[k]
                else:
                    performance_dfg[k] = (frequency_dfg[k] * performance_dfg[k] + f_dfg[k] * p_dfg[k]) / (
                                frequency_dfg[k] + f_dfg[k])

            frequency_dfg = frequency_dfg + f_dfg
            if count >= no_samples:
                break

    returned_dict = {}
    for el in performance_dfg:
        returned_dict[el[0] + "@@" + el[1]] = performance_dfg[el]
    return returned_dict
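The merge step keeps a running frequency-weighted mean per edge, so the result is independent of chunk order. A standalone check of the arithmetic:

# Edge ("a", "b") observed in two chunks:
# chunk 1: frequency 2, mean duration 10.0 s
# chunk 2: frequency 3, mean duration 20.0 s
merged = (2 * 10.0 + 3 * 20.0) / (2 + 3)
assert merged == 16.0  # equals the mean over all five observations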
Example 20
def execute_script():
    log_path = os.path.join("..", "tests", "input_data",
                            "interval_event_log.csv")
    dataframe = pm4py.read_csv(log_path)
    log_path = os.path.join("..", "tests", "input_data", "reviewing.xes")
    log = pm4py.read_xes(log_path)
    dataframe = pm4py.convert_to_dataframe(log)
    parameters = {}
    #parameters[constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = "start_timestamp"
    parameters[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = "time:timestamp"
    parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = "concept:name"
    parameters[constants.PARAMETER_CONSTANT_CASEID_KEY] = "case:concept:name"
    parameters["strict"] = True
    parameters["format"] = "svg"
    start_activities = sa_get.get_start_activities(dataframe,
                                                   parameters=parameters)
    end_activities = ea_get.get_end_activities(dataframe,
                                               parameters=parameters)
    att_count = att_get.get_attribute_values(dataframe,
                                             "concept:name",
                                             parameters=parameters)
    parameters["start_activities"] = start_activities
    parameters["end_activities"] = end_activities
    soj_time = soj_time_get.apply(dataframe, parameters=parameters)
    print("soj_time")
    print(soj_time)
    conc_act = conc_act_get.apply(dataframe, parameters=parameters)
    print("conc_act")
    print(conc_act)
    efg = efg_get.apply(dataframe, parameters=parameters)
    print("efg")
    print(efg)
    dfg_freq, dfg_perf = df_statistics.get_dfg_graph(
        dataframe, measure="both", start_timestamp_key="start_timestamp")
    dfg_gv_freq = dfg_vis_fact.apply(dfg_freq,
                                     activities_count=att_count,
                                     variant=dfg_vis_fact.Variants.FREQUENCY,
                                     soj_time=soj_time,
                                     parameters=parameters)
    dfg_vis_fact.view(dfg_gv_freq)
    dfg_gv_perf = dfg_vis_fact.apply(dfg_perf,
                                     activities_count=att_count,
                                     variant=dfg_vis_fact.Variants.PERFORMANCE,
                                     soj_time=soj_time,
                                     parameters=parameters)
    dfg_vis_fact.view(dfg_gv_perf)
    net, im, fm = dfg_conv.apply(dfg_freq)
    gviz = pn_vis.apply(net, im, fm, parameters=parameters)
    pn_vis.view(gviz)
Example 21
def get(df):
    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except Exception:
        pass
    # Frequency of each activity, counting each event exactly once.
    activ = dict(
        df.groupby("event_id").first()["event_activity"].value_counts())
    max_activ_freq = max(activ.values()) if len(activ.values()) > 0 else 0
    # Maximum directly-follows edge frequency across the object perspectives
    # (the non-"event_" columns).
    persps = [x for x in df.columns if not x.startswith("event_")]
    max_edge_freq = 0
    for persp in persps:
        red_df = df.dropna(subset=[persp])
        dfg = df_statistics.get_dfg_graph(red_df,
                                          activity_key="event_activity",
                                          timestamp_key="event_timestamp",
                                          case_id_glue=persp)
        if len(dfg) > 0:
            max_edge_freq = max(max_edge_freq, max(dfg.values()))
    return {"max_activ_freq": max_activ_freq, "max_edge_freq": max_edge_freq}
Example 22
def discover_dfg(log: Union[EventLog, pd.DataFrame]) -> Tuple[dict, dict, dict]:
    """
    Discovers a DFG from a log

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    dfg
        DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]: raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        properties = get_properties(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key,
                            timestamp_key=timestamp_key,
                            case_id_glue=case_id_key)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    else:
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        dfg = dfg_discovery.apply(log, parameters=get_properties(log))
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=get_properties(log))
        end_activities = end_activities_module.get_end_activities(log, parameters=get_properties(log))
    return dfg, start_activities, end_activities
Example 23
def apply(log, parameters=None):
    """
    Apply the IMDF algorithm to a log obtaining a Petri net along with an initial and final marking

    Parameters
    -----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY -> attribute of the log to use as activity name
            (default concept:name)

    Returns
    -----------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY] = pmutil.constants.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame):
        dfg = df_statistics.get_dfg_graph(log, case_id_glue=parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY],
                                          activity_key=parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
                                          timestamp_key=parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY])
        start_activities = pd_start_act_stats.get_start_activities(log, parameters=parameters)
        end_activities = pd_end_act_stats.get_end_activities(log, parameters=parameters)
        activities = pd_attributes_stats.get_attribute_values(log, parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY], parameters=parameters)
        return apply_dfg(dfg, activities=activities, start_activities=start_activities, end_activities=end_activities, parameters=parameters)
    log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    tree = apply_tree(log, parameters=parameters)
    net, initial_marking, final_marking = tree_to_petri.apply(tree)
    return net, initial_marking, final_marking
Example 24
def execute_script():
    log_path = os.path.join("..", "tests", "input_data", "running-example.csv")
    dataframe = pandas_df_imp.import_dataframe_from_path(log_path)
    activities_count = dict(dataframe.groupby("concept:name").size())
    [dfg_frequency, dfg_performance] = df_statistics.get_dfg_graph(dataframe, measure="both",
                                                                   perf_aggregation_key="mean")
    net, initial_marking, final_marking = alpha_miner.apply_dfg(dfg_frequency)
    spaths = get_shortest_paths(net)
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(net, dfg_performance,
                                                                      spaths,
                                                                      activities_count,
                                                                      variant="performance")
    # obtain stochastic information for transitions in the model
    s_map = stochastic_map.get_map_exponential_from_aggstatistics(aggregated_statistics)
    # gets the reachability graph from the Petri net
    reachab_graph = construct_reachability_graph(net, initial_marking)
    # get the tangible reachability graph from the reachability graph and the stochastic map
    tang_reach_graph = tangible_reachability.get_tangible_reachability_from_reachability(reachab_graph, s_map)
    # visualize the tangible reachability graph on the screen
    viz = ts_vis_factory.apply(tang_reach_graph, parameters={"format": "svg", "show_labels": True, "show_names": True})
    ts_vis_factory.view(viz)
    # gets the Q matrix assuming exponential distributions
    q_matrix = ctmc.get_q_matrix_from_tangible_exponential(tang_reach_graph, s_map)
    # pick a state to start from
    states = sorted(list(tang_reach_graph.states), key=lambda x: x.name)
    state = states[0]
    print("\n\nstarting from state = ", state.name)
    # do transient analysis after 1 day
    transient_result = ctmc.transient_analysis_from_tangible_q_matrix_and_single_state(tang_reach_graph, q_matrix,
                                                                                       state, 86400)
    print("\nprobability for each state after 1 day = ", transient_result)
    # do transient analysis after 10 days
    transient_result = ctmc.transient_analysis_from_tangible_q_matrix_and_single_state(tang_reach_graph, q_matrix,
                                                                                       state, 864000)
    print("\nprobability for each state after 10 days = ", transient_result)
    # do transient analysis after 100 days
    transient_result = ctmc.transient_analysis_from_tangible_q_matrix_and_single_state(tang_reach_graph, q_matrix,
                                                                                       state, 8640000)
    print("\nprobability for each state after 100 days = ", transient_result)
    steady_state = ctmc.steadystate_analysis_from_tangible_q_matrix(tang_reach_graph, q_matrix)
    print("\nsteady state = ", steady_state)
Example 25
    def get_paths(self, attribute_key, parameters=None):
        """
        Gets the paths from the log

        Parameters
        -------------
        attribute_key
            Attribute key

        Returns
        -------------
        paths
            Dictionary associating each path (pair of directly-following
            activity values) with its number of occurrences
        """
        if parameters is None:
            parameters = {}

        dfg = df_statistics.get_dfg_graph(self.dataframe, activity_key=attribute_key,
                                          timestamp_key=DEFAULT_TIMESTAMP_KEY,
                                          case_id_glue=CASE_CONCEPT_NAME, sort_caseid_required=False,
                                          sort_timestamp_along_case_id=False)
        return dfg
Example 26
def apply(df: pd.DataFrame,
          activity: str,
          parameters: Optional[Dict[Any, Any]] = None) -> Dict[str, Any]:
    """
    Gets the time passed from each preceding activity and to each succeeding activity

    Parameters
    -------------
    df
        Dataframe
    activity
        Activity that we are considering
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    dictio
        Dictionary containing a 'pre' key with the
        list of aggregated times from each preceding activity to the given activity
        and a 'post' key with the list of aggregated times from the given activity
        to each succeeding activity
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)

    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS,
                                                parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters,
                                            [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters,
                                          [6, 7])
    workcalendar = exec_utils.get_param_value(
        Parameters.WORKCALENDAR, parameters,
        constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR)

    [dfg_frequency, dfg_performance
     ] = pandas.get_dfg_graph(df,
                              measure="both",
                              activity_key=activity_key,
                              case_id_glue=case_id_glue,
                              timestamp_key=timestamp_key,
                              start_timestamp_key=start_timestamp_key,
                              business_hours=business_hours,
                              worktiming=worktiming,
                              weekends=weekends,
                              workcalendar=workcalendar)

    pre = []
    sum_perf_post = 0.0
    sum_acti_post = 0.0
    post = []
    sum_perf_pre = 0.0
    sum_acti_pre = 0.0

    for entry in dfg_performance.keys():
        if entry[1] == activity:
            pre.append([
                entry[0],
                float(dfg_performance[entry]),
                int(dfg_frequency[entry])
            ])
            sum_perf_pre = sum_perf_pre + float(
                dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_pre = sum_acti_pre + float(dfg_frequency[entry])
        if entry[0] == activity:
            post.append([
                entry[1],
                float(dfg_performance[entry]),
                int(dfg_frequency[entry])
            ])
            sum_perf_post = sum_perf_post + float(
                dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_post = sum_acti_post + float(dfg_frequency[entry])

    perf_acti_pre = 0.0
    if sum_acti_pre > 0:
        perf_acti_pre = sum_perf_pre / sum_acti_pre
    perf_acti_post = 0.0
    if sum_acti_post > 0:
        perf_acti_post = sum_perf_post / sum_acti_post

    return {
        "pre": pre,
        "post": post,
        "post_avg_perf": perf_acti_post,
        "pre_avg_perf": perf_acti_pre
    }
Example 27
def apply(dataframe, parameters=None):
    """
    Gets the performance DFG

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    base64
        Base64 of an SVG representing the model
    model
        Text representation of the model
    format
        Format of the model
    """
    if parameters is None:
        parameters = {}

    decreasingFactor = parameters[
        "decreasingFactor"] if "decreasingFactor" in parameters else constants.DEFAULT_DEC_FACTOR

    activity_key = parameters[
        pm4_constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        pm4_constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    case_id_glue = parameters[
        pm4_constants.
        PARAMETER_CONSTANT_CASEID_KEY] if pm4_constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME

    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=activity_key,
        max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(
        dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())
    [dfg, dfg_perf
     ] = df_statistics.get_dfg_graph(dataframe,
                                     activity_key=activity_key,
                                     timestamp_key=timestamp_key,
                                     case_id_glue=case_id_glue,
                                     sort_caseid_required=False,
                                     sort_timestamp_along_case_id=False,
                                     measure="both")
    activities_count = attributes_filter.get_attribute_values(
        dataframe, activity_key, parameters=parameters)
    activities = list(activities_count.keys())
    dfg = clean_dfg_based_on_noise_thresh(
        dfg,
        activities,
        decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)
    dfg_perf = {x: y for x, y in dfg_perf.items() if x in dfg}
    start_activities = list(
        start_activities_filter.get_start_activities(
            dataframe, parameters=parameters).keys())
    gviz = dfg_vis_factory.apply(dfg_perf,
                                 activities_count=activities_count,
                                 variant="performance",
                                 parameters={
                                     "format": "svg",
                                     "start_activities": start_activities,
                                     "end_activities": end_activities
                                 })

    gviz_base64 = base64.b64encode(str(gviz).encode('utf-8'))

    ret_graph = get_graph.get_graph_from_dfg(dfg, start_activities,
                                             end_activities)

    net, im, fm = dfg_conv_factory.apply(dfg,
                                         parameters={
                                             "start_activities":
                                             start_activities,
                                             "end_activities": end_activities
                                         })

    return get_base64_from_gviz(gviz), export_petri_as_string(
        net, im, fm
    ), ".pnml", "parquet", activities, start_activities, end_activities, gviz_base64, ret_graph, "dfg", "perf", None, "", activity_key
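
A hedged sketch of consuming this service function; only the unpacking order is taken from the return statement above, while the parquet import mirrors the earlier examples and the file path is illustrative.

from pm4py.objects.log.importer.parquet import factory as parquet_importer

dataframe = parquet_importer.apply("receipt.parquet")  # hypothetical path
(svg_b64, model_str, model_ext, log_fmt, activities, start_acts, end_acts,
 gviz_b64, graph, schema, metric, extra, ext2, act_key) = apply(dataframe)
print(model_ext, schema, metric)  # ".pnml", "dfg", "perf"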
Example n. 28
def calculate_process_schema_composite_object(path,
                                              log_name,
                                              managed_logs,
                                              parameters=None):
    if parameters is None:
        parameters = {}

    performance_required = parameters[
        "performance_required"] if "performance_required" in parameters else False
    no_samples = parameters[
        PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    if performance_required:
        columns = get_columns_to_import(
            filters,
            [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY],
            use_transition=use_transition)
    else:
        columns = get_columns_to_import(filters,
                                        [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY],
                                        use_transition=use_transition)

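    # if a custom attribute key is supplied, import that column as well and
    # swap it with the activity key, so the schema is computed over the
    # requested attribute instead of the activity name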
    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(
            parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        activity_key, parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
                pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)

    parquet_list = parquet_importer.get_list_parquet(folder)
    frequency_dfg = Counter()
    performance_dfg = Counter()
    overall_ea = Counter()
    overall_sa = Counter()
    values = Counter()
    events = 0
    cases = 0

    count = 0
    for pq in parquet_list:
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq,
                                      columns,
                                      filters,
                                      use_transition=use_transition,
                                      parameters=parameters)

            if performance_required:
                f_dfg, p_dfg = df_statistics.get_dfg_graph(
                    df,
                    activity_key=activity_key,
                    sort_timestamp_along_case_id=False,
                    sort_caseid_required=False,
                    measure="both")
            else:
                f_dfg = df_statistics.get_dfg_graph(
                    df,
                    activity_key=activity_key,
                    sort_timestamp_along_case_id=False,
                    sort_caseid_required=False)

            f_dfg = Counter(f_dfg)

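            # merge this chunk's performance DFG into the running one as a
            # frequency-weighted average: the accumulated mean is weighted by
            # the frequencies seen so far, the chunk's mean by its frequencies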
            if performance_required:
                for k in p_dfg:
                    if k not in performance_dfg:
                        performance_dfg[k] = p_dfg[k]
                    else:
                        performance_dfg[k] = (
                            frequency_dfg[k] * performance_dfg[k] + f_dfg[k] *
                            p_dfg[k]) / (frequency_dfg[k] + f_dfg[k])

            frequency_dfg = frequency_dfg + f_dfg
            ea = Counter(
                end_activities_filter.get_end_activities(
                    df, parameters=parameters))
            overall_ea = overall_ea + ea
            sa = Counter(
                start_activities_filter.get_start_activities(
                    df, parameters=parameters))
            overall_sa = overall_sa + sa
            values = values + Counter(dict(df[activity_key].value_counts()))
            events = events + len(df)
            cases = cases + df[CASE_CONCEPT_NAME].nunique()

            if count >= no_samples:
                break

    returned_dict = {}
    returned_dict["events"] = events
    returned_dict["cases"] = cases
    values = dict(values)
    for el in values:
        values[el] = int(values[el])
    returned_dict["activities"] = values
    overall_sa = dict(overall_sa)
    for el in overall_sa:
        overall_sa[el] = int(overall_sa[el])
    returned_dict["start_activities"] = overall_sa
    overall_ea = dict(overall_ea)
    for el in overall_ea:
        overall_ea[el] = int(overall_ea[el])
    returned_dict["end_activities"] = overall_ea
    returned_dict_freq = {}
    for el in frequency_dfg:
        returned_dict_freq[el[0] + "@@" + el[1]] = int(frequency_dfg[el])
    returned_dict["frequency_dfg"] = returned_dict_freq
    if performance_required:
        returned_dict_perf = {}
        for el in performance_dfg:
            returned_dict_perf[el[0] + "@@" + el[1]] = float(
                performance_dfg[el])
        returned_dict["performance_dfg"] = returned_dict_perf

    return returned_dict
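
A self-contained sketch (with illustrative numbers) of the frequency-weighted merge used above, showing how an edge's running mean duration and a new chunk's mean combine.

from collections import Counter

freq_total = Counter({("a", "b"): 4})     # frequencies accumulated so far
perf_total = Counter({("a", "b"): 10.0})  # running mean durations
f_chunk = Counter({("a", "b"): 6})        # this chunk's frequencies
p_chunk = Counter({("a", "b"): 20.0})     # this chunk's mean durations

for k in p_chunk:
    if k not in perf_total:
        perf_total[k] = p_chunk[k]
    else:
        perf_total[k] = (freq_total[k] * perf_total[k] +
                         f_chunk[k] * p_chunk[k]) / (freq_total[k] + f_chunk[k])
freq_total = freq_total + f_chunk

print(perf_total[("a", "b")])  # (4 * 10.0 + 6 * 20.0) / 10 = 16.0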
Example n. 29
def apply(dataframe, parameters=None):
    """
    Gets the Petri net through Inductive Miner, decorated by performance metric

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    tuple
        Tuple whose first entries are the base64 of an SVG representing
        the model, the text (PNML) representation of the model and the
        format of the model, followed by the activities, the start/end
        activities, the BPMN string and further metadata (see the return
        statement below)
    """
    if parameters is None:
        parameters = {}

    decreasingFactor = parameters[
        "decreasingFactor"] if "decreasingFactor" in parameters else constants.DEFAULT_DEC_FACTOR

    activity_key = parameters[
        pm4_constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        pm4_constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    case_id_glue = parameters[
        pm4_constants.
        PARAMETER_CONSTANT_CASEID_KEY] if pm4_constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME

    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=activity_key,
        max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(
        dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())

    activities_count = attributes_filter.get_attribute_values(
        dataframe, activity_key, parameters=parameters)
    activities = list(activities_count.keys())
    start_activities = list(
        start_activities_filter.get_start_activities(
            dataframe, parameters=parameters).keys())

    [dfg, dfg_perf
     ] = df_statistics.get_dfg_graph(dataframe,
                                     activity_key=activity_key,
                                     timestamp_key=timestamp_key,
                                     case_id_glue=case_id_glue,
                                     sort_caseid_required=False,
                                     sort_timestamp_along_case_id=False,
                                     measure="both")
    dfg = clean_dfg_based_on_noise_thresh(
        dfg,
        activities,
        decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)
    dfg_perf = {x: y for x, y in dfg_perf.items() if x in dfg}

    net, im, fm = inductive_miner.apply_dfg(dfg,
                                            parameters,
                                            activities=activities,
                                            start_activities=start_activities,
                                            end_activities=end_activities)
    spaths = get_shortest_paths(net)

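    # convert the Petri net to a BPMN graph and project the performance
    # decorations (computed along the shortest paths) onto its elements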
    bpmn_graph, el_corr, inv_el_corr, el_corr_keys_map = petri_to_bpmn.apply(
        net, im, fm)

    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(
        net, dfg_perf, spaths, activities_count, variant="performance")

    bpmn_aggreg_statistics = convert_performance_map.convert_performance_map_to_bpmn(
        aggregated_statistics, inv_el_corr)
    #bpmn_graph = bpmn_embedding.embed_info_into_bpmn(bpmn_graph, bpmn_aggreg_statistics, "performance")
    bpmn_graph = bpmn_diagram_layouter.apply(bpmn_graph)
    bpmn_string = bpmn_exporter.get_string_from_bpmn(bpmn_graph)

    gviz = bpmn_vis_factory.apply_petri(
        net,
        im,
        fm,
        aggregated_statistics=aggregated_statistics,
        variant="performance",
        parameters={"format": "svg"})
    gviz2 = bpmn_vis_factory.apply_petri(
        net,
        im,
        fm,
        aggregated_statistics=aggregated_statistics,
        variant="performance",
        parameters={"format": "dot"})

    gviz_base64 = get_base64_from_file(gviz2.name)

    ret_graph = get_graph.get_graph_from_petri(net, im, fm)

    return get_base64_from_file(gviz.name), export_petri_as_string(
        net, im, fm
    ), ".pnml", "parquet", activities, start_activities, end_activities, gviz_base64, ret_graph, "indbpmn", "perf", bpmn_string, ".bpmn", activity_key
Example n. 30
def apply_pandas(df, parameters=None):
    """
    Discovers a Petri net using Heuristics Miner

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm,
        including: activity_key, case_id_glue, timestamp_key,
        dependency_thresh, and_measure_thresh, min_act_count, min_dfg_occurrences, dfg_pre_cleaning_noise_thresh,
        loops_length_two_thresh

    Returns
    ------------
    net
        Petri net
    im
        Initial marking
    fm
        Final marking
    """
    if parameters is None:
        parameters = {}

    if pkgutil.find_loader("pandas"):
        activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
        case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
        start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
                                                         None)
        timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)

        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics, freq_triples as get_freq_triples
        from pm4py.statistics.attributes.pandas import get as pd_attributes
        from pm4py.statistics.start_activities.pandas import get as pd_sa_filter
        from pm4py.statistics.end_activities.pandas import get as pd_ea_filter

        start_activities = pd_sa_filter.get_start_activities(df, parameters=parameters)
        end_activities = pd_ea_filter.get_end_activities(df, parameters=parameters)
        activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)
        activities = list(activities_occurrences.keys())
        heu_net_decoration = exec_utils.get_param_value(Parameters.HEU_NET_DECORATION, parameters, "frequency")

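        # with a timestamp column available, events are ordered by time within
        # each case; otherwise the existing row order is trusted
        # (sort_timestamp_along_case_id=False) and no durations can be derived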
        if timestamp_key in df:
            dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                              activity_key=activity_key, timestamp_key=timestamp_key,
                                              start_timestamp_key=start_timestamp_key)
            dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                       activity_key=activity_key, timestamp_key=timestamp_key, window=2,
                                                       start_timestamp_key=start_timestamp_key)
            frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                                  activity_key=activity_key,
                                                                  timestamp_key=timestamp_key)

        else:
            dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                              activity_key=activity_key, sort_timestamp_along_case_id=False)
            dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                       activity_key=activity_key, sort_timestamp_along_case_id=False,
                                                       window=2)
            frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                                  activity_key=activity_key,
                                                                  timestamp_key=timestamp_key,
                                                                  sort_timestamp_along_case_id=False)

        performance_dfg = None
        if heu_net_decoration == "performance":
            performance_dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                          activity_key=activity_key, timestamp_key=timestamp_key,
                                                          start_timestamp_key=start_timestamp_key,
                                                          measure="performance")

        heu_net = apply_heu_dfg(dfg, activities=activities, activities_occurrences=activities_occurrences,
                                start_activities=start_activities, end_activities=end_activities,
                                dfg_window_2=dfg_window_2,
                                freq_triples=frequency_triples, performance_dfg=performance_dfg, parameters=parameters)
        net, im, fm = hn_conv_alg.apply(heu_net, parameters=parameters)

        return net, im, fm
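
A hedged end-to-end sketch for this entry point: the CSV path and the original column names are assumptions, and only apply_pandas together with pm4py's default column conventions comes from the code above.

import pandas as pd

df = pd.read_csv("event_log.csv")  # hypothetical log
df = df.rename(columns={"case": "case:concept:name",
                        "activity": "concept:name",
                        "timestamp": "time:timestamp"})
df["time:timestamp"] = pd.to_datetime(df["time:timestamp"])

net, im, fm = apply_pandas(df)
print(len(net.places), len(net.transitions), len(net.arcs))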