Example no. 1
 def test_statistics_log(self):
     log = pm4py.read_xes("input_data/running-example.xes")
     pm4py.get_start_activities(log)
     pm4py.get_end_activities(log)
     pm4py.get_attributes(log)
     pm4py.get_trace_attributes(log)
     pm4py.get_attribute_values(log, "org:resource")
     pm4py.get_variants(log)
Example no. 2
 def test_statistics_df(self):
     df = pd.read_csv("input_data/running-example.csv")
     df = pm4py.format_dataframe(df, case_id="case:concept:name", activity_key="concept:name",
                                 timestamp_key="time:timestamp")
     pm4py.get_start_activities(df)
     pm4py.get_end_activities(df)
     pm4py.get_attributes(df)
     pm4py.get_attribute_values(df, "org:resource")
     pm4py.get_variants(df)
Example no. 3
 def test_filter_act_percentage(self):
     from pm4py.algo.filtering.dfg import dfg_filtering
     log = pm4py.read_xes("input_data/running-example.xes")
     dfg, sa, ea = pm4py.discover_dfg(log)
     act_count = pm4py.get_attribute_values(log, "concept:name")
     dfg_filtering.filter_dfg_on_activities_percentage(
         dfg, sa, ea, act_count, 0.1)
Example no. 4
def execute_script():
    log = pm4py.read_xes(
        os.path.join("..", "tests", "input_data", "receipt.xes"))
    activities = pm4py.get_attribute_values(log, "concept:name")
    dfg, sa, ea = pm4py.discover_dfg(log)
    # filters the DFG to make a simpler one
    perc = 0.5
    dfg, sa, ea, activities = dfg_filtering.filter_dfg_on_activities_percentage(
        dfg, sa, ea, activities, perc)
    dfg, sa, ea, activities = dfg_filtering.filter_dfg_on_paths_percentage(
        dfg, sa, ea, activities, perc)
    # creates the simulated log
    simulated_log = dfg_playout.apply(dfg, sa, ea)
    print(simulated_log)
    print(len(simulated_log))
    print(sum(x.attributes["probability"] for x in simulated_log))
    # display the two DFGs to show that they are identical
    pm4py.view_dfg(dfg, sa, ea, log=log, format="svg")
    new_dfg, new_sa, new_ea = pm4py.discover_dfg(simulated_log)
    pm4py.view_dfg(new_dfg, new_sa, new_ea, log=simulated_log, format="svg")
    for trace in simulated_log:
        print(list(x["concept:name"] for x in trace))
        print(trace.attributes["probability"],
              dfg_playout.get_trace_probability(trace, dfg, sa, ea))
        break
    dfg, sa, ea = pm4py.discover_dfg(log)
    variants = pm4py.get_variants(log)
    sum_prob_log_variants = 0.0
    for var in variants:
        sum_prob_log_variants += dfg_playout.get_trace_probability(
            variants[var][0], dfg, sa, ea)
    print(
        "percentage of behavior allowed from DFG that is in the log (from 0.0 to 1.0): ",
        sum_prob_log_variants)
Example no. 5
def detect(log: EventLog, alphabet: Dict[str, int], act_key: str, use_msd: bool) -> Optional[str]:
    candidates = set(alphabet.keys())
    for t in log:
        candidates = candidates.intersection(set(map(lambda e: e[act_key], t)))
        if len(candidates) == 0:
            return None
    for a in candidates:
        proj = EventLog()
        for t in log:
            proj.append(pm4py.filter_trace(lambda e: e[act_key] != a, t))
        if len(list(filter(lambda t: len(t) == 0, proj))) == 0:
            dfg_proj = discover_dfg.apply(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            alphabet_proj = pm4py.get_attribute_values(proj, act_key)
            start_act_proj = get_starters.get_start_activities(proj, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            end_act_proj = get_ends.get_end_activities(log, parameters={
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
            pre_proj, post_proj = dfg_utils.get_transitive_relations(dfg_proj, alphabet_proj)
            cut = sequence_cut.detect(alphabet_proj, pre_proj, post_proj)
            if cut is not None:
                return a
            cut = xor_cut.detect(dfg_proj, alphabet_proj)
            if cut is not None:
                return a
            cut = concurrent_cut.detect(
                dfg_proj, alphabet_proj, start_act_proj, end_act_proj,
                msd=msdw_algo.derive_msd_witnesses(
                    proj,
                    msd_algo.apply(log, parameters={
                        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key}),
                    parameters={
                        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
                if use_msd else None)
            if cut is not None:
                return a
            cut = loop_cut.detect(dfg_proj, alphabet_proj, start_act_proj, end_act_proj)
            if cut is not None:
                return a
    return None
Example no. 6
def get_change_points(log):
    attr_datetime = pm4py.get_attribute_values(log, 'time:timestamp')
    start_date = min(attr_datetime).date()
    end_date = max(attr_datetime).date()
    delta = datetime.timedelta(days=1)
    print("Start date: ", start_date, "\nEnd date: ", end_date)

    event_counts = {}
    i = start_date
    while i <= end_date:
        event_counts[i.strftime('%Y-%m-%d')] = 0
        #print(i)
        i += delta

    #print(event_counts)

    # attr_datetime maps each timestamp value to its number of occurrences,
    # so add the occurrence count rather than 1 when accumulating per-day event counts
    for t, count in attr_datetime.items():
        event_counts[t.date().strftime('%Y-%m-%d')] += count

    dates = np.array(list(event_counts.values()))

    # change point detection with the PELT algorithm
    # (MODEL and PENALTY are assumed to be module-level constants, e.g. a cost model name and a numeric penalty)
    algo = rpt.Pelt(model=MODEL).fit(dates)
    detect_result = algo.predict(pen=PENALTY)

    # display
    rpt.display(dates, detect_result, detect_result)
    plt.savefig('change_points.png')
    plt.show()
    print('Change point plot is saved as "change_points.png"')

    return event_counts, detect_result
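
A hedged usage sketch for the function above; the imports, the input file path, and the MODEL / PENALTY constants are assumptions about the surrounding script rather than part of the original:

import datetime

import matplotlib.pyplot as plt
import numpy as np
import pm4py
import ruptures as rpt

MODEL = "rbf"   # assumed cost model for ruptures' Pelt
PENALTY = 10    # assumed penalty value for the predict step

log = pm4py.read_xes("input_data/running-example.xes")
event_counts, change_points = get_change_points(log)
# change_points holds the indices (day offsets) at which PELT detects a change in the event rate
print(change_points)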
Example no. 7
 def test_dfg_align(self):
     import pm4py
     from pm4py.objects.dfg.filtering import dfg_filtering
     from pm4py.objects.dfg.utils import dfg_alignment
     log = pm4py.read_xes(os.path.join("input_data", "running-example.xes"))
     dfg, sa, ea = pm4py.discover_dfg(log)
     act_count = pm4py.get_attribute_values(log, "concept:name")
     dfg, sa, ea, act_count = dfg_filtering.filter_dfg_on_activities_percentage(dfg, sa, ea, act_count, 0.5)
     dfg, sa, ea, act_count = dfg_filtering.filter_dfg_on_paths_percentage(dfg, sa, ea, act_count, 0.5)
     aligned_traces = dfg_alignment.apply(log, dfg, sa, ea)
Example no. 8
def apply(log: Union[DataFrame, EventLog, EventStream],
          parameters: Optional[Dict[str, Any]] = None) -> Dict[str, int]:
    '''
    This algorithm computes the minimum self-distance for each activity observed in an event log.
    The self-distance of a in <a> is infinity, in <a,a> it is 0, in <a,b,a> it is 1, etc.
    The minimum self-distance is the smallest self-distance value observed for that activity in the event log.
    The activity key needs to be specified in the parameters input object (if None, the default value 'concept:name' is used).

    Parameters
    ----------
    log
        event log (either pandas.DataFrame, EventLog or EventStream)
    parameters
        parameters object

    Returns
    -------
        dict mapping each activity to its minimum self-distance; activities without an observed self-distance are not part of the dict.
    '''
    log = pm4py.convert_to_event_log(log)
    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                         xes_constants.DEFAULT_NAME_KEY)
    alphabet = pm4py.get_attribute_values(log, act_key)
    log = list(map(lambda t: list(map(lambda e: e[act_key], t)), log))
    min_self_distances = dict()
    for a in alphabet:
        # only consider activity a if it occurs at least twice in some trace
        if len(
                list(
                    filter(
                        lambda t: len(t) > 1,
                        list(
                            map(lambda t: list(filter(lambda e: e == a, t)),
                                log))))) > 0:
            # positions of a in each trace containing at least two occurrences of a
            activity_indices = list(
                filter(
                    lambda t: len(t) > 1,
                    list(
                        map(lambda t: [i for i, x in enumerate(t) if x == a],
                            log))))
            # smallest number of events between two consecutive occurrences of a
            min_self_distances[a] = min([
                i for l in list(
                    map(
                        lambda t: [
                            t[i] - t[i - 1] - 1 for i, x in enumerate(t)
                            if i > 0
                        ], activity_indices)) for i in l
            ])
    return min_self_distances
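
A minimal, self-contained sketch of the definition above on plain Python lists (illustrative only: the helper name and toy traces are invented and pm4py's log objects are bypassed):

def min_self_distance(traces):
    # traces: list of lists of activity labels
    msd = {}
    for trace in traces:
        positions = {}
        for i, act in enumerate(trace):
            positions.setdefault(act, []).append(i)
        for act, idx in positions.items():
            for j in range(1, len(idx)):
                gap = idx[j] - idx[j - 1] - 1  # events between consecutive occurrences
                msd[act] = min(msd.get(act, gap), gap)
    return msd

# <a,b,a> yields a self-distance of 1 for 'a'; <a,c,c,a> yields 2 for 'a' and 0 for 'c'
print(min_self_distance([['a', 'b', 'a'], ['a', 'c', 'c', 'a']]))  # {'a': 1, 'c': 0}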
Example no. 9
def derive_msd_witnesses(
        log: EventLog,
        msd: Optional[Dict[str, int]] = None,
        parameters: Optional[Dict[str, Any]] = None) -> Dict[str, Set[str]]:
    '''
    This function derives the minimum self distance witnesses.
    The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc.
    The minimum self distance is the minimal observed self distance value in the event log.
    A 'witness' is an activity that occurs between two occurrences of another activity separated by exactly its minimum self-distance.
    For example, if the minimum self-distance of activity a in some log L is 2 and
    trace <a,b,c,a> is in L, then b and c are witnesses of a.

    Parameters
    ----------
    log
        Event Log to use
    msd
        Optional minimum self distance dictionary
    parameters
        Optional parameters dictionary

    Returns
    -------
    Dictionary mapping each activity to a set of witnesses.

    '''
    log = pm4py.convert_to_event_log(log)
    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                         xes_constants.DEFAULT_NAME_KEY)
    alphabet = pm4py.get_attribute_values(log, act_key)
    msd = msd if msd is not None else msd_algo.apply(log, parameters)
    log = list(map(lambda t: list(map(lambda e: e[act_key], t)), log))
    witnesses = dict()
    for a in alphabet:
        if a in msd and msd[a] > 0:
            witnesses[a] = set()
        else:
            continue
        for t in log:
            if len(list(filter(lambda e: e == a, t))) > 1:
                indices = [i for i, x in enumerate(t) if x == a]
                for i in range(len(indices) - 1):
                    if indices[i + 1] - indices[i] - 1 == msd[a]:
                        for b in t[indices[i] + 1:indices[i + 1]]:
                            witnesses[a].add(b)
    return witnesses
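
A similarly hedged toy sketch of the witness notion (the helper name is invented; plain Python lists are used instead of pm4py logs):

def msd_witnesses(traces, msd):
    # traces: lists of activity labels; msd: activity -> minimum self-distance
    witnesses = {a: set() for a, d in msd.items() if d > 0}
    for trace in traces:
        for a in witnesses:
            idx = [i for i, x in enumerate(trace) if x == a]
            for i in range(len(idx) - 1):
                if idx[i + 1] - idx[i] - 1 == msd[a]:
                    witnesses[a].update(trace[idx[i] + 1:idx[i + 1]])
    return witnesses

# in <a,b,c,a>, with msd['a'] == 2, both b and c witness a's minimum self-distance
print(msd_witnesses([['a', 'b', 'c', 'a']], {'a': 2}))  # {'a': {'b', 'c'}}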
Example no. 10
def get_process_svg():
    parameters = request.args.get("parameters")
    parameters = __process_parameters(parameters)

    log = __prepare_event_log(parameters)
    ext_type = parameters[
        "ext_type"] if "ext_type" in parameters else "document_flow_log"
    log_type = __get_log_type_from_ext_type(ext_type)

    if log_type == 0:
        log.type = "succint"
        from pm4pymdl.algo.mvp.gen_framework import algorithm as discovery
        from pm4pymdl.visualization.mvp.gen_framework import visualizer as vis_factory
        model = discovery.apply(log,
                                model_type_variant="model3",
                                node_freq_variant="type31",
                                edge_freq_variant="type11")
        gviz = vis_factory.apply(model, parameters={"format": "svg"})
    elif log_type == 1 or log_type == 2:
        import pandas as pd
        if type(log) is pd.DataFrame:
            from pm4py.objects.dfg.retrieval.pandas import get_dfg_graph
            dfg = get_dfg_graph(log)
            from pm4py.statistics.start_activities.pandas import get as pd_sa_get
            from pm4py.statistics.end_activities.pandas import get as pd_ea_get
            sa = pd_sa_get.get_start_activities(log)
            ea = pd_ea_get.get_end_activities(log)
        else:
            dfg, sa, ea = pm4py.discover_dfg(log)
        act_count = pm4py.get_attribute_values(log, "concept:name")
        dfg, sa, ea, act_count = dfg_filtering.filter_dfg_on_paths_percentage(
            dfg, sa, ea, act_count, 0.2, keep_all_activities=True)
        gviz = pm4py.visualization.dfg.visualizer.apply(
            dfg,
            activities_count=act_count,
            parameters={
                "format": "svg",
                "start_activities": sa,
                "end_activities": ea
            })

    ser = pm4py.visualization.dfg.visualizer.serialize(gviz).decode("utf-8")

    return ser
Example no. 11
def execute_script():
    log = pm4py.read_xes("../tests/input_data/receipt.xes")
    dfg, sa, ea = pm4py.discover_dfg(log)
    act_count = pm4py.get_attribute_values(log, "concept:name")
    # keep the specified amount of activities
    dfg, sa, ea, act_count = pm4py.objects.dfg.filtering.dfg_filtering.filter_dfg_on_activities_percentage(
        dfg, sa, ea, act_count, 0.3)
    # keep the specified amount of paths
    dfg, sa, ea, act_count = pm4py.objects.dfg.filtering.dfg_filtering.filter_dfg_on_paths_percentage(
        dfg, sa, ea, act_count, 0.3)
    # view the DFG
    gviz = dfg_visualizer.apply(
        dfg,
        activities_count=act_count,
        parameters={
            dfg_visualizer.Variants.FREQUENCY.value.Parameters.START_ACTIVITIES:
            sa,
            dfg_visualizer.Variants.FREQUENCY.value.Parameters.END_ACTIVITIES:
            ea,
            dfg_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "svg"
        })
    dfg_visualizer.view(gviz)
Example no. 12
def execute_script():
    ENABLE_VISUALIZATION = True

    # reads a XES into an event log
    log1 = pm4py.read_xes("../tests/input_data/running-example.xes")

    # reads a CSV into a dataframe
    df = pd.read_csv("../tests/input_data/running-example.csv")
    # formats the dataframe with the mandatory columns for process mining purposes
    df = pm4py.format_dataframe(df,
                                case_id="case:concept:name",
                                activity_key="concept:name",
                                timestamp_key="time:timestamp")
    # converts the dataframe to an event log
    log2 = pm4py.convert_to_event_log(df)

    # converts the log read from XES into a stream and dataframe respectively
    stream1 = pm4py.convert_to_event_stream(log1)
    df2 = pm4py.convert_to_dataframe(log1)

    # writes the log1 to a XES file
    pm4py.write_xes(log1, "ru1.xes")

    dfg, dfg_sa, dfg_ea = pm4py.discover_dfg(log1)
    petri_alpha, im_alpha, fm_alpha = pm4py.discover_petri_net_alpha(log1)
    petri_inductive, im_inductive, fm_inductive = pm4py.discover_petri_net_inductive(
        log1)
    petri_heuristics, im_heuristics, fm_heuristics = pm4py.discover_petri_net_heuristics(
        log1)
    tree_inductive = pm4py.discover_tree_inductive(log1)
    heu_net = pm4py.discover_heuristics_net(log1)

    pm4py.write_dfg(dfg, dfg_sa, dfg_ea, "ru_dfg.dfg")
    pm4py.write_petri_net(petri_alpha, im_alpha, fm_alpha, "ru_alpha.pnml")
    pm4py.write_petri_net(petri_inductive, im_inductive, fm_inductive,
                          "ru_inductive.pnml")
    pm4py.write_petri_net(petri_heuristics, im_heuristics, fm_heuristics,
                          "ru_heuristics.pnml")
    pm4py.write_process_tree(tree_inductive, "ru_inductive.ptml")

    dfg, dfg_sa, dfg_ea = pm4py.read_dfg("ru_dfg.dfg")
    petri_alpha, im_alpha, fm_alpha = pm4py.read_petri_net("ru_alpha.pnml")
    petri_inductive, im_inductive, fm_inductive = pm4py.read_petri_net(
        "ru_inductive.pnml")
    petri_heuristics, im_heuristics, fm_heuristics = pm4py.read_petri_net(
        "ru_heuristics.pnml")
    tree_inductive = pm4py.read_process_tree("ru_inductive.ptml")

    pm4py.save_vis_petri_net(petri_alpha, im_alpha, fm_alpha, "ru_alpha.png")
    pm4py.save_vis_petri_net(petri_inductive, im_inductive, fm_inductive,
                             "ru_inductive.png")
    pm4py.save_vis_petri_net(petri_heuristics, im_heuristics, fm_heuristics,
                             "ru_heuristics.png")
    pm4py.save_vis_process_tree(tree_inductive, "ru_inductive_tree.png")
    pm4py.save_vis_heuristics_net(heu_net, "ru_heunet.png")
    pm4py.save_vis_dfg(dfg, dfg_sa, dfg_ea, "ru_dfg.png")

    if ENABLE_VISUALIZATION:
        pm4py.view_petri_net(petri_alpha, im_alpha, fm_alpha, format="svg")
        pm4py.view_petri_net(petri_inductive,
                             im_inductive,
                             fm_inductive,
                             format="svg")
        pm4py.view_petri_net(petri_heuristics,
                             im_heuristics,
                             fm_heuristics,
                             format="svg")
        pm4py.view_process_tree(tree_inductive, format="svg")
        pm4py.view_heuristics_net(heu_net, format="svg")
        pm4py.view_dfg(dfg, dfg_sa, dfg_ea, format="svg")

    aligned_traces = pm4py.conformance_alignments(log1, petri_inductive,
                                                  im_inductive, fm_inductive)
    replayed_traces = pm4py.conformance_tbr(log1, petri_inductive,
                                            im_inductive, fm_inductive)

    fitness_tbr = pm4py.evaluate_fitness_tbr(log1, petri_inductive,
                                             im_inductive, fm_inductive)
    print("fitness_tbr", fitness_tbr)
    fitness_align = pm4py.evaluate_fitness_alignments(log1, petri_inductive,
                                                      im_inductive,
                                                      fm_inductive)
    print("fitness_align", fitness_align)
    precision_tbr = pm4py.evaluate_precision_tbr(log1, petri_inductive,
                                                 im_inductive, fm_inductive)
    print("precision_tbr", precision_tbr)
    precision_align = pm4py.evaluate_precision_alignments(
        log1, petri_inductive, im_inductive, fm_inductive)
    print("precision_align", precision_align)

    print("log start activities = ", pm4py.get_start_activities(log2))
    print("df start activities = ", pm4py.get_start_activities(df2))
    print("log end activities = ", pm4py.get_end_activities(log2))
    print("df end activities = ", pm4py.get_end_activities(df2))
    print("log attributes = ", pm4py.get_attributes(log2))
    print("df attributes = ", pm4py.get_attributes(df2))
    print("log org:resource values = ",
          pm4py.get_attribute_values(log2, "org:resource"))
    print("df org:resource values = ",
          pm4py.get_attribute_values(df2, "org:resource"))

    print("start_activities len(filt_log) = ",
          len(pm4py.filter_start_activities(log2, ["register request"])))
    print("start_activities len(filt_df) = ",
          len(pm4py.filter_start_activities(df2, ["register request"])))
    print("end_activities len(filt_log) = ",
          len(pm4py.filter_end_activities(log2, ["pay compensation"])))
    print("end_activities len(filt_df) = ",
          len(pm4py.filter_end_activities(df2, ["pay compensation"])))
    print(
        "attributes org:resource len(filt_log) (cases) cases = ",
        len(
            pm4py.filter_attribute_values(log2,
                                          "org:resource", ["Ellen"],
                                          level="case")))
    print(
        "attributes org:resource len(filt_log) (cases)  events = ",
        len(
            pm4py.filter_attribute_values(log2,
                                          "org:resource", ["Ellen"],
                                          level="event")))
    print(
        "attributes org:resource len(filt_df) (events) cases = ",
        len(
            pm4py.filter_attribute_values(df2,
                                          "org:resource", ["Ellen"],
                                          level="case")))
    print(
        "attributes org:resource len(filt_df) (events) events = ",
        len(
            pm4py.filter_attribute_values(df2,
                                          "org:resource", ["Ellen"],
                                          level="event")))
    print(
        "attributes org:resource len(filt_df) (events) events notpositive = ",
        len(
            pm4py.filter_attribute_values(df2,
                                          "org:resource", ["Ellen"],
                                          level="event",
                                          retain=False)))

    print("variants log = ", pm4py.get_variants(log2))
    print("variants df = ", pm4py.get_variants(df2))
    print(
        "variants filter log = ",
        len(
            pm4py.filter_variants(log2, [[
                "register request", "examine thoroughly", "check ticket",
                "decide", "reject request"
            ]])))
    print(
        "variants filter df = ",
        len(
            pm4py.filter_variants(df2, [[
                "register request", "examine thoroughly", "check ticket",
                "decide", "reject request"
            ]])))
    print("variants filter percentage = ",
          len(pm4py.filter_variants_percentage(log2, threshold=0.8)))

    print(
        "paths filter log len = ",
        len(
            pm4py.filter_directly_follows_relation(
                log2, [("register request", "examine casually")])))
    print(
        "paths filter dataframe len = ",
        len(
            pm4py.filter_directly_follows_relation(
                df2, [("register request", "examine casually")])))

    print(
        "timeframe filter log events len = ",
        len(
            pm4py.filter_time_range(log2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="events")))
    print(
        "timeframe filter log traces_contained len = ",
        len(
            pm4py.filter_time_range(log2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_contained")))
    print(
        "timeframe filter log traces_intersecting len = ",
        len(
            pm4py.filter_time_range(log2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_intersecting")))
    print(
        "timeframe filter df events len = ",
        len(
            pm4py.filter_time_range(df2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="events")))
    print(
        "timeframe filter df traces_contained len = ",
        len(
            pm4py.filter_time_range(df2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_contained")))
    print(
        "timeframe filter df traces_intersecting len = ",
        len(
            pm4py.filter_time_range(df2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_intersecting")))

    # remove the temporary files
    os.remove("ru1.xes")
    os.remove("ru_dfg.dfg")
    os.remove("ru_alpha.pnml")
    os.remove("ru_inductive.pnml")
    os.remove("ru_heuristics.pnml")
    os.remove("ru_inductive.ptml")
    os.remove("ru_alpha.png")
    os.remove("ru_inductive.png")
    os.remove("ru_heuristics.png")
    os.remove("ru_inductive_tree.png")
    os.remove("ru_heunet.png")
    os.remove("ru_dfg.png")
Example no. 13
 def test_filter_paths_percentage(self):
     log = pm4py.read_xes("input_data/running-example.xes")
     dfg, sa, ea = pm4py.discover_dfg(log)
     act_count = pm4py.get_attribute_values(log, "concept:name")
     pm4py.objects.dfg.filtering.dfg_filtering.filter_dfg_on_paths_percentage(
         dfg, sa, ea, act_count, 0.3)
Example no. 14
def lstm_algorithm(index, len_of_points, resample_dataset, LSTM_CELLS):

    #print('Current path: ',os.getcwd())
    if not os.path.exists('results'):
        os.makedirs('results')

    COLOR_CYCLE = ["#4286f4", "#f44174"]

    split_percentage = 0.8
    """                        
    answer = input('Give me the length of window: ')
    if answer == 'max':
        len_of_points = 0
    else:
        len_of_points = int(answer)
       
    #len_of_points = 3
    """

    fileinfo = {
        0: {'filename': 'base_kasteren.csv', 'separator': ' ',
            'columns': ['date', 'time', 'attr1', 'attr2', 'state', 'concept:name']},
        1: {'filename': 'activity3.csv', 'separator': ',',
            'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                        'time:timestamp', 'concept:name', 'label_subactivity']},
        2: {'filename': 'activitylog_uci_detailed_labour.xes', 'separator': '', 'columns': []},
        3: {'filename': 'atmo1.csv', 'separator': ' ',
            'columns': ['date', 'time', 'concept:name', 'state', 'activity']},
        4: {'filename': 'activity1.csv', 'separator': ',',
            'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                        'time:timestamp', 'concept:name', 'label_subactivity']},
        5: {'filename': 'activity2.csv', 'separator': ',',
            'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                        'time:timestamp', 'concept:name', 'label_subactivity']},
        6: {'filename': 'espa.xes', 'separator': ';', 'columns': []},
        7: {'filename': 'activity3.csv', 'separator': ',',
            'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                        'time:timestamp', 'concept:name', 'label_subactivity']},
        8: {'filename': 'activity4.csv', 'separator': ',',
            'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                        'time:timestamp', 'concept:name', 'label_subactivity']},
        9: {'filename': 'activity5.csv', 'separator': ',',
            'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                        'time:timestamp', 'concept:name', 'label_subactivity']},
        10: {'filename': 'activity6.csv', 'separator': ',',
             'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                         'time:timestamp', 'concept:name', 'label_subactivity']},
        11: {'filename': 'activity7.csv', 'separator': ',',
             'columns': ['id', 'case:concept:name', 'subjectID', 'attr_starttime',
                         'time:timestamp', 'concept:name', 'label_subactivity']},
        12: {'filename': 'BPI_Challenge_2017.xes', 'separator': '', 'columns': []},
    }

    #choose file

    filename = fileinfo[index]['filename']
    filepath = '../datasets/' + fileinfo[index]['filename']

    dataframe = pd.DataFrame()
    if not os.path.exists('results/' + filename):
        os.makedirs('results/' + filename)
    if not os.path.exists('results/' + filename + '/' + str(len_of_points) +
                          '/'):
        os.makedirs('results/' + filename + '/' + str(len_of_points) + '/')

    #if it is a csv file
    if (filename.find('.csv') != -1):
        #load file to dataframe
        dataframe = pd.read_csv(filepath,
                                sep=fileinfo[index]['separator'],
                                names=fileinfo[index]['columns'],
                                low_memory=False)
        #for the Kasteren-style datasets, build the time:timestamp and case:concept:name columns
        if index in [0, 3, 12]:
            dataframe[
                'time:timestamp'] = dataframe['date'] + ' ' + dataframe['time']
            dataframe['case:concept:name'] = dataframe['date']
            #dataframe = dataframe[dataframe['concept:name']!='None']

        #print ("file is csv ")
        #print(dataframe.head(20))

        #drop nan

        #convert the dataframe into a pm4py event log
        log = pm4py.convert_to_event_log(dataframe)

    else:
        #the file is xes
        #import log
        #xes_importer.iterparse.Parameters.MAX_TRACES = 10
        #parameters = {xes_importer.iterparse.Parameters.MAX_TRACES: 50}
        #log = xes_importer.apply('datasets/BPI Challenge 2018.xes.gz', parameters=parameters)
        log = pm4py.read_xes(filepath)
        print(log)
        #convert to dataframe
        dataframe = pm4py.convert_to_dataframe(log)
        print(dataframe)
        #print(dataframe['time:timestamp'][0].replace(tzinfo=timezone.utc).astimezone(tz=None))
        #dataframe['time:timestamp'] = dataframe['time:timestamp'].dt.tz_convert(None)

    if index in [2, 12]:
        #process time:timestamp remove zone information
        dataframe['time:timestamp'] = dataframe[
            'time:timestamp'].dt.tz_convert(None)

    #del log
    print('Dataframe print\n', dataframe)
    #keep only events whose lifecycle:transition is 'complete'; if the column does not exist, create it
    if 'lifecycle:transition' in dataframe.columns:
        dataframe = dataframe[dataframe['lifecycle:transition'] == 'complete']
    else:
        dataframe['lifecycle:transition'] = 'complete'

    #remove Start and End events
    dataframe = dataframe[dataframe['concept:name'] != 'Start']
    dataframe = dataframe[dataframe['concept:name'] != 'End']

    #sort by time
    if 'time:timestamp' in dataframe.columns:
        dataframe = dataframe.sort_values('time:timestamp')
    else:
        print('Error: no column time:timestamp in event log')

    #print('Sorted dataframe\n',dataframe)

    #plot time vs activity
    #fig, axes = plt.subplots(1, 1, figsize=(100, 100))
    #fig = dataframe.plot(x='time:timestamp', y='concept:name', kind="scatter").get_figure()
    #fig.savefig('results/'+filename+'/'+str(len_of_points) +'/conceptname.png', bbox_inches='tight')

    #plot time vs trace id
    #df.plot(x='col_name_1', y='col_name_2', style='o')
    #fig = dataframe.plot(x='time:timestamp', y='case:concept:name', kind="scatter").get_figure()
    #fig.savefig('results/'+filename+'/'+str(len_of_points) +'/caseconceptname.png', bbox_inches='tight')

    #keep only mandatory columns
    dataframe = dataframe[[
        'case:concept:name', 'concept:name', 'time:timestamp'
    ]]
    #convert sorted dataframe to log
    log = pm4py.convert_to_event_log(dataframe)

    #initial_df = dataframe.copy()
    #print('Initial dataframe\n',initial_df)
    #-----------------------------------------------------------------
    ############################################################
    #-------------- Resample -----------------------------------
    ###########################################################
    if resample_dataset:
        #preprocess timestamp to be prepared for resample
        #make time:timestamp datetime
        dataframe.loc[:, 'time:timestamp'] = pd.to_datetime(
            dataframe['time:timestamp'])
        #set time:timestamp as index
        dataframe = dataframe.set_index(["time:timestamp"])
        #remove duplicates
        #print('Duplicated\n')
        #print(dataframe[dataframe.index.duplicated()])
        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]

        #------Resample the dataframe every 5 minutes, backfilling missing values-------------
        dataframe = dataframe.resample("5T").fillna("backfill")
        print('Resample', dataframe)
        #print( dataframe.last())

    #save resampled dataframe to csv
    dataframe.to_csv('../datasets/resampled_sorted_df.csv')

    #dataframe is initial event log sorted by time (start event only)
    #convert sorted by time dataframe back to log (xes)
    #log = pm4py.convert_to_event_log(dataframe)

    #-----------------------------------save to csv-------------------------------------
    #uncomment only if you need it
    #dataframe.to_csv('datasets/activitylog_uci_detailed_labour.csv')

    #print('\nDataframe LOG\n',dataframe)

    #--------------- Concat activities of a trace in one row ---------------
    #concat events with same case:concept:name (space separated)
    print('dataframe\n', dataframe)
    df = dataframe.groupby('case:concept:name', sort=False).agg(
        {'concept:name': lambda x: ' '.join(x)})
    print('df\n', df)
    df = df.reset_index()
    if len_of_points:
        print('--------------------------------------')
        df['concept:name'] = df['concept:name'].apply(
            lambda x: list(x.split(' ')))
        df['concept:name'] = df['concept:name'].apply(
            lambda x:
            [x[i:i + len_of_points] for i in range(0, len(x), len_of_points)])
        df = df.set_index('case:concept:name')['concept:name'].apply(
            pd.Series).stack().reset_index(level=0).rename(
                columns={0: 'concept:name'})
        df['concept:name'] = df['concept:name'].apply(lambda x: ' '.join(x))
        print('\ndftest\n', df)

    df = df.reset_index()  #check here

    #del dataframe

    #print the activities of the log
    activities = pm4py.get_attribute_values(log, 'concept:name')
    print('\nActivities:\n', activities)

    #split data from event log - 80% for train and 20% for test

    #shuffle before split
    #df = shuffle(df)
    #print('df', df,'\n')

    #----------------------- Split Train and Test data ----------------------
    #split rows depending on percentage
    split_rows = int(df.shape[0] * split_percentage)
    print('Split Rows', split_rows, '\n')

    #train dataframe
    train_df = df[:split_rows]
    train_df.to_csv('train.csv')
    print('Train Rows', train_df, '\n')

    #test dataframe
    test_df = df[split_rows:]
    test_df.to_csv('test.csv')
    #print('Test Rows', test_df,'\n')

    # --------------------------------------------------------------

    #data = df['concept:name'].copy().to_list()
    data = train_df['concept:name'].copy().to_list()
    #Just for Eating/Drinking
    #data = data.replace('Eating/Drinking','EatDrink')
    #print('Data\n',data)

    tokenizer = Tokenizer()
    #read the words in data and assign an index to every word based on frequency
    tokenizer.fit_on_texts([data])
    print('Word index: ')
    print(tokenizer.word_index)

    #replace every word in the text with its corresponding word index - texts_to_sequences returns a list of lists with a single element, so use [0] to take the first (and only) list
    encoded = tokenizer.texts_to_sequences([data])[0]
    #print('encoded: \n')
    #print(encoded)
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)
    #print('list\n',[e for e in encoded])
    #print('Min ',min([len(e) for e in encoded]))

    # prepare LSTM training data: sliding windows of n_input activities followed by the activity to predict
    sequences = list()
    for i in range(n_input, len(encoded)):
        sequence = encoded[i - n_input:i + 1]
        sequences.append(sequence)
    print('Total Sequences: %d' % len(sequences))
    print('Sequences: \n')
    print(sequences)
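    # Hedged illustration of the windowing above (assuming n_input == 2; values are made up):
    #   encoded = [4, 7, 2, 9]  ->  sequences = [[4, 7, 2], [7, 2, 9]]
    # each window holds n_input context activities plus the activity to be predicted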

    max_length = max([len(seq) for seq in sequences])  #max_length is 3
    # Pad sequence to be of the same length
    # length of sequence must be 3 (maximum)
    # 'pre' or 'post': pad either before or after each sequence
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    print('Max Sequence Length: %d' % max_length)
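    # Hedged illustration of the padding step (values are made up):
    #   pad_sequences([[5, 2]], maxlen=3, padding='pre')  ->  array([[0, 5, 2]])
    # shorter sequences are left-padded with zeros so every row has length max_length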

    #convert list to array to get X,y train
    sequences = array(sequences)
    X, y = sequences[:, :-1], sequences[:, -1]
    print('X: \n')
    print(X)
    print('y: \n')
    print(y)

    #convert y to binary vectors
    y = to_categorical(y, num_classes=vocab_size)
    print('y: \n')
    print(y)

    #test data
    test_data = test_df['concept:name'].copy().to_list()

    test_encoded = tokenizer.texts_to_sequences([test_data])[0]

    test_sequences = list()

    for i in range(n_input, len(test_encoded)):
        test_sequence = test_encoded[i - n_input:i + 1]
        test_sequences.append(test_sequence)
    max_length = max([len(seq) for seq in test_sequences])
    test_sequences = pad_sequences(test_sequences,
                                   maxlen=max_length,
                                   padding='pre')

    test_sequences = array(test_sequences)
    test_X, test_y = test_sequences[:, :-1], test_sequences[:, -1]

    #convert y to binary vectors
    test_yl = to_categorical(test_y, num_classes=vocab_size)

    model = Sequential()
    #the first layer
    # - the largest integer (i.e. word index) in the input should be no larger than vocabulary size
    # - The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset.
    # - output_dim (50): This is the size of the vector space in which words will be embedded (size of the embedding vectors). It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.
    # - input_length: This is the length of the input sequences (here, max_length - 1)
    # The Embedding layer has weights that are learned. If you save your model to file, this will include weights for the Embedding layer.
    # The output of the Embedding layer is a 2D vector with one embedding for each word in the input sequence of words (input document).
    # If you wish to connect a Dense layer directly to an Embedding layer, you must first flatten the 2D output matrix to a 1D vector using the Flatten layer.
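    # A hedged sketch (not used below) of the Flatten case mentioned above, i.e. connecting
    # the Embedding directly to a Dense classifier (assumes Flatten is imported from keras.layers):
    #   model = Sequential()
    #   model.add(Embedding(vocab_size + 1, LSTM_CELLS, input_length=max_length - 1))
    #   model.add(Flatten())
    #   model.add(Dense(vocab_size, activation='softmax'))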

    model.add(
        Embedding(vocab_size + 1, LSTM_CELLS, input_length=max_length - 1))

    model.add(LSTM(vocab_size))
    model.add(Dropout(0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    opt = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    history = model.fit(X,
                        y,
                        epochs=500,
                        verbose=0,
                        batch_size=20,
                        validation_data=(test_X, test_yl))

    print(model.summary())
    model.save('lstm_model.h5')  # creates a HDF5 file

    #del model  # deletes the existing model

    #predict sequence of n_words activities
    def generate_seq(model, tokenizer, max_length, seed_text, n_words):
        #get input activity
        in_text = seed_text
        #print('in_text',in_text,'\n')
        #for the number of activities on sequence you want to predict
        for _ in range(n_words):
            encoded = tokenizer.texts_to_sequences([in_text])[0]
            #pad if less than max text length
            encoded = pad_sequences([encoded],
                                    maxlen=max_length,
                                    padding='pre')
            #print('in text ',in_text)
            #predict one activity
            #yhat = model.predict_classes(encoded, verbose=0)
            yhat = np.argmax(model.predict(encoded), axis=-1)
            out_word = ''
            for word, index in tokenizer.word_index.items():
                #convert predicted activity to word
                if index == yhat:
                    #print('Word',word,'\n')
                    out_word = word
                    break
            #feed the next input with the sequence of activities
            in_text += ' ' + out_word

        return in_text

    #load trained model
    #model = load_model('lstm_model.h5')

    # Evaluate network
    print('LSTM Network Evaluation:\n')
    train_score = model.evaluate(X, y, verbose=0)
    print('Train Score\n', train_score)
    score = model.evaluate(test_X, test_yl, verbose=0)
    print('Test Score\n')
    print(score)

    print('History\n')
    print(history.history.keys())
    # plot loss during training
    fig = plt.figure()
    plt.subplot(211)
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    fig.savefig('results/' + filename + '/' + str(len_of_points) + '/Loss.png',
                bbox_inches='tight')
    # plot accuracy during training
    fig = plt.figure()
    plt.subplot(212)
    plt.title('Accuracy')
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='test')
    plt.legend()
    plt.show()
    fig.savefig('results/' + filename + '/' + str(len_of_points) +
                '/Accuracy.png',
                bbox_inches='tight')

    print('LSTM Results: ')
    print('\n')
    #generated_text = ''
    #sequence prediction
    for i in tokenizer.word_index:
        #print(tokenizer.index_word)
        w = generate_seq(model, tokenizer, max_length - 1, i, n_input + 1)
        #generated_text = generated_text.join('\n'+w)
        print(w)

    print('LSTM Results: ')
    print('\n')
    #for i in tokenizer.word_index:
    #	print(generate_seq(model, tokenizer, max_length-1, i , 1))
    all_data = df['concept:name'].copy().to_list()

    all_encoded = tokenizer.texts_to_sequences([all_data])[0]

    all_sequences = list()

    for i in range(n_input, len(all_encoded)):
        all_sequence = all_encoded[i - n_input:i + 1]
        all_sequences.append(all_sequence)
    max_length = max([len(seq) for seq in all_sequences])
    all_sequences = pad_sequences(all_sequences,
                                  maxlen=max_length,
                                  padding='pre')

    all_sequences = array(all_sequences)
    all_X, all_y = all_sequences[:, :-1], all_sequences[:, -1]

    #convert y to binary vectors
    all_yl = to_categorical(all_y, num_classes=vocab_size)

    #load trained model
    #model = load_model('lstm_model.h5')

    #print('Tokenizer \n',tokenizer)
    print('Tokenizer word index\n', tokenizer.word_index)

    np.set_printoptions(suppress=True)
    cnt = 0
    for i in range(len(all_X)):
        #yhat = model.predict_classes(all_X[i].reshape(1,2,1), verbose=0)
        yhat = np.argmax(model.predict(all_X[i].reshape(1, n_input, 1)),
                         axis=-1)
        df.loc[i, 'X_input'] = str(all_X[i])
        df.loc[i, 'Expected'] = all_y[i]
        df.loc[i, 'predicted'] = yhat

        #print('Expected:', all_y[i] , 'Predicted', yhat)
        prob = model.predict(all_X[i].reshape(1, n_input, 1))[0]  # predict_proba is removed in newer Keras; predict returns the softmax probabilities
        df.loc[i,
               'probabilities'] = ' '.join([str(elem) for elem in list(prob)])
        if (all_y[i] == yhat):
            df.loc[i, 'result'] = 'ok'
            cnt += 1
        else:
            df.loc[i, 'result'] = 'Error'

    #print(df['predicted'].replace(tokenizer.word_index))
    df.to_csv('results/' + filename + '/' + str(len_of_points) + '/resample_' +
              str(resample_dataset) + '_lstm.csv')
    print('Total successful: ', cnt, ' out of ', len(all_X), 'Percentage: ',
          cnt / len(all_X))

    # predict probabilities for test set
    yhat_probs = model.predict(test_X, verbose=0)
    # predict crisp classes for test set (predict_classes is removed in newer Keras)
    yhat_classes = np.argmax(model.predict(test_X, verbose=0), axis=-1)
    print('yhat_classes\n', yhat_classes)
    # reduce to 1d array
    #yhat_probs = yhat_probs[:, 0]
    #yhat_classes = yhat_classes[:, 0]

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(test_y, yhat_classes)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(test_y, yhat_classes, average='weighted')
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(test_y, yhat_classes, average='weighted')
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(test_y, yhat_classes, average='weighted')
    print('F1 score: %f' % f1)

    # kappa
    kappa = cohen_kappa_score(test_y, yhat_classes)
    print('Cohens kappa: %f' % kappa)
    # ROC AUC
    #auc = roc_auc_score(test_y, yhat_probs,multi_class='ovr')
    #print('ROC AUC: %f' % auc)
    # confusion matrix
    matrix = confusion_matrix(test_y, yhat_classes)
    print(matrix)
    fig = plt.figure()
    sns.heatmap(matrix, center=True)
    plt.show()
    fig.savefig('results/' + filename + '/' + str(len_of_points) +
                '/ConfusionMatrix.png',
                bbox_inches='tight')

    #headers
    #filename - resample - len of points - train loss + Accuracy - test score
    #write results to csv
    fd = open("total_results.csv", "a+")
    row = filename + '\t' + str(resample_dataset) + '\t' + str(
        len_of_points
    ) + '\t' + str(train_score[0]) + '\t' + str(train_score[1]) + '\t' + str(
        score[0]) + '\t' + str(score[1]) + '\t' + str(accuracy) + '\t' + str(
            precision) + '\t' + str(recall) + '\t' + str(f1) + '\t' + str(
                kappa) + '\t' + '' + '\t' + json.dumps(
                    tokenizer.word_index) + '\n'
    fd.write(row)
    fd.close()
Example no. 15
def read_data_equitemp(no_intervals, interval_width, sorted_aps, act_map, log,
                       dataset):
    timestamps = pm4py.get_attribute_values(log, 'time:timestamp')
    print('Earliest:', min(timestamps))
    print('Latest:', max(timestamps))
    interval_length = (max(timestamps) - min(timestamps)) / no_intervals
    print('Interval length:', interval_length)

    no_act = len(act_map.keys())

    dfg_time_matrix = np.zeros([no_intervals, no_act, no_act], dtype=int)

    interval_timing = []
    no_events_sums = 0
    no_events_logs = 0
    no_dfs = 0
    for i in range(0, no_intervals):
        print('Interval ', i, '/', no_intervals)
        lower_bound = min(timestamps) + i * interval_length
        if i == (no_intervals - 1):
            upper_bound = min(timestamps) + (i + 1) * interval_length * 2
        else:
            upper_bound = min(timestamps) + (i + 1) * interval_length
        lb = lower_bound
        ub = upper_bound
        print(lb)
        print(ub)

        dfs = []
        empty_mat = np.zeros([no_act, no_act], dtype=float)

        filtered_events = {}
        start = Event()
        end = Event()
        start['concept:name'] = str(act_map['start'])
        end['concept:name'] = str(act_map['end'])
        highest = datetime(1970, 1, 1, tzinfo=pytz.UTC)
        lowest = datetime(2050, 1, 1, tzinfo=pytz.UTC)

        count = 0
        for df in sorted_aps:
            if ub > df.event2[
                    'time:timestamp'] >= lb:  # and ub > df.event['time:timestamp'] >= lb:
                dfs.append(df)

        no_dfs += len(dfs)

        log_dfs = {}
        for df in dfs:
            if df.trace_no not in log_dfs.keys():
                log_dfs[df.trace_no] = []
            log_dfs[df.trace_no].append(df)

        for trace_no, dfss in log_dfs.items():
            # print('\nTrace:', trace_no)
            sorted_dfs = sorted(dfss)
            filtered_events[trace_no] = []
            for df in sorted_dfs:
                # print(df)
                filtered_events[trace_no].append(df.event)
                no_events_sums += 1
            filtered_events[trace_no].append(sorted_dfs[len(sorted_dfs) -
                                                        1].event2)
            no_events_sums += 1

        print('#traces:', len(log_dfs))

        for trace_no, events in filtered_events.items():
            empty_mat[act_map['start'],
                      act_map[events[0]['concept:name']]] += 1
            empty_mat[act_map[events[-1]['concept:name']], act_map['end']] += 1

        # Export filtered events to interval event logs
        new_log = EventLog()
        no_eve = 0
        for t, trace in enumerate(log):
            new_trace = Trace()
            # new_trace.append(start)
            for trace_no, events in filtered_events.items():
                if t == trace_no:
                    for event in trace:
                        if event in events:
                            if event['time:timestamp'] < lowest:
                                lowest = event['time:timestamp']
                            if event['time:timestamp'] > highest:
                                highest = event['time:timestamp']
                            new_event = Event()
                            new_event['concept:name'] = str(
                                act_map[event['concept:name']])
                            new_trace.append(new_event)
                            no_events_sums += 1
                            no_eve += 1
            if len(new_trace) > 0:
                # new_trace.append(end)
                new_log.append(new_trace)
        exporter.apply(
            new_log, './logs/' + dataset + '_log_interval_' + str(i) + '-' +
            str(no_intervals) + '_equitemp.xes')

        # print('no eve:', no_eve)
        for act_pair in dfs:
            a1 = act_map[act_pair.a1]
            a2 = act_map[act_pair.a2]
            empty_mat[a1, a2] += 1

        dfg_time_matrix[i] = empty_mat
        interval_timing.append((lowest, highest))
    print('Event sums:', no_events_sums)
    print('Event logs:', no_events_logs)
    print('#DFS:', no_dfs)

    return dfg_time_matrix, interval_timing
Example no. 16
def inductive_miner(log, dfg, threshold, root, act_key, use_msd):
    alphabet = pm4py.get_attribute_values(log, act_key)
    start_activities = get_starters.get_start_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
    end_activities = get_ends.get_end_activities(
        log, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key})
    empty_traces = pm4py.filter_log(lambda trace: len(trace) == 0, log)
    if len(empty_traces) == 0:
        if _is_base_case_act(log, act_key) or _is_base_case_silent(log):
            return _apply_base_case(log, root, act_key)
        pre, post = dfg_utils.get_transitive_relations(dfg, alphabet)
        cut = sequence_cut.detect(alphabet, pre, post)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.SEQUENCE, root), threshold, act_key,
                sequence_cut.project(log, cut, act_key), use_msd)
        cut = xor_cut.detect(dfg, alphabet)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.XOR, root), threshold, act_key,
                xor_cut.project(log, cut, act_key), use_msd)
        cut = concurrent_cut.detect(
            dfg,
            alphabet,
            start_activities,
            end_activities,
            msd=msdw_algo.derive_msd_witnesses(
                log,
                msd_algo.apply(log,
                               parameters={
                                   constants.PARAMETER_CONSTANT_ACTIVITY_KEY:
                                   act_key
                               }),
                parameters={
                    constants.PARAMETER_CONSTANT_ACTIVITY_KEY: act_key
                }) if use_msd else None)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.PARALLEL, root), threshold, act_key,
                concurrent_cut.project(log, cut, act_key), use_msd)
        cut = loop_cut.detect(dfg, alphabet, start_activities, end_activities)
        if cut is not None:
            return _add_operator_recursive(
                pt.ProcessTree(pt.Operator.LOOP, root), threshold, act_key,
                loop_cut.project(log, cut, act_key), use_msd)
    if len(empty_traces) > 0:
        nempty = pm4py.filter_log(lambda t: len(t) > 0, log)
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.XOR,
                                                      root), threshold,
                                       act_key, [EventLog(), nempty], use_msd)
    aopt = activity_once_per_trace.detect(log, alphabet, act_key)
    if aopt is not None:
        operator = pt.ProcessTree(operator=pt.Operator.PARALLEL, parent=root)
        operator.children.append(
            pt.ProcessTree(operator=None, parent=operator, label=aopt))
        return _add_operator_recursive(
            operator, threshold, act_key,
            activity_once_per_trace.project(log, aopt, act_key), use_msd)
    act_conc = activity_concurrent.detect(log, alphabet, act_key, use_msd)
    if act_conc is not None:
        return _add_operator_recursive(
            pt.ProcessTree(pt.Operator.PARALLEL, root), threshold, act_key,
            activity_concurrent.project(log, act_conc, act_key), use_msd)
    stl = strict_tau_loop.detect(log, start_activities, end_activities,
                                 act_key)
    if stl is not None:
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.LOOP,
                                                      root), threshold,
                                       act_key, [stl, EventLog()], use_msd)
    tl = tau_loop.detect(log, start_activities, act_key)
    if tl is not None:
        return _add_operator_recursive(pt.ProcessTree(pt.Operator.LOOP,
                                                      root), threshold,
                                       act_key, [tl, EventLog()], use_msd)
    return _flower(alphabet, root)
Example no. 17
agg_type = 'equisize'
no_pairs = 0
horizon = 25
no_intervals = 75
no_folds = 10
no_intervals_all = 100

# Parameters
############

variant = xes_importer.Variants.ITERPARSE
paras = {variant.value.Parameters.MAX_TRACES: 1000000000}
log = xes_importer.apply(dataset + '.xes', parameters=paras)

# read and encode data
activity_names = pm4py.get_attribute_values(log, 'concept:name')
no_act = len(activity_names)
act_map = {}
reverse_map = {}
for a, value in enumerate(activity_names.keys()):
    act_map[value] = a
    reverse_map[a] = value

# add start and end points for DFGs
act_map['start'] = no_act
act_map['end'] = no_act + 1
reverse_map[no_act] = 'start'
reverse_map[no_act + 1] = 'end'
no_act += 2
print('Activity encoding:', act_map)