Example #1
 def test_filtering_attributes_events(self):
     # dummy instance attribute to avoid static-method warnings, which the
     # unittest framework forces us to silence this way
     self.dummy_variable = "dummy_value"
     input_log = os.path.join(INPUT_DATA_DIR, "running-example.xes")
     log = xes_importer.import_log(input_log)
     # keep only the "reject request" events...
     log1 = attributes_filter.apply_events(log, ["reject request"],
                                           parameters={"positive": True})
     # ...and, symmetrically, drop those same events
     log2 = attributes_filter.apply_events(log, ["reject request"],
                                           parameters={"positive": False})
     del log1
     del log2
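For reference, a minimal sketch of the semantics this test exercises, reusing the `log` imported above (and assuming `pm4py.util.constants` is available, as in the later examples on this page): the "positive" parameter decides whether matching events are kept or dropped, and the attribute key defaults to the activity attribute "concept:name" when not given.

# Hedged sketch: reuses `log` from the test above; passing the key explicitly
# matches the default behaviour of filtering on "concept:name".
kept = attributes_filter.apply_events(
    log, ["reject request"],
    parameters={constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name",
                "positive": True})
dropped = attributes_filter.apply_events(
    log, ["reject request"],
    parameters={constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name",
                "positive": False})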
Example #2
def apply(dataframe, filter, parameters=None):
    """
    Apply a filter to the current dataframe (attributes filter)

    Parameters
    ------------
    dataframe
        Pandas dataframe containing the event log
    filter
        Filter to apply, indexed as filter[1][0] (attribute key)
        and filter[1][1] (list of admitted values)
    parameters
        Parameters of the algorithm

    Returns
    ------------
    dataframe
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = filter[1][0]

    return attributes_filter.apply_events(dataframe,
                                          filter[1][1],
                                          parameters=parameters)
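The indexing above implies a nested filter structure: `filter[1][0]` is the attribute key and `filter[1][1]` is the list of admitted values. A hedged usage sketch with a hypothetical filter tuple built to that shape (the first tuple element is not consumed by this function):

# Hypothetical filter shape, inferred from the indexing above:
# (filter_type, (attribute_key, admitted_values))
example_filter = ("attributes", ("concept:name", ["reject request"]))
filtered_df = apply(dataframe, example_filter)  # `dataframe` assumed already loaded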
Example #3
def apply(log, list_activities, sample_size, parameters=None):
    """
    Finds the performance spectrum given a log
    and a list of activities

    Parameters
    -------------
    log
        Log
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including the activity key and the timestamp key

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters.get(constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(constants.PARAMETER_CONSTANT_TIMESTAMP_KEY,
                                   xes.DEFAULT_TIMESTAMP_KEY)

    log = sorting.sort_timestamp_log(log, timestamp_key=timestamp_key)
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    log = attributes_filter.apply_events(log,
                                         list_activities,
                                         parameters=parameters)

    points = []

    for trace in log:
        for i in range(len(trace) - len(list_activities) + 1):
            acti_comb = [
                event[activity_key]
                for event in trace[i:i + len(list_activities)]
            ]

            if acti_comb == list_activities:
                timest_comb = [
                    event[timestamp_key].timestamp()
                    for event in trace[i:i + len(list_activities)]
                ]

                points.append(timest_comb)

    points = sorted(points, key=lambda x: x[0])

    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points
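The nested loop slides a window of `len(list_activities)` consecutive events over each trace and keeps a window's timestamps only when its activities equal `list_activities` exactly and in order. A standalone toy illustration of the matching step, with hypothetical event dicts:

# Toy illustration of the window match (standalone, hypothetical data).
trace = [{"concept:name": a} for a in ["A", "B", "A", "B", "C"]]
list_activities = ["A", "B"]
matches = [
    i for i in range(len(trace) - len(list_activities) + 1)
    if [e["concept:name"] for e in trace[i:i + len(list_activities)]] == list_activities
]
print(matches)  # -> [0, 2]: the two windows where "A" is followed by "B"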
Example #4
def filter_events(log, starts, ends, parameters):
    log_classifier = attributes_filter.apply_events(
        log,
        starts + ends,
        parameters={
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
            parameters[performance_constants.EVENT_CLASSIFIER],
            'positive':
            True
        })
    return log_classifier
Example #5
def filter_per_activities_to_keep(chat_id, activities):
    log = get_current_log(chat_id)
    tracefilter_log_pos = attributes_filter.apply_events(
        log,
        activities,
        parameters={
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name",
            "positive": True
        })
    xes_exporter.export_log(tracefilter_log_pos,
                            get_log_filename(chat_id, True),
                            parameters={"compress": False})
    set_property(chat_id, "current_log", get_log_filename(chat_id, True))
Example #6
def clean_lifecycle_events(log):
    '''Receives a log and checks whether it contains multiple
    lifecycle events per activity. Returns a log that
    only contains the start events. If the log has no
    lifecycle events, it is returned unchanged.'''

    try:
        if 'lifecycle:transition' in log[0][0].keys():
            if len(set([e['lifecycle:transition'] for e in log[0]])) > 1:
                log = attributes_filter.apply_events(
                    log, ["start"],
                    parameters={
                        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
                        "lifecycle:transition",
                        "positive": True
                    })
    except Exception as e:
        print('An exception occurred during cleaning of lifecycle events')
        print(e)

    return log
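A quick illustration of the intended effect, assuming the log uses the standard XES lifecycle values: a trace that interleaves "start" and "complete" events is reduced to its "start" events only.

# Hedged illustration with a schematic trace of (activity, lifecycle:transition):
#   before: [("A", "start"), ("A", "complete"), ("B", "start"), ("B", "complete")]
#   after:  [("A", "start"), ("B", "start")]
cleaned_log = clean_lifecycle_events(log)  # `log` assumed loaded elsewhere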
Example #7
File: GRM.py Project: fau-is/grm
def filter_log_by_relevance(topK, log, relevance_scores):
    log_new = EventLog()
    for label in relevance_scores:
        topK = min(topK, len(relevance_scores[label]['scores']))
        relevance_scores[label]['scores'] = dict(
            sorted(relevance_scores[label]['scores'].items(),
                   key=lambda x: x[1],
                   reverse=True)[:topK])
        log_dummy = filter_log_by_caseid(log,
                                         relevance_scores[label]['traces'])
        log_dummy = attributes_filter.apply_events(
            log_dummy,
            relevance_scores[label]['scores'].keys(),
            parameters={
                attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
                "concept:name",
                "positive": True
            })
        log_new._list = log_new._list + log_dummy._list
    return relevance_scores, log_new
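The accesses above imply a `relevance_scores` layout of the form {label: {'scores': {activity: score}, 'traces': [case ids]}}. A hedged sketch of a call built to that (assumed) shape, using this project's own helpers:

# Hypothetical relevance_scores, shaped as the accesses above require.
relevance_scores = {
    "accepted": {
        "scores": {"A_SUBMITTED": 0.91, "A_DECLINED": 0.17},
        "traces": ["case_1", "case_7"],
    }
}
scores, top_k_log = filter_log_by_relevance(1, log, relevance_scores)  # `log` assumed loaded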
Example #8
    - Filtering: -
    - Used Columns: -
"""


from eval.evaluation import run_experiment
from grm import preprocessing
from pm4py.algo.filtering.log.attributes import attributes_filter


log_file = "bpi2017.csv"
name_of_case_id = "Case ID"
name_of_activity = "Activity"
name_of_timestamp = "Complete Timestamp"
name_of_label = "Accepted"
hyper_params = {'num_epochs': 1000}
k = 10

log = preprocessing.import_data("../data", log_file, separator=";", quote='', case_id=name_of_case_id,
                                activity=name_of_activity,
                                time_stamp=name_of_timestamp, target=name_of_label)

activities = attributes_filter.get_attribute_values(log, "concept:name")

# keep only the activities representing work items (prefixed with 'W_')
w_activities = [i for i in activities.keys() if i.startswith('W_')]
log_filtered = attributes_filter.apply_events(log, w_activities, parameters={
    attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name", "positive": True})

run_experiment(log_filtered, hyper_params=hyper_params, k=k, ml_flow_run_name_prefix=log_file)
Example #9
                                log_file,
                                separator=";",
                                quote='',
                                case_id=name_of_case_id,
                                activity=name_of_activity,
                                time_stamp=name_of_timestamp,
                                target=name_of_label)

activities_2017 = attributes_filter.get_attribute_values(log, "concept:name")

# keep only the activities representing work items (prefixed with 'W_')
w_activities_2017 = [i for i in activities_2017.keys() if i.startswith('W_')]
fil_log_17 = attributes_filter.apply_events(
    log,
    w_activities_2017,
    parameters={
        attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name",
        "positive": True
    })

# instances
print("2017 instances: ", len(fil_log_17))

# variants
variants = variants_filter.get_variants(fil_log_17)
print("2017 variants: ", len(variants))

# instances per variant
sum_val = 0
for value in variants.values():
    sum_val += len(value)
Example #10
def apply(df, discovery_algorithm=discover_inductive, parameters=None):
    if parameters is None:
        parameters = {}

    allowed_activities = parameters.get("allowed_activities")
    debug = parameters.get("debug", True)

    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
            df.type = "exploded"
    except Exception:
        # the dataframe may not expose a "type" attribute at all
        pass

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    min_node_freq = parameters.get("min_node_freq", 0)
    min_edge_freq = parameters.get("min_edge_freq", 0)

    df = clean_frequency.apply(df, min_node_freq)
    df = clean_arc_frequency.apply(df, min_edge_freq)

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    persps = [x for x in df.columns if not x.startswith("event_")]

    ret = {
        "nets": {},
        "act_count": {},
        "replay": {},
        "group_size_hist": {},
        "act_count_replay": {},
        "group_size_hist_replay": {},
        "aligned_traces": {},
        "place_fitness_per_trace": {},
        "aggregated_statistics_frequency": {},
        "aggregated_statistics_performance_min": {},
        "aggregated_statistics_performance_max": {},
        "aggregated_statistics_performance_median": {},
        "aggregated_statistics_performance_mean": {},
    }

    diff_log = 0
    diff_model = 0
    diff_token_replay = 0
    diff_performance_annotation = 0
    diff_basic_stats = 0

    for persp in persps:
        aa = time.time()
        if debug:
            print(persp, "getting log")
        log = algorithm.apply(df, persp, parameters=parameters)
        if debug:
            print(len(log))

        if allowed_activities is not None:
            if persp not in allowed_activities:
                continue
            filtered_log = attributes_filter.apply_events(
                log, allowed_activities[persp])
        else:
            filtered_log = log
        bb = time.time()

        diff_log += (bb - aa)

        # filtered_log = variants_filter.apply_auto_filter(deepcopy(filtered_log), parameters={"decreasingFactor": 0.5})

        if debug:
            print(len(log))
            print(persp, "got log")

        cc = time.time()
        #net, im, fm = inductive_miner.apply(filtered_log)
        net, im, fm = discovery_algorithm(filtered_log)
        """if persp == "items":
            trans_map = {t.label:t for t in net.transitions}
            source_place_it = list(trans_map["item out of stock"].in_arcs)[0].source
            target_place_re = list(trans_map["reorder item"].out_arcs)[0].target
            skip_trans_1 = PetriNet.Transition(str(uuid.uuid4()), None)
            net.transitions.add(skip_trans_1)
            add_arc_from_to(source_place_it, skip_trans_1, net)
            add_arc_from_to(skip_trans_1, target_place_re, net)"""

        #net = reduce_petri_net(net)
        dd = time.time()

        diff_model += (dd - cc)

        # net, im, fm = alpha_miner.apply(filtered_log)
        if debug:
            print(persp, "got model")

        xx1 = time.time()
        activ_count = algorithm.apply(df,
                                      persp,
                                      variant="activity_occurrence",
                                      parameters=parameters)
        if debug:
            print(persp, "got activ_count")
        xx2 = time.time()

        ee = time.time()
        variants_idx = variants_module.get_variants_from_log_trace_idx(log)
        # variants = variants_module.convert_variants_trace_idx_to_trace_obj(log, variants_idx)
        # parameters_tr = {PARAM_ACTIVITY_KEY: "concept:name", "variants": variants}

        if debug:
            print(persp, "got variants")

        aligned_traces, place_fitness_per_trace, transition_fitness_per_trace, notexisting_activities_in_model = tr_factory.apply(
            log,
            net,
            im,
            fm,
            parameters={
                "enable_pltr_fitness": True,
                "disable_variants": True
            })

        if debug:
            print(persp, "done tbr")

        element_statistics = performance_map.single_element_statistics(
            log, net, im, aligned_traces, variants_idx)

        if debug:
            print(persp, "done element_statistics")
        ff = time.time()

        diff_token_replay += (ff - ee)

        aggregated_statistics = performance_map.aggregate_statistics(
            element_statistics)

        if debug:
            print(persp, "done aggregated_statistics")

        element_statistics_performance = performance_map.single_element_statistics(
            log, net, im, aligned_traces, variants_idx)

        if debug:
            print(persp, "done element_statistics_performance")

        gg = time.time()

        aggregated_statistics_performance_min = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="min")
        aggregated_statistics_performance_max = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="max")
        aggregated_statistics_performance_median = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="median")
        aggregated_statistics_performance_mean = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="mean")

        hh = time.time()

        diff_performance_annotation += (hh - ee)

        if debug:
            print(persp, "done aggregated_statistics_performance")

        group_size_hist = algorithm.apply(df,
                                          persp,
                                          variant="group_size_hist",
                                          parameters=parameters)

        if debug:
            print(persp, "done group_size_hist")

        occurrences = {}
        for trans in transition_fitness_per_trace:
            occurrences[trans.label] = set()
            for trace in transition_fitness_per_trace[trans]["fit_traces"]:
                if trace not in transition_fitness_per_trace[trans][
                        "underfed_traces"]:
                    case_id = trace.attributes["concept:name"]
                    for event in trace:
                        if event["concept:name"] == trans.label:
                            occurrences[trans.label].add(
                                (case_id, event["event_id"]))
            # print(transition_fitness_per_trace[trans])

        len_different_ids = {}
        for act in occurrences:
            len_different_ids[act] = len(set(x[1] for x in occurrences[act]))

        eid_acti_count = {}
        for act in occurrences:
            eid_acti_count[act] = {}
            for x in occurrences[act]:
                if x[0] not in eid_acti_count[act]:
                    eid_acti_count[act][x[0]] = 0
                eid_acti_count[act][x[0]] += 1
            eid_acti_count[act] = sorted(list(eid_acti_count[act].values()))

        ii = time.time()

        diff_basic_stats += (ii - hh) + (xx2 - xx1)

        ret["nets"][persp] = [net, im, fm]
        ret["act_count"][persp] = activ_count
        ret["aligned_traces"][persp] = aligned_traces
        ret["place_fitness_per_trace"][persp] = place_fitness_per_trace
        ret["aggregated_statistics_frequency"][persp] = aggregated_statistics
        ret["aggregated_statistics_performance_min"][
            persp] = aggregated_statistics_performance_min
        ret["aggregated_statistics_performance_max"][
            persp] = aggregated_statistics_performance_max
        ret["aggregated_statistics_performance_median"][
            persp] = aggregated_statistics_performance_median
        ret["aggregated_statistics_performance_mean"][
            persp] = aggregated_statistics_performance_mean

        ret["replay"][persp] = aggregated_statistics
        ret["group_size_hist"][persp] = group_size_hist
        ret["act_count_replay"][persp] = len_different_ids
        ret["group_size_hist_replay"][persp] = eid_acti_count

    ret["computation_statistics"] = {
        "diff_log": diff_log,
        "diff_model": diff_model,
        "diff_token_replay": diff_token_replay,
        "diff_performance_annotation": diff_performance_annotation,
        "diff_basic_stats": diff_basic_stats
    }

    return ret
Example #11
                                target=name_of_label)

# remove the most relevant activity from each trace
model_path = '../best_models/sp2020/2020-05-05-14-59_best_model.pickle'
activities = get_activities(log)
grm_model = GRM.GRM(log, activities, restore_file=model_path)

filtered_log = EventLog()
for trace in log:
    case_id, pred, rel_scores = grm_model.predict(trace)
    if len(rel_scores) > 1:
        most_relevant = max(rel_scores.items(), key=operator.itemgetter(1))[0]
        log_trace = attributes_filter.apply_events(
            log, [case_id],
            parameters={
                attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
                name_of_case_id,
                "positive": True
            })

        trace_without_most = attributes_filter.apply_events(
            log_trace, [most_relevant],
            parameters={
                attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
                "concept:name",
                "positive": False
            })

        trace_without_most = trace_without_most[0]

        filtered_log._list.append(trace_without_most)
Example #12
tracefilter_log_pos = attributes_filter.apply(
    log, ["Resource10"],
    parameters={
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "org:resource",
        "positive": True
    })
tracefilter_log_neg = attributes_filter.apply(
    log, ["Resource10"],
    parameters={
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "org:resource",
        "positive": False
    })

eventsfilter_log = attributes_filter.apply_events(
    log, ["Resource10"],
    parameters={
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "org:resource",
        "positive": True
    })
eventsfilter_log
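The cells above rely on a distinction worth spelling out: `attributes_filter.apply` filters at trace level (keeping or dropping whole traces based on whether they contain a matching event), whereas `apply_events` filters at event level (keeping or dropping individual events inside each trace). A schematic comparison under that reading:

# Trace level: every trace with at least one Resource10 event survives intact.
# Event level: traces are reduced to their Resource10 events.
print(len(tracefilter_log_pos))  # number of traces containing Resource10
print(len(eventsfilter_log))     # traces stripped down to Resource10 events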

from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.util import constants
filtered_log = attributes_filter.apply_auto_filter(
    log,
    parameters={
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "concept:name",
        "decreasingFactor": 0.6
    })
filtered_log

# filter on numeric attributes
Example #13
- Pre-processing
    - Filtering: -
    - Used Columns: -
"""

from eval.evaluation import run_experiment
from grm import preprocessing
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.objects.log.util import sampling

logfile = "BPI2020_PermitLog.csv"
name_of_case_id = "Case ID"
name_of_activity = "Activity"
name_of_timestamp = "Complete Timestamp"
name_of_label = "(case) Overspent"
hyper_params = {'num_epochs': 1000}
k = 10

log = preprocessing.import_data("../data", logfile, separator=";", quote='', case_id=name_of_case_id,
                                activity=name_of_activity,
                                time_stamp=name_of_timestamp, target=name_of_label)

# log = sampling.sample(log, n=100)

# keep only the events executed by staff members (Resource = 'STAFF MEMBER')
log_filtered = attributes_filter.apply_events(log, ['STAFF MEMBER'],
                                              parameters={attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
                                                              "Resource", "positive": True})

run_experiment(log_filtered, hyper_params=hyper_params, k=k, ml_flow_run_name_prefix=logfile)
Example #14
    if ENABLE_TESTS:
        # TEST 9: discover performance spectrum from dataframe
        t0 = time.time()
        pspectrum.apply(roadtraffic_df, ["Create Fine", "Send Fine"])
        t1 = time.time()
        T9[0] = (t1 - t0)
        T9[2] = math.ceil(T9[1] / (T9[0] + 0.00000001) * 1000.0)
        print(
            "TEST 9 - Discover peformance spectrum from dataframe - %.5f s (test score: %d)"
            % (T9[0], T9[2]))

    if ENABLE_TESTS:
        # TEST 10: filter bpic2017 event log on event attributes
        t0 = time.time()
        new_log = attributes_filter.apply_events(bpic2017_log,
                                                 ["O_Create Offer"])
        t1 = time.time()
        T10[0] = (t1 - t0)
        T10[2] = math.ceil(T10[1] / (T10[0] + 0.00000001) * 1000.0)
        print(
            "TEST 10 - Filter bpic2017 event log on event attributes - %.5f s (test score: %d)"
            % (T10[0], T10[2]))

    if DEBUG:
        F = open("debug.csv", "a")
        F.write("%.4f;%.4f;%.4f;%.4f;%.4f;%.4f;%.4f;%.4f;%.4f;%.4f\n" %
                (T1[0], T2[0], T3[0], T4[0], T5[0], T6[0], T7[0], T8[0], T9[0],
                 T10[0]))
        F.close()

    scores = [