Example #1
    def test_61(self):
        import os
        from pm4py.objects.log.importer.xes import importer as xes_importer
        log = xes_importer.apply(os.path.join("input_data", "roadtraffic50traces.xes"))

        from pm4py.objects.log.util import get_log_representation
        str_trace_attributes = []
        str_event_attributes = ["concept:name"]
        num_trace_attributes = []
        num_event_attributes = ["amount"]

        data, feature_names = get_log_representation.get_representation(log, str_trace_attributes, str_event_attributes,
                                                                        num_trace_attributes, num_event_attributes)

        data, feature_names = get_log_representation.get_default_representation(log)

        from pm4py.objects.log.util import get_class_representation
        target, classes = get_class_representation.get_class_representation_by_trace_duration(log, 2 * 8640000)

        from sklearn import tree
        clf = tree.DecisionTreeClassifier()
        clf.fit(data, target)

        from pm4py.visualization.decisiontree import visualizer as dectree_visualizer
        gviz = dectree_visualizer.apply(clf, feature_names, classes)
def form_representation_from_dictio_couple(first_cases_repr, second_cases_repr, string_attributes, numeric_attributes,
                                           enable_multiplier=False):
    """
    Gets a log representation, useful for training the decision tree,
    from a couple of dictionaries along with the lists of string
    and numeric attributes to consider, for root cause analysis

    Parameters
    ------------
    first_cases_repr
        First cases representation
    second_cases_repr
        Second cases representation
    string_attributes
        String attributes contained in the log
    numeric_attributes
        Numeric attributes contained in the log
    enable_multiplier
        Enable balancing of classes

    Returns
    ------------
    data
        Matrix representation of the event log
    feature_names
        Array of feature names
    """
    from pm4py.objects.log.util import get_log_representation

    log = form_log_from_dictio_couple(first_cases_repr, second_cases_repr,
                                      enable_multiplier=enable_multiplier)

    data, feature_names = get_log_representation.get_representation(log, [], string_attributes, [], numeric_attributes)

    return data, feature_names
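A minimal usage sketch, assuming the case representations are lists of per-case attribute dictionaries as produced by the surrounding root cause analysis code (the dictionaries below are hypothetical):

first_cases_repr = [{"concept:name": "A", "amount": 50.0},
                    {"concept:name": "A", "amount": 55.0}]
second_cases_repr = [{"concept:name": "B", "amount": 80.0}]

data, feature_names = form_representation_from_dictio_couple(
    first_cases_repr, second_cases_repr,
    string_attributes=["concept:name"], numeric_attributes=["amount"])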
# imports assumed by this snippet; in the pm4py versions that ship
# get_log_representation, EventLog and Trace live in pm4py.objects.log.log
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from pm4py.objects.log.log import EventLog, Trace
from pm4py.objects.log.util import get_log_representation


def apply(log, parameters=None):
    """
    Apply PCA + DBSCAN clustering after creating a representation of the log containing
    the wanted attributes and the wanted succession of attributes

    Parameters
    -----------
    log
        Trace log
    parameters
        Parameters of the algorithm, including:
            pca_components -> Number of the components for the PCA
            dbscan_eps -> EPS value for the DBScan clustering
            str_tr_attr -> String trace attributes to consider in feature representation
            str_ev_attr -> String event attributes to consider in feature representation
            num_tr_attr -> Numeric trace attributes to consider in feature representation
            num_ev_attr -> Numeric event attributes to consider in feature representation
            str_evsucc_attr -> Succession between event attributes to consider in feature representation

    Returns
    -----------
    log_list
        A list containing, for each cluster, a different log
    """
    if parameters is None:
        parameters = {}

    pca_components = parameters.get("pca_components", 3)
    dbscan_eps = parameters.get("dbscan_eps", 0.3)

    log_list = []

    data, feature_names = get_log_representation.get_representation(
        log,
        str_ev_attr=['concept:name'],
        str_tr_attr=[],
        num_ev_attr=[],
        num_tr_attr=[],
        str_evsucc_attr=[])

    pca = PCA(n_components=pca_components)
    pca.fit(data)
    data2d = pca.transform(data)

    db = DBSCAN(eps=dbscan_eps).fit(data2d)
    labels = db.labels_

    already_seen = {}

    for i in range(len(log)):
        if labels[i] not in already_seen:
            already_seen[labels[i]] = len(already_seen)
            log_list.append(EventLog())
        trace = Trace(log[i])
        for attribute in log[i].attributes:
            trace.attributes[attribute] = log[i].attributes[attribute]
        log_list[already_seen[labels[i]]].append(trace)

    return log_list
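A short usage sketch for the clustering entry point above; the parameter values restate the defaults, and the log file is the one used by the other examples:

import os
from pm4py.objects.log.importer.xes import importer as xes_importer

log = xes_importer.apply(os.path.join("input_data", "roadtraffic50traces.xes"))
cluster_logs = apply(log, parameters={"pca_components": 3, "dbscan_eps": 0.3})
for idx, sublog in enumerate(cluster_logs):
    print("cluster", idx, "contains", len(sublog), "traces")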
Example #4
    def test_decisiontree_traceduration(self):
        # to avoid static method warnings in tests,
        # which by construction of the unittest package have to be expressed this way
        self.dummy_variable = "dummy_value"
        log_path = os.path.join("input_data", "roadtraffic50traces.xes")
        log = xes_importer.apply(log_path)
        data, feature_names = get_log_representation.get_representation(log, [], ["concept:name"], [], ["amount"])
        target, classes = get_class_representation.get_class_representation_by_trace_duration(log, 2 * 8640000)
        clf = tree.DecisionTreeClassifier(max_depth=7)
        clf.fit(data, target)
        gviz = dt_vis.apply(clf, feature_names, classes,
                            parameters={dt_vis.Variants.CLASSIC.value.Parameters.FORMAT: "svg"})
        del gviz
Example #5
def test(model, obj, parameters=None):
    """
    Test the prediction model

    Parameters
    ------------
    model
        Prediction model
    obj
        Object to predict (Trace / EventLog)
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    pred
        Result of the prediction (single value / list)
    """
    if parameters is None:
        parameters = {}

    str_tr_attr = model["str_tr_attr"]
    str_ev_attr = model["str_ev_attr"]
    num_tr_attr = model["num_tr_attr"]
    num_ev_attr = model["num_ev_attr"]
    str_evsucc_attr = model["str_evsucc_attr"]
    feature_names = model["feature_names"]
    regr = model["regr"]

    if isinstance(obj, EventLog):
        log = obj
    else:
        log = EventLog([obj])
    data, feature_names = get_log_representation.get_representation(
        log,
        str_tr_attr,
        str_ev_attr,
        num_tr_attr,
        num_ev_attr,
        str_evsucc_attr=str_evsucc_attr,
        feature_names=feature_names)

    pred = regr.predict(data)

    if len(pred) == 1:
        # prediction on a single case
        return pred[0]
    else:
        return pred
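A sketch of the model dictionary this function expects; the keys mirror the lookups above, and a real dictionary is produced by a matching train function such as the one in Example #11 (the regressor, feature names and trace below are placeholders):

model = {
    "str_tr_attr": [], "str_ev_attr": ["concept:name"],
    "num_tr_attr": [], "num_ev_attr": ["amount"],
    "str_evsucc_attr": ["concept:name"],
    "feature_names": stored_feature_names,  # saved at training time
    "regr": fitted_regressor,               # any fitted sklearn regressor
}
pred = test(model, single_trace)  # one Trace -> a single predicted value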
    def test_decisiontree_evattrvalue(self):
        # to avoid static method warnings in tests,
        # which by construction of the unittest package have to be expressed this way
        self.dummy_variable = "dummy_value"
        log_path = os.path.join("input_data", "roadtraffic50traces.xes")
        log = xes_importer.import_log(log_path)
        data, feature_names = get_log_representation.get_representation(
            log, [], ["concept:name"], [], ["amount"])
        target, classes = get_class_representation.get_class_representation_by_str_ev_attr_value_value(
            log, "concept:name")
        clf = tree.DecisionTreeClassifier(max_depth=7)
        clf.fit(data, target)
        gviz = dt_vis_factory.apply(clf,
                                    feature_names,
                                    classes,
                                    parameters={"format": "svg"})
        del gviz
def find_anomalies_with_isolation_forest(log, original_features,
                                         original_log_df, result_path):
    log_features, feature_names_log = get_log_representation.get_representation(
        log,
        str_ev_attr=["concept:name"],
        str_tr_attr=[],
        num_ev_attr=[],
        num_tr_attr=[],
        str_evsucc_attr=["concept:name"])
    log_df = pd.DataFrame(log_features, columns=feature_names_log)

    features = np.union1d(original_features, feature_names_log)

    new_features_train = np.setxor1d(original_features, features)
    new_features_df = pd.DataFrame(columns=new_features_train)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    train_df = pd.concat([original_log_df, new_features_df])
    train_df = train_df.fillna(0)

    model = IsolationForest()
    model.fit(train_df)

    new_features_test = np.setxor1d(feature_names_log, features)
    new_features_df = pd.DataFrame(columns=new_features_test)
    test_df = pd.concat([log_df, new_features_df])  # append removed in pandas 2.0
    test_df = test_df.fillna(0)

    log_df["scores"] = model.decision_function(test_df)
    results = dict()
    results["avg"] = log_df["scores"].mean()
    count_traces = log_df["scores"].count() + 1
    anomalies = log_df[log_df.scores <= 0].shape[0]
    results["anomaly_relative_frequency"] = anomalies / count_traces
    print(results)

    with open(result_path, 'wb') as file:
        pickle.dump(results, file)
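A possible call sequence for the function above (hedged sketch: baseline_log and new_log are placeholders, and the same pm4py/pandas imports as in the function are assumed):

orig_features, orig_names = get_log_representation.get_representation(
    baseline_log, str_ev_attr=["concept:name"], str_tr_attr=[],
    num_ev_attr=[], num_tr_attr=[], str_evsucc_attr=["concept:name"])
orig_df = pd.DataFrame(orig_features, columns=orig_names)

find_anomalies_with_isolation_forest(new_log, orig_names, orig_df,
                                     "anomaly_results.pkl")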
Example #8
def train(log, parameters=None):
    """
    Train the model

    Parameters
    -------------
    log
        Log
    parameters
        Possible parameters of the algorithm, including default_epochs

    Returns
    -------------
    model
        Dictionary containing the trained Keras model and its feature configuration
    """
    if parameters is None:
        parameters = {}
    default_epochs = parameters.get("default_epochs", 50)
    parameters["enable_sort"] = False
    activity_key = parameters.get(constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)
    # log = sorting.sort_timestamp(log, timestamp_key)
    max_len_trace = max([len(trace) for trace in log])
    if "y_orig" in parameters:
        y_orig = parameters["y_orig"]
    else:
        y_orig = get_remaining_time_from_log(log, max_len_trace=max_len_trace,
                                             parameters=parameters)
    y, log_max_value = normalize_remaining_time(y_orig)
    y = np.array(y)
    str_evsucc_attr = [activity_key]
    if "str_ev_attr" in parameters:
        str_tr_attr = parameters[
            "str_tr_attr"] if "str_tr_attr" in parameters else []
        str_ev_attr = parameters[
            "str_ev_attr"] if "str_ev_attr" in parameters else []
        num_tr_attr = parameters[
            "num_tr_attr"] if "num_tr_attr" in parameters else []
        num_ev_attr = parameters[
            "num_ev_attr"] if "num_ev_attr" in parameters else []
    else:
        str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = attributes_filter.select_attributes_from_log_for_tree(
            log)
        if activity_key not in str_ev_attr:
            str_ev_attr.append(activity_key)

    data, feature_names = get_log_representation.get_representation(
        log,
        str_tr_attr,
        str_ev_attr,
        num_tr_attr,
        num_ev_attr,
        str_evsucc_attr=str_evsucc_attr)
    X = get_X_from_log(log, feature_names, max_len_trace)
    in_out_neurons = X.shape[2]
    hidden_neurons = min(int(in_out_neurons * 7.5), 50)
    input_shape = (X.shape[1], X.shape[2])
    model = Sequential()
    model.add(
        LSTM(hidden_neurons, return_sequences=False, input_shape=input_shape))
    model.add(Dense(in_out_neurons))
    model.add(Activation("linear"))
    model.compile(loss="mean_squared_error", optimizer="rmsprop")
    model.fit(X,
              y,
              batch_size=X.shape[1],
              epochs=default_epochs,  # Keras 2 renamed "nb_epoch" to "epochs"
              validation_split=0.2)
    return {
        "str_tr_attr": str_tr_attr,
        "str_ev_attr": str_ev_attr,
        "num_tr_attr": num_tr_attr,
        "num_ev_attr": num_ev_attr,
        "str_evsucc_attr": str_evsucc_attr,
        "feature_names": feature_names,
        "regr": model,
        "max_len_trace": max_len_trace,
        "log_max_value": log_max_value,
        "variant": "keras_rnn"
    }
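A hedged prediction sketch for the dictionary returned above, reusing the same get_X_from_log preprocessing as training; it assumes normalize_remaining_time scaled the targets by log_max_value:

model = train(log, parameters={"default_epochs": 20})
X_new = get_X_from_log(new_log, model["feature_names"], model["max_len_trace"])
y_pred = model["regr"].predict(X_new)
# undo the target scaling (assumption: a plain division was applied in training)
y_pred_seconds = y_pred * model["log_max_value"]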
precision = precision_evaluator.apply(
    original_log,
    model,
    initial_marking,
    final_marking,
    variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
print("Precision of: " + str(precision))
fscore = 2 * precision * fitness / (precision + fitness)
print("Fscore of: " + str(fscore))

generalization = generalization_evaluator.apply(original_log, model,
                                                initial_marking, final_marking)
print("Generalization of: " + str(generalization))

log_features, feature_names_log = get_log_representation.get_representation(
    original_log,
    str_ev_attr=["concept:name"],
    str_tr_attr=[],
    num_ev_attr=[],
    num_tr_attr=[],
    str_evsucc_attr=["concept:name"])
log_df = pd.DataFrame(log_features, columns=feature_names_log)

# a separate name avoids overwriting the Petri net "model" used above
iso_forest = IsolationForest()
iso_forest.fit(log_df)

log_df["scores"] = iso_forest.decision_function(log_df)
count_traces = log_df["scores"].count() + 1
anomalies = log_df[log_df.scores <= 0].shape[0]
anomaly_relative_frequency = anomalies / count_traces
print("Relative frequency of anomalies: " + str(anomaly_relative_frequency))
Example #10
from pm4py.visualization.process_tree import visualizer as pt_visualizer
gviz = pt_visualizer.apply(tree, parameters={pt_visualizer.Variants.WO_DECORATION.value.Parameters.FORMAT: "png"})
pt_visualizer.view(gviz)


# Decision Tree
import os
from pm4py.objects.log.importer.xes import importer as xes_importer
log = xes_importer.apply(os.path.join("tests", "input_data", "roadtraffic50traces.xes"))

from pm4py.objects.log.util import get_log_representation
str_trace_attributes = []
str_event_attributes = ["concept:name"]
num_trace_attributes = []
num_event_attributes = ["amount"]
data, feature_names = get_log_representation.get_representation(log, str_trace_attributes, str_event_attributes, num_trace_attributes, num_event_attributes)

data, feature_names = get_log_representation.get_default_representation(log)
import pandas as pd
dataframe = pd.DataFrame(data, columns=feature_names)
dataframe

dataframe.to_csv("features.csv", index=False)

# the class labels are needed before fitting; mirror Example #1 and derive
# them from the trace duration
from pm4py.objects.log.util import get_class_representation
target, classes = get_class_representation.get_class_representation_by_trace_duration(log, 2 * 8640000)

from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(data, target)

from pm4py.visualization.decisiontree import visualizer as dectree_visualizer
gviz = dectree_visualizer.apply(clf, feature_names, classes)
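To actually render or store the tree, the decision tree visualizer also provides view and save:

dectree_visualizer.view(gviz)                       # open with the default viewer
dectree_visualizer.save(gviz, "decision_tree.png")  # or write to disk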
Example #11
def train(log, parameters=None):
    """
    Train the prediction model

    Parameters
    -----------
    log
        Event log
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    model
        Trained model
    """
    if parameters is None:
        parameters = {}

    parameters["enable_sort"] = False
    activity_key = parameters.get(constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(constants.PARAMETER_CONSTANT_TIMESTAMP_KEY,
                                   xes.DEFAULT_TIMESTAMP_KEY)
    business_hours = parameters.get("business_hours", False)
    worktiming = parameters.get("worktiming", [7, 17])
    weekends = parameters.get("weekends", [6, 7])

    y_orig = parameters.get("y_orig")

    log = sorting.sort_timestamp(log, timestamp_key)

    str_evsucc_attr = [activity_key]
    if "str_ev_attr" in parameters:
        str_tr_attr = parameters[
            "str_tr_attr"] if "str_tr_attr" in parameters else []
        str_ev_attr = parameters[
            "str_ev_attr"] if "str_ev_attr" in parameters else []
        num_tr_attr = parameters[
            "num_tr_attr"] if "num_tr_attr" in parameters else []
        num_ev_attr = parameters[
            "num_ev_attr"] if "num_ev_attr" in parameters else []
    else:
        str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = attributes_filter.select_attributes_from_log_for_tree(
            log)
        if activity_key not in str_ev_attr:
            str_ev_attr.append(activity_key)

    max_trace_length = max(len(x) for x in log)

    if max_trace_length == 1:
        # single-event traces: the log can be used directly, without prefixes
        data, feature_names = get_log_representation.get_representation(
            log,
            str_tr_attr,
            str_ev_attr,
            num_tr_attr,
            num_ev_attr,
            str_evsucc_attr=str_evsucc_attr)
        ext_log = log
    else:
        ext_log, change_indexes = get_log_with_log_prefixes(log)
        data, feature_names = get_log_representation.get_representation(
            ext_log,
            str_tr_attr,
            str_ev_attr,
            num_tr_attr,
            num_ev_attr,
            str_evsucc_attr=str_evsucc_attr)

    if y_orig is not None:
        remaining_time = [y for x in y_orig for y in x]
    else:
        if business_hours:
            remaining_time = []
            for trace in ext_log:
                if trace:
                    timestamp_et = trace[-1][timestamp_key]
                    timestamp_st = trace[0][timestamp_key]

                    bh = BusinessHours(timestamp_st.replace(tzinfo=None),
                                       timestamp_et.replace(tzinfo=None),
                                       worktiming=worktiming,
                                       weekends=weekends)
                    remaining_time.append(bh.getseconds())
                else:
                    remaining_time.append(0)
        else:
            remaining_time = []
            for trace in ext_log:
                if trace:
                    remaining_time.append(
                        (trace[-1][timestamp_key] -
                         trace[0][timestamp_key]).total_seconds())
                else:
                    remaining_time.append(0)
    regr = ElasticNet(max_iter=10000, l1_ratio=0.7)
    regr.fit(data, remaining_time)

    return {
        "str_tr_attr": str_tr_attr,
        "str_ev_attr": str_ev_attr,
        "num_tr_attr": num_tr_attr,
        "num_ev_attr": num_ev_attr,
        "str_evsucc_attr": str_evsucc_attr,
        "feature_names": feature_names,
        "remaining_time": remaining_time,
        "regr": regr,
        "variant": "elasticnet"
    }
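The returned dictionary is the model object consumed by the test function of Example #5; a minimal chaining sketch with placeholder logs:

model = train(historical_log)
remaining_one = test(model, running_trace)   # a single Trace yields one value
remaining_all = test(model, evaluation_log)  # an EventLog yields a list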