Exemple #1
0
def filter_on_case_size(trace_log, min_case_size=2, max_case_size=None):
    """
    Get only traces in the log with a given size

    Parameters
    -----------
    trace_log
        Trace log
    min_case_size
        Minimum desidered size of traces
    max_case_size
        Maximum desidered size of traces

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if max_case_size is not None:
        filtered_log = TraceLog([
            trace for trace in trace_log
            if min_case_size <= len(trace) <= max_case_size
        ])
    else:
        filtered_log = TraceLog(
            [trace for trace in trace_log if len(trace) >= min_case_size])
    return filtered_log
def sample_tracelog(trace_log, no_traces=100):
    """
    Randomly sample a fixed number of traces from the original log

    Parameters
    -----------
    trace_log
        Trace log
    no_traces
        Number of traces that the sample should have

    Returns
    -----------
    newLog
        Filtered log
    """
    new_log = TraceLog(attributes=trace_log.attributes, extensions=trace_log.extensions, globals=trace_log._omni,
                       classifiers=trace_log.classifiers)
    set_traces = set()
    for i in range(0, min(no_traces, len(trace_log._list))):
        set_traces.add(random.randrange(0, len(trace_log._list)))
    set_traces = list(set_traces)
    for trace in set_traces:
        new_log.append(copy(trace_log._list[trace]))
    return new_log
def generate_log(pt, no_traces=100):
    """
    Generate a log out of a process tree

    Parameters
    ------------
    pt
        Process tree
    no_traces
        Number of traces contained in the process tree

    Returns
    ------------
    log
        Trace log object
    """
    log = TraceLog()

    for i in range(no_traces):
        ex_seq = execute(pt)
        ex_seq_labels = pt_util.project_execution_sequence_to_labels(ex_seq)

        trace = Trace()
        trace.attributes[xes.DEFAULT_NAME_KEY] = str(i)
        for label in ex_seq_labels:
            event = Event()
            event[xes.DEFAULT_NAME_KEY] = label
            trace.append(event)
        log.append(trace)

    return log
def filter_log_by_start_activities(start_activities,
                                   variants,
                                   vc,
                                   threshold,
                                   activity_key="concept:name"):
    """
    Keep only variants of the log with a start activity which number of occurrences is above the threshold
    
    Parameters
    ----------
    start_activities
        Dictionary of start attributes associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (remove variants having start attributes which number of occurrences is below the threshold
    activity_key
        (If specified) Specify the activity key in the log (default concept:name)
    
    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = TraceLog()
    fvsa = variants[vc[0][0]][0][0][activity_key]
    for variant in variants:
        vsa = variants[variant][0][0][activity_key]
        if vsa in start_activities:
            if vsa == fvsa or start_activities[vsa] >= threshold:
                for trace in variants[variant]:
                    filtered_log.append(trace)
    return filtered_log
def filter_log_by_variants_percentage(trace_log,
                                      variants,
                                      variants_percentage=0.0):
    """
    Filter the log by variants percentage

    Parameters
    ----------
    trace_log
        Trace log
    variants
        Dictionary with variant as the key and the list of traces as the value
    variants_percentage
        Percentage of variants that should be kept (the most common variant is always kept)

    Returns
    ----------
    filtered_log
        Filtered trace log
    """
    filtered_log = TraceLog()
    no_of_traces = len(trace_log)
    variant_count = get_variants_sorted_by_count(variants)
    already_added_sum = 0

    for i in range(len(variant_count)):
        variant = variant_count[i][0]
        varcount = variant_count[i][1]
        percentage_already_added = already_added_sum / no_of_traces
        if already_added_sum == 0 or percentage_already_added < variants_percentage:
            for trace in variants[variant]:
                filtered_log.append(trace)
            already_added_sum = already_added_sum + varcount

    return filtered_log
def apply(trace_log, admitted_variants, parameters=None):
    """
    Filter log keeping/removing only provided variants

    Parameters
    -----------
    trace_log
        Trace log object
    admitted_variants
        Admitted variants
    parameters
        Parameters of the algorithm, including:
            activity_key -> Attribute identifying the activity in the log
            positive -> Indicate if events should be kept/removed
    """

    if parameters is None:
        parameters = {}
    positive = parameters["positive"] if "positive" in parameters else True
    variants = get_variants(trace_log, parameters=parameters)
    trace_log = TraceLog()
    for variant in variants:
        if (positive and variant in admitted_variants) or (
                not positive and variant not in admitted_variants):
            for trace in variants[variant]:
                trace_log.append(trace)
    return trace_log
def filter_log_by_paths(trace_log,
                        paths,
                        variants,
                        vc,
                        threshold,
                        attribute_key="concept:name"):
    """
    Keep only paths which number of occurrences is above the threshold (or they belong to the first variant)

    Parameters
    ----------
    trace_log
        Trace log
    paths
        Dictionary of paths associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (remove paths which number of occurrences is below the threshold)
    attribute_key
        (If specified) Specify the attribute key to use (default concept:name)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = TraceLog()
    fvft = variants[vc[0][0]][0]
    fvp = set()
    for i in range(0, len(fvft) - 1):
        path = fvft[i][attribute_key] + "," + fvft[i + 1][attribute_key]
        fvp.add(path)
    for trace in trace_log:
        new_trace = Trace()
        jj = 0
        if len(trace) > 0:
            new_trace.append(trace[0])
            for j in range(1, len(trace) - 1):
                jj = j
                if j >= len(trace):
                    break
                if attribute_key in trace[j] and attribute_key in trace[j + 1]:
                    path = trace[j][attribute_key] + "," + trace[
                        j + 1][attribute_key]
                    if path in paths:
                        if path in fvp or paths[path] >= threshold:
                            new_trace.append(trace[j])
                            new_trace.append(trace[j + 1])
        if len(trace) > 1 and not jj == len(trace):
            new_trace.append(trace[-1])
        if len(new_trace) > 0:
            filtered_log.append(new_trace)
    return filtered_log
def filter_on_case_performance(trace_log, inf_perf, sup_perf, parameters=None):
    """
    Gets a filtered trace log keeping only traces that satisfy the given performance requirements

    Parameters
    ------------
    trace_log
        Trace log
    inf_perf
        Lower bound on the performance
    sup_perf
        Upper bound on the performance
    parameters
        Parameters

    Returns
    -----------
    filtered_log
        Filtered trace log
    """
    if parameters is None:
        parameters = {}
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
    filtered_log = TraceLog([trace for trace in trace_log if satisfy_perf(trace, inf_perf, sup_perf, timestamp_key)])
    return filtered_log
Exemple #9
0
def filter_traces_intersecting(log, dt1, dt2, parameters=None):
    """
    Filter traces intersecting the given interval

    Parameters
    -----------
    log
        Trace log
    dt1
        Lower bound to the interval
    dt2
        Upper bound to the interval
    parameters
        Possible parameters of the algorithm, including:
            timestamp_key -> Attribute to use as timestamp

    Returns
    ------------
    filtered_log
        Filtered trace log
    """
    if parameters is None:
        parameters = {}
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    filtered_log = TraceLog([
        trace for trace in log
        if is_intersecting(trace, dt1, dt2, timestamp_key)
    ])
    return filtered_log
Exemple #10
0
def apply(trace_log, values, parameters=None):
    """
    Filter log by keeping only traces that has/has not events with an attribute value that belongs to the provided
    values list

    Parameters
    -----------
    trace_log
        Trace log
    values
        Allowed attributes
    parameters
        Parameters of the algorithm, including:
            activity_key -> Attribute identifying the activity in the log
            positive -> Indicate if events should be kept/removed

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    attribute_key = parameters[
        PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
    positive = parameters["positive"] if "positive" in parameters else True

    filtered_log = TraceLog()
    for trace in trace_log:
        new_trace = Trace()

        found = False
        for j in range(len(trace)):
            if attribute_key in trace[j]:
                attribute_value = trace[j][attribute_key]
                if attribute_value in values:
                    found = True

        if (found and positive) or (not found and not positive):
            new_trace = trace

        if len(new_trace) > 0:
            filtered_log.append(new_trace)
    return filtered_log
Exemple #11
0
def filter_log_by_attributes_threshold(trace_log,
                                       attributes,
                                       variants,
                                       vc,
                                       threshold,
                                       attribute_key="concept:name"):
    """
    Keep only attributes which number of occurrences is above the threshold (or they belong to the first variant)

    Parameters
    ----------
    trace_log
        Trace log
    attributes
        Dictionary of attributes associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (remove attributes which number of occurrences is below the threshold)
    attribute_key
        (If specified) Specify the activity key in the log (default concept:name)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = TraceLog()
    fva = [
        x[attribute_key] for x in variants[vc[0][0]][0] if attribute_key in x
    ]
    for trace in trace_log:
        new_trace = Trace()
        for j in range(len(trace)):
            if attribute_key in trace[j]:
                attribute_value = trace[j][attribute_key]
                if attribute_value in attributes:
                    if attribute_value in fva or attributes[
                            attribute_value] >= threshold:
                        new_trace.append(trace[j])
        if len(new_trace) > 0:
            filtered_log.append(new_trace)
    return filtered_log
Exemple #12
0
def form_fake_log(prefixes_keys, activity_key=xes_util.DEFAULT_NAME_KEY):
    """
    Form fake log for replay (putting each prefix as separate trace to align)

    Parameters
    ----------
    prefixes_keys
        Keys of the prefixes (to form a log with a given order)
    activity_key
        Activity key (must be provided if different from concept:name)
    """
    fake_log = TraceLog()
    for prefix in prefixes_keys:
        trace = Trace()
        prefix_activities = prefix.split(",")
        for activity in prefix_activities:
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        fake_log.append(trace)
    return fake_log
def project_tracelog(log, allowed_activities, parameters=None):
    """
    Project a log on a given list of allowed (by the user) activities

    Parameters
    -------------
    log
        Trace log
    allowed_activities
        List of allowed activities
    parameters
        Possible parameters of the algorithm, including:
            PARAMETER_CONSTANT_ACTIVITY_KEY -> the activity name to use in the projection

    Returns
    ------------
    projected_log
        Projected trace log
    """
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY

    projected_log = TraceLog()

    for trace in log:
        projected_trace = Trace()
        for event in trace:
            if event[activity_key] in allowed_activities:
                projected_trace.append(deepcopy(event))
        if len(projected_trace) > 0:
            projected_log.append(projected_trace)

    return projected_log
def apply(trace_log, paths, parameters=None):
    """
    Apply a filter on traces containing / not containing a path

    Parameters
    -----------
    trace_log
        Trace log
    paths
        Paths that we are looking for (expressed as tuple of 2 strings)
    parameters
        Parameters of the algorithm, including:
            activity_key -> Attribute identifying the activity in the log
            positive -> Indicate if events should be kept/removed

    Returns
    -----------
    filtered_log
        Filtered trace log
    """
    if parameters is None:
        parameters = {}
    attribute_key = parameters[
        PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else xes.DEFAULT_NAME_KEY
    positive = parameters["positive"] if "positive" in parameters else True
    filtered_log = TraceLog()
    for trace in trace_log:
        found = False
        for i in range(len(trace) - 1):
            path = (trace[i][attribute_key], trace[i + 1][attribute_key])
            if path in paths:
                found = True
                break
        if (found and positive) or (not found and not positive):
            filtered_log.append(trace)
    return filtered_log
def filter_on_ncases(trace_log, max_no_cases=1000):
    """
    Get only a specified number of traces from a trace log

    Parameters
    -----------
    trace_log
        Trace log
    max_no_cases
        Desidered number of traces from the trace log

    Returns
    -----------
    filtered_log
        Filtered log
    """
    filtered_log = TraceLog(trace_log[:min(len(trace_log), max_no_cases)])
    return filtered_log
Exemple #16
0
def apply(log, parameters=None):
    """
    Returns a log from which a sound workflow net could be extracted taking into account
    a discovery algorithm returning models only with visible transitions

    Parameters
    ------------
    log
        Trace log
    parameters
        Possible parameters of the algorithm, including:
            discovery_algorithm -> Discovery algorithm to consider, possible choices: alphaclassic
            max_no_variants -> Maximum number of variants to consider to return a Petri net

    Returns
    ------------
    filtered_log
        Filtered trace log
    """
    if parameters is None:
        parameters = {}
    discovery_algorithm = parameters[
        "discovery_algorithm"] if "discovery_algorithm" in parameters else "alphaclassic"
    max_no_variants = parameters[
        "max_no_variants"] if "max_no_variants" in parameters else 20
    all_variants_dictio = variants_filter.get_variants(log,
                                                       parameters=parameters)
    all_variants_list = []
    for var in all_variants_dictio:
        all_variants_list.append([var, len(all_variants_dictio[var])])
    all_variants_list = sorted(all_variants_list,
                               key=lambda x: (x[1], x[0]),
                               reverse=True)
    considered_variants = []
    considered_traces = []

    i = 0
    while i < min(len(all_variants_list), max_no_variants):
        variant = all_variants_list[i][0]

        considered_variants.append(variant)
        considered_traces.append(all_variants_dictio[variant][0])
        filtered_log = TraceLog(considered_traces)
        net = None
        initial_marking = None
        final_marking = None
        if discovery_algorithm == "alphaclassic" or discovery_algorithm == "alpha":
            net, initial_marking, final_marking = alpha_miner.apply(
                filtered_log, parameters=parameters)
        is_sound = check_soundness.check_petri_wfnet_and_soundness(net)
        if not is_sound:
            del considered_variants[-1]
            del considered_traces[-1]
        else:
            try:
                alignments = alignment_factory.apply(filtered_log, net,
                                                     initial_marking,
                                                     final_marking)
                del alignments
                fitness = replay_fitness_factory.apply(filtered_log,
                                                       net,
                                                       initial_marking,
                                                       final_marking,
                                                       parameters=parameters)
                if fitness["log_fitness"] < 0.99999:
                    del considered_variants[-1]
                    del considered_traces[-1]
            except TypeError:
                del considered_variants[-1]
                del considered_traces[-1]
        i = i + 1

    sound_log = TraceLog()
    if considered_variants:
        sound_log = variants_filter.apply(log,
                                          considered_variants,
                                          parameters=parameters)

    return sound_log
def apply(df, parameters=None):
    """
    Returns a Pandas dataframe from which a sound workflow net could be extracted taking into account
    a discovery algorithm returning models only with visible transitions

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm, including:
            max_no_variants -> Maximum number of variants to consider to return a Petri net

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    if PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_CASEID_KEY] = CASE_CONCEPT_NAME
    if PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ACTIVITY_KEY] = DEFAULT_NAME_KEY
    if PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY] = DEFAULT_TIMESTAMP_KEY
    if PARAMETER_CONSTANT_ATTRIBUTE_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            PARAMETER_CONSTANT_ACTIVITY_KEY]

    caseid_glue = parameters[PARAMETER_CONSTANT_CASEID_KEY]
    activity_key = parameters[PARAMETER_CONSTANT_ACTIVITY_KEY]
    timest_key = parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY]

    max_no_variants = parameters[
        "max_no_variants"] if "max_no_variants" in parameters else 20

    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df

    variant_stats = case_statistics.get_variant_statistics(
        df, parameters=parameters)

    all_variants_list = []
    for var in variant_stats:
        all_variants_list.append([var["variant"], var[caseid_glue]])

    all_variants_list = sorted(all_variants_list,
                               key=lambda x: (x[1], x[0]),
                               reverse=True)

    considered_variants = []
    considered_traces = []

    i = 0
    while i < min(len(all_variants_list), max_no_variants):
        variant = all_variants_list[i][0]

        considered_variants.append(variant)

        filtered_df = variants_filter.apply(df,
                                            considered_variants,
                                            parameters=parameters)

        dfg_frequency = dfg_util.get_dfg_graph(filtered_df,
                                               measure="frequency",
                                               perf_aggregation_key="median",
                                               case_id_glue=caseid_glue,
                                               activity_key=activity_key,
                                               timestamp_key=timest_key)

        net, initial_marking, final_marking = alpha_miner.apply_dfg(
            dfg_frequency, parameters=parameters)

        is_sound = check_soundness.check_petri_wfnet_and_soundness(net)
        if not is_sound:
            del considered_variants[-1]
        else:
            traces_of_this_variant = variants_filter.apply(
                df, [variant], parameters=parameters).groupby(caseid_glue)
            traces_of_this_variant_keys = list(
                traces_of_this_variant.groups.keys())
            trace_of_this_variant = traces_of_this_variant.get_group(
                traces_of_this_variant_keys[0])

            this_trace = transform.transform_event_log_to_trace_log(
                pandas_df_imp.convert_dataframe_to_event_log(
                    trace_of_this_variant),
                case_glue=caseid_glue)[0]
            if not activity_key == DEFAULT_NAME_KEY:
                for j in range(len(this_trace)):
                    this_trace[j][DEFAULT_NAME_KEY] = this_trace[j][
                        activity_key]
            considered_traces.append(this_trace)
            filtered_log = TraceLog(considered_traces)

            try:
                alignments = alignment_factory.apply(filtered_log, net,
                                                     initial_marking,
                                                     final_marking)
                del alignments
                fitness = replay_fitness_factory.apply(filtered_log,
                                                       net,
                                                       initial_marking,
                                                       final_marking,
                                                       parameters=parameters)
                if fitness["log_fitness"] < 0.99999:
                    del considered_variants[-1]
                    del considered_traces[-1]
            except TypeError:
                del considered_variants[-1]
                del considered_traces[-1]

        i = i + 1

    return variants_filter.apply(df,
                                 considered_variants,
                                 parameters=parameters)