Example #1
0
def _split_attributes_by_value_type(attributes_values, max_diff_occ):
    """
    Split attributes into numeric and string ones, based on the type of
    one of their values

    Parameters
    ------------
    attributes_values
        Dictionary associating each attribute name to the set of its values
    max_diff_occ
        A string attribute is kept only if it has strictly fewer distinct
        values than this threshold

    Returns
    ------------
    numeric_attributes
        Names of the attributes whose inspected value is an int or a float
    string_attributes
        Names of the attributes whose inspected value is a str and that
        respect the max_diff_occ threshold
    """
    numeric_attributes = []
    string_attributes = []
    for attr, values in attributes_values.items():
        if not values:
            # No value available, so there is no type to decide upon
            continue
        # A single (arbitrary) value decides the type of the attribute.
        # The exact type is compared on purpose: bool values are not
        # considered numeric.
        value_type = type(next(iter(values)))
        if value_type is int or value_type is float:
            numeric_attributes.append(attr)
        elif value_type is str and len(values) < max_diff_occ:
            string_attributes.append(attr)
    return numeric_attributes, string_attributes


def select_attributes_from_log_for_tree(
        log: EventLog,
        max_cases_for_attr_selection=DEFAULT_MAX_CASES_FOR_ATTR_SELECTION,
        max_diff_occ=DEFAULT_MAX_CASES_FOR_ATTR_SELECTION / 4):
    """
    Select attributes from log for tree

    The attribute names are discovered on (a sample of) the log, then each
    attribute is classified as numeric or string by looking at the type of
    one of its values. Attributes of any other type are discarded.

    Parameters
    ------------
    log
        Log
    max_cases_for_attr_selection
        Maximum number of cases to consider for attribute selection
        (a larger log is sampled down to this number of cases)
    max_diff_occ
        Maximum number of different occurrences: a string attribute is
        selected only if it has strictly fewer distinct values

    Returns
    ------------
    string_trace_attributes_to_consider
        String trace attributes, filtered by check_trace_attributes_presence
    string_event_attributes_to_consider
        String event attributes, filtered by check_event_attributes_presence
    numeric_trace_attributes_to_consider
        Numeric trace attributes, filtered by check_trace_attributes_presence
    numeric_event_attributes_to_consider
        Numeric event attributes, filtered by check_event_attributes_presence
    """
    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG)

    if len(log) > max_cases_for_attr_selection:
        filtered_log = sampling.sample(log, max_cases_for_attr_selection)
    else:
        filtered_log = log

    # Attribute names come from the (possibly sampled) log
    event_attributes = get_all_event_attributes_from_log(filtered_log)
    trace_attributes = get_all_trace_attributes_from_log(filtered_log)

    # NOTE(review): the values are collected on the full log, not on the
    # sample - confirm this is intended before switching to filtered_log.
    event_attributes_values = {
        attr: set(get_attribute_values(log, attr).keys())
        for attr in event_attributes
    }
    trace_attributes_values = {
        attr: set(get_trace_attribute_values(log, attr).keys())
        for attr in trace_attributes
    }

    (numeric_event_attributes_to_consider,
     string_event_attributes_to_consider) = _split_attributes_by_value_type(
        event_attributes_values, max_diff_occ)
    (numeric_trace_attributes_to_consider,
     string_trace_attributes_to_consider) = _split_attributes_by_value_type(
        trace_attributes_values, max_diff_occ)

    numeric_event_attributes_to_consider = check_event_attributes_presence(
        log, numeric_event_attributes_to_consider)
    string_event_attributes_to_consider = check_event_attributes_presence(
        log, string_event_attributes_to_consider)
    numeric_trace_attributes_to_consider = check_trace_attributes_presence(
        log, numeric_trace_attributes_to_consider)
    string_trace_attributes_to_consider = check_trace_attributes_presence(
        log, string_trace_attributes_to_consider)

    return (string_trace_attributes_to_consider,
            string_event_attributes_to_consider,
            numeric_trace_attributes_to_consider,
            numeric_event_attributes_to_consider)
Example #2
0
def apply(log: Union[EventLog, EventStream, pd.DataFrame],
          trace_attribute: str,
          variant=VARIANT_DMM_LEVEN,
          parameters: Optional[Dict[Any, Any]] = None) -> Any:
    """
    Apply the hierarchical clustering to a log starting from a trace attribute.

    The log is split in one sublog per value of the trace attribute, the
    chosen variant computes the distances between the sublogs, and an
    average-linkage hierarchical clustering is built on top of them.

    MSc Thesis is available at: https://www.pads.rwth-aachen.de/global/show_document.asp?id=aaaaaaaaalpxgft&download=1
    Defense slides are available at: https://www.pads.rwth-aachen.de/global/show_document.asp?id=aaaaaaaaalpxgqx&download=1

    Parameters
    ----------------
    log
        Log
    trace_attribute
        Trace attribute to exploit for the clustering
    variant
        Variant of the algorithm to apply, possible values:
        - Variants.VARIANT_DMM_LEVEN (that is the default)
        - Variants.VARIANT_AVG_LEVEN
        - Variants.VARIANT_DMM_VEC
        - Variants.VARIANT_AVG_VEC
        - Variants.DFG
    parameters
        Parameters forwarded to the log converter (an empty dictionary is
        used when None is provided)

    Returns
    -----------------
    tree
        Hierarchical cluster tree (a dictionary whose "name" is 'root')
    leafname
        Value returned by merge_log.label_tree on the tree
        (described as the root node - TODO confirm against merge_log)
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log,
                              variant=log_converter.Variants.TO_EVENT_LOG,
                              parameters=parameters)

    # Fixed settings passed to the variant together with the sublogs
    # NOTE(review): their meaning is defined by the variant implementation
    percent = 1
    alpha = 0.5

    # Distinct values of the trace attribute; each one identifies a sublog
    list_of_vals = list(attributes_filter.get_trace_attribute_values(
        log, trace_attribute).keys())
    list_log = [merge_log.log2sublog(log, val, trace_attribute)
                for val in list_of_vals]

    y = exec_utils.get_variant(variant)(list_log, percent, alpha)

    Z = linkage(y, method='average')

    # Create dictionary for labeling nodes by their IDs
    # (the ID of a leaf is the position of its value in list_of_vals)
    id2name = dict(enumerate(list_of_vals))

    T = to_tree(Z, rd=False)
    d3Dendro = dict(children=[], name="Root1")
    merge_log.add_node(T, d3Dendro)

    # The temporary "Root1" wrapper is discarded: its only child is the
    # actual root of the cluster tree
    leafname = merge_log.label_tree(d3Dendro["children"][0], id2name)
    tree = d3Dendro["children"][0]
    tree["name"] = 'root'

    # NOTE(review): the result of bfs is not used by the returned values.
    # The call and the assignment are kept because bfs is defined outside
    # this block and might share state with the tree - confirm and remove.
    trilist = bfs(tree)
    trilist[0][0] = trilist[0][1] + '-' + trilist[0][2]

    return tree, leafname