Esempio n. 1
0
def dfg_dist_calc_minkowski(log1, log2, alpha):
    """
    Blend the activity and DFG Minkowski (p=2) distances between two logs.

    Each log is summarised twice: by the values of its ``concept:name``
    attribute and by the output of ``dfg_algorithm.apply``. Both summaries
    are passed through ``act_dist_calc.occu_var_act``; the frames of the two
    logs are outer-joined on ``var`` (entries missing on one side count as
    0), each frequency column is divided by its own sum, and the two
    normalised vectors are compared with ``pdist(..., 'minkowski', p=2.)``.

    :param log1: first event log
    :param log2: second event log
    :param alpha: weight of the activity distance; the DFG distance is
        weighted with ``1 - alpha``
    :return: ``activity_distance * alpha + dfg_distance * (1 - alpha)``
    """

    def _normalised_distance(left, right):
        # The outer join keeps entries that occur in only one log; their
        # missing frequency becomes 0.
        joined = pd.merge(left, right, how='outer', on='var').fillna(0)
        freq_left = joined['freq_x'].values
        freq_right = joined['freq_y'].values
        rows = np.array([
            freq_left / np.sum(freq_left),
            freq_right / np.sum(freq_right),
        ])
        return pdist(rows, 'minkowski', p=2.)[0]

    activities_1 = attributes_filter.get_attribute_values(log1, "concept:name")
    activities_2 = attributes_filter.get_attribute_values(log2, "concept:name")
    graph_1 = dfg_algorithm.apply(log1)
    graph_2 = dfg_algorithm.apply(log2)

    act_frame_1 = act_dist_calc.occu_var_act(activities_1)
    act_frame_2 = act_dist_calc.occu_var_act(activities_2)
    dfg_frame_1 = act_dist_calc.occu_var_act(graph_1)
    dfg_frame_2 = act_dist_calc.occu_var_act(graph_2)

    activity_distance = _normalised_distance(act_frame_1, act_frame_2)
    dfg_distance = _normalised_distance(dfg_frame_1, dfg_frame_2)
    return activity_distance * alpha + dfg_distance * (1 - alpha)
Esempio n. 2
0
def dfg_dist_calc_act(log1, log2):
    """
    Cosine distance between the activity frequency profiles of two logs.

    The ``concept:name`` attribute values of each log are turned into a
    frame by ``act_dist_calc.occu_var_act``; the two frames are outer-joined
    on ``var`` with missing frequencies set to 0, and the two frequency
    columns are compared with ``pdist(..., 'cosine')``.

    :param log1: first event log
    :param log2: second event log
    :return: the single pairwise cosine distance reported by ``pdist``
    """
    attribute_values = [
        attributes_filter.get_attribute_values(log, "concept:name")
        for log in (log1, log2)
    ]
    first_frame, second_frame = [
        act_dist_calc.occu_var_act(values) for values in attribute_values
    ]

    joined = pd.merge(first_frame, second_frame, how='outer', on='var')
    joined = joined.fillna(0)

    profiles = np.array([joined['freq_x'].values, joined['freq_y'].values])
    return pdist(profiles, 'cosine')[0]
Esempio n. 3
0
def dfg_dist_calc_suc(log1, log2):
    """
    Cosine distance between the DFG frequency profiles of two logs.

    ``dfg_algorithm.apply`` is run on each log (presumably yielding its
    directly-follows graph -- confirm against the helper) and the result is
    turned into a frame by ``act_dist_calc.occu_var_act``. The two frames
    are outer-joined on ``var`` with missing frequencies set to 0, and the
    two frequency columns are compared with ``pdist(..., 'cosine')``.

    :param log1: first event log
    :param log2: second event log
    :return: the single pairwise cosine distance reported by ``pdist``
    """
    graphs = [dfg_algorithm.apply(log) for log in (log1, log2)]
    first_frame, second_frame = [
        act_dist_calc.occu_var_act(graph) for graph in graphs
    ]

    joined = pd.merge(first_frame, second_frame, how='outer', on='var')
    joined = joined.fillna(0)

    profiles = np.array([joined['freq_x'].values, joined['freq_y'].values])
    return pdist(profiles, 'cosine')[0]
Esempio n. 4
0
def dfg_dist_calc(log1, log2):
    """
    Compute the activity and the DFG cosine distances between two logs.

    For each log, the ``concept:name`` attribute values and the output of
    ``dfg_algorithm.apply`` are each turned into a frame by
    ``act_dist_calc.occu_var_act``. Matching frames of the two logs are
    outer-joined on ``var`` (missing frequencies become 0) and their
    frequency columns are compared with ``pdist(..., 'cosine')``.

    :param log1: first event log
    :param log2: second event log
    :return: tuple ``(dist_act, dist_dfg)``; a NaN DFG distance is replaced
        by 1, the activity distance is returned as computed
    """

    def _cosine_distance(left, right):
        joined = pd.merge(left, right, how='outer', on='var').fillna(0)
        profiles = np.array([joined['freq_x'].values,
                             joined['freq_y'].values])
        return pdist(profiles, 'cosine')[0]

    activities_1 = attributes_filter.get_attribute_values(log1, "concept:name")
    activities_2 = attributes_filter.get_attribute_values(log2, "concept:name")
    graph_1 = dfg_algorithm.apply(log1)
    graph_2 = dfg_algorithm.apply(log2)

    act_frame_1 = act_dist_calc.occu_var_act(activities_1)
    act_frame_2 = act_dist_calc.occu_var_act(activities_2)
    dfg_frame_1 = act_dist_calc.occu_var_act(graph_1)
    dfg_frame_2 = act_dist_calc.occu_var_act(graph_2)

    activity_distance = _cosine_distance(act_frame_1, act_frame_2)
    graph_distance = _cosine_distance(dfg_frame_1, dfg_frame_2)

    # Only the DFG distance gets a NaN fallback; the activity distance is
    # passed through untouched.
    if np.isnan(graph_distance):
        graph_distance = 1
    return activity_distance, graph_distance
Esempio n. 5
0
def dist_calc(var_list_1, var_list_2, log1, log2, freq_thres, num, alpha, parameters=None):
    """
    Compute the distance between two sublogs from their lists of variants.

    Every variant of the longer list is compared with variants of the
    shorter list on two profiles: the frame from
    ``act_dist_calc.occu_var_act`` (joined on ``var``) and the binarized
    frame from ``suc_dist_calc.occu_var_suc`` (joined on ``direct_suc``).
    Both comparisons use the cosine distance, and each pairwise distance is
    weighted by the product of the two variants' ``count`` values taken from
    ``filter_subsets.sublog2df``.

    * With ``Parameters.SINGLE`` set, each variant of the longer list
      contributes only its closest counterpart (chosen separately for the
      two profiles). The search stops early at the first counterpart whose
      two distances are both numerically zero.
    * Otherwise every pair contributes.

    :param var_list_1: list of variants in sublog 1
    :param var_list_2: list of variants in sublog 2
    :param log1: sublog passed to ``sublog2df()``; must correspond to var_list_1
    :param log2: sublog passed to ``sublog2df()``; must correspond to var_list_2
    :param freq_thres: forwarded to ``sublog2df()``
    :param num: forwarded to ``sublog2df()``
    :param alpha: weight of the activity distance; the succession distance
        is weighted with ``1 - alpha``
    :param parameters: optional dict; ``Parameters.SINGLE`` selects the
        single-linkage variant (default ``False``)
    :return: the weighted distance between the two sublogs; ``0.0`` when the
        two variant lists are equal
    """
    if var_list_1 == var_list_2:
        # Identical inputs have distance zero. Returning here keeps the
        # single-linkage branch from dividing its all-zero accumulators
        # (0/0 -> NaN) and matches the result of the non-single branch.
        print("Please give different variant lists!")
        return 0.0

    if parameters is None:
        parameters = {}

    single = exec_utils.get_param_value(Parameters.SINGLE, parameters, False)

    # The longer list drives the outer loop, the shorter one the inner loop.
    if len(var_list_1) >= len(var_list_2):
        max_var, min_var = var_list_1, var_list_2
        max_log, min_log = log1, log2
    else:
        max_var, min_var = var_list_2, var_list_1
        max_log, min_log = log2, log1
    max_len = len(max_var)
    min_len = len(min_var)
    var_count_max = filter_subsets.sublog2df(max_log, freq_thres, num)['count']
    var_count_min = filter_subsets.sublog2df(min_log, freq_thres, num)['count']

    # act
    max_per_var_act = np.zeros(max_len)
    max_freq_act = np.zeros(max_len)
    col_sum_act = np.zeros(max_len)

    # suc
    max_per_var_suc = np.zeros(max_len)
    col_sum_suc = np.zeros(max_len)
    max_freq_suc = np.zeros(max_len)

    # The frames of the shorter list do not depend on the outer loop, so
    # they are built once instead of once per outer iteration.
    # NOTE(review): assumes occu_var_act / occu_var_suc return the same
    # frame for the same input and have no side effects -- confirm.
    min_frames_act = [
        act_dist_calc.occu_var_act(min_var[j]) for j in range(min_len)
    ]
    min_frames_suc = [
        suc_dist_calc.occu_var_suc(min_var[j], parameters={"binarize": True})
        for j in range(min_len)
    ]

    for i in range(max_len):
        dist_vec_act = np.zeros(min_len)
        dist_vec_suc = np.zeros(min_len)
        df_1_act = act_dist_calc.occu_var_act(max_var[i])
        df_1_suc = suc_dist_calc.occu_var_suc(max_var[i], parameters={"binarize": True})
        for j in range(min_len):
            # Outer joins keep entries present in only one variant; their
            # missing frequency becomes 0.
            df_act = pd.merge(df_1_act, min_frames_act[j], how='outer', on='var').fillna(0)
            df_suc = pd.merge(df_1_suc, min_frames_suc[j], how='outer', on='direct_suc').fillna(0)

            dist_vec_act[j] = pdist(np.array([df_act['freq_x'].values, df_act['freq_y'].values]), 'cosine')[0]
            dist_vec_suc[j] = pdist(np.array([df_suc['freq_x'].values, df_suc['freq_y'].values]), 'cosine')[0]

            if single:
                if abs(dist_vec_act[j]) <= 1e-8 and abs(dist_vec_suc[j]) <= 1e-6:
                    # Both distances are numerically zero: nothing can be
                    # closer, so use this pair and stop searching.
                    max_freq_act[i] = var_count_max.iloc[i] * var_count_min.iloc[j]
                    max_freq_suc[i] = max_freq_act[i]
                    max_per_var_act[i] = dist_vec_act[j] * max_freq_act[i]
                    max_per_var_suc[i] = dist_vec_suc[j] * max_freq_suc[i]
                    break
                elif j == (min_len - 1):
                    # All counterparts seen without an exact match: take
                    # the minimum-distance one for each profile.
                    closest_act = np.argmin(dist_vec_act)
                    closest_suc = np.argmin(dist_vec_suc)
                    max_freq_act[i] = var_count_max.iloc[i] * var_count_min.iloc[closest_act]
                    max_freq_suc[i] = var_count_max.iloc[i] * var_count_min.iloc[closest_suc]
                    max_per_var_act[i] = dist_vec_act[closest_act] * max_freq_act[i]
                    max_per_var_suc[i] = dist_vec_suc[closest_suc] * max_freq_suc[i]
            else:
                pair_weight = var_count_max.iloc[i] * var_count_min.iloc[j]
                col_sum_act[i] += dist_vec_act[j] * pair_weight
                col_sum_suc[i] += dist_vec_suc[j] * pair_weight

    if single:
        # single linkage: weighted mean of the per-variant closest distances
        dist_act = np.sum(max_per_var_act) / np.sum(max_freq_act)
        dist_suc = np.sum(max_per_var_suc) / np.sum(max_freq_suc)
        dist = dist_act * alpha + dist_suc * (1 - alpha)
    else:
        # Normalise by the sum of all pairwise count products.
        vmax_vec = (var_count_max.values).reshape(-1, 1)
        vmin_vec = (var_count_min.values).reshape(1, -1)
        vec_sum = np.sum(np.dot(vmax_vec, vmin_vec))
        dist = (np.sum(col_sum_act) * alpha + np.sum(col_sum_suc) * (1 - alpha)) / vec_sum

    return dist
Esempio n. 6
0
def slice_dist_act(log_1, log_2, unit, parameters=None):
    """
    Compute a weighted activity distance between the slices of two logs.

    Both logs are split by ``filter_subsets.logslice_percent(log, unit)``,
    which yields a list of slices and a list of matching frequencies. The
    ``concept:name`` profile of every slice of the longer list is compared
    (cosine distance) with every slice of the shorter list.

    * First pass: each slice of the longer list is paired with its closest
      slice of the shorter list, weighted by the product of the two
      frequencies. An exact match (distance <= 1e-8) gets double weight and
      marks the matched shorter-list slice as covered -- presumably because
      the match serves both directions; confirm with the original author.
    * Second pass: every shorter-list slice not covered by an exact match is
      paired with its closest slice of the longer list.

    The result is the weighted mean of all selected distances.

    :param log_1: first event log
    :param log_2: second event log
    :param unit: forwarded to ``logslice_percent()``
    :param parameters: not used by this function; kept in the signature
    :return: the weighted distance; ``0`` when both slice lists are equal
    """
    (log1_list, freq1_list) = filter_subsets.logslice_percent(log_1, unit)
    (log2_list, freq2_list) = filter_subsets.logslice_percent(log_2, unit)

    if log1_list == log2_list:
        print("Please give different variant lists!")
        return 0

    # The longer list drives the rows of the distance matrix.
    if len(freq1_list) >= len(freq2_list):
        max_log, min_log = log1_list, log2_list
        var_count_max, var_count_min = freq1_list, freq2_list
    else:
        max_log, min_log = log2_list, log1_list
        var_count_max, var_count_min = freq2_list, freq1_list
    max_len = len(var_count_max)
    min_len = len(var_count_min)

    dist_matrix = np.zeros((max_len, min_len))
    max_per_var = np.zeros(max_len)
    max_freq = np.zeros(max_len)
    min_freq = np.zeros(min_len)
    min_per_var = np.zeros(min_len)
    # Columns (shorter-list slices) that still need a partner in pass two.
    unmatched_cols = set(range(min_len))

    # The profiles of the shorter list do not depend on the outer loop, so
    # they are built once instead of once per row.
    # NOTE(review): assumes get_attribute_values / occu_var_act return the
    # same result for the same input and have no side effects -- confirm.
    min_frames = [
        act_dist_calc.occu_var_act(
            attributes_filter.get_attribute_values(min_log[j], "concept:name"))
        for j in range(min_len)
    ]

    for i in range(max_len):
        act1 = attributes_filter.get_attribute_values(
            max_log[i], "concept:name")
        df1_act = act_dist_calc.occu_var_act(act1)
        for j in range(min_len):
            # Outer join keeps activities seen in only one slice; their
            # missing frequency becomes 0.
            df_act = pd.merge(df1_act, min_frames[j], how='outer',
                              on='var').fillna(0)
            dist_matrix[i, j] = pdist(
                np.array([df_act['freq_x'].values, df_act['freq_y'].values]),
                'cosine')[0]

        if min_len == 0:
            # Nothing to pair with; the row keeps its zero weight.
            continue

        nearest_col = np.argmin(dist_matrix[i])
        nearest_dist = dist_matrix[i, nearest_col]
        pair_weight = var_count_max[i] * var_count_min[nearest_col]
        if abs(nearest_dist) <= 1e-8:
            # Exact match: double weight, and the column is covered.
            unmatched_cols.discard(nearest_col)
            pair_weight = pair_weight * 2
        max_freq[i] = pair_weight
        # The numerator must use the same weight as the denominator; the
        # doubling is already contained in pair_weight.
        max_per_var[i] = nearest_dist * pair_weight

    for col in unmatched_cols:
        nearest_row = np.argmin(dist_matrix[:, col])
        min_freq[col] = var_count_max[nearest_row] * var_count_min[col]
        min_per_var[col] = dist_matrix[nearest_row, col] * min_freq[col]

    dist = (np.sum(max_per_var) +
            np.sum(min_per_var)) / (np.sum(max_freq) + np.sum(min_freq))

    return dist