Example #1
import datetime
from copy import deepcopy
from itertools import cycle, islice

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import (adjusted_rand_score, completeness_score,
                             homogeneity_score, v_measure_score)

# Project-local helpers used throughout these examples (clustering_method_call,
# filter_sim_map, dict_to_mat, sim_dist_convert, average_dist_SI,
# compute_new_dist_SI, HAC_average, and the dataset/clustering functions) are
# assumed to be importable from the surrounding repository.

def compare_multi(all_methods, all_datasets, output_name, measure=adjusted_rand_score):
    """Shows the results of multiple methods on multiple datasets in a grid.
    
    This is an adaptation of a snippet from scikit-learn library:
    https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html

    Args:
        all_methods: A list of tuples each of which corresponds to a clustering method.
            Each tuple contains 3 elements, a name that will be shown in the plots for the
            method, the function object for the clustering method, and a dictionary that
            will be passed the the clustering method as the hyper-parameters.
        all_datasets: A list of tuples each of which corresponds to a dataset function.
            Each tuple contains 3 elements, the function object for the dataset, a dictionary
            which will be passed as the hyper-parameters for the dataset function, and a 
            dictionary that contains the hyper-parameters for the clustering function (this
            will override the pre-defined values). All the data generated by the dataset 
            functions have to be 2d, otherwise the first two dimentions will be used.
        output_name: the name of the file (containing the plot) that will be saved.
            The posfix of the image file has to be defined (e.g. .png).

        Example:
            all_methods = [('K_means', K_means, {'required_format': 'data_vectors'})] 
            all_datasets = [(noisy_circle, {}, {'input_format': 'data_vectors'})]
    """
    plt.figure(figsize=(len(all_methods)*2, len(all_datasets)*2))
    plt.subplots_adjust(left=.02, right=.98,
                        bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plot_num = 1

    for i_dataset, dataset_tuple in enumerate(all_datasets):
        dataset_method = dataset_tuple[0]
        dataset_params = dataset_tuple[1]
        dataset_clustering_method_overriding_params = dataset_tuple[2]

        X, y = dataset_method(dataset_params)

        prev_coords = None

        for name, algorithm, hyper_params in all_methods:
            # Work on a copy so per-dataset overrides do not leak into the
            # shared hyper-parameter dicts of all_methods across datasets.
            run_params = dict(hyper_params)
            run_params.update(dataset_clustering_method_overriding_params)
            run_params.update({'y': y, 'coords_given': prev_coords})

            output_dict = clustering_method_call(X, algorithm,
                                        hyper_params=run_params)
            coords = output_dict['coords']
            if coords is not None:
                prev_coords = coords
            else:
                coords = prev_coords

            y_pred = output_dict['y_pred']

            plt.subplot(len(all_datasets), len(all_methods), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                                 '#f781bf', '#a65628', '#984ea3',
                                                 '#999999', '#e41a1c', '#dede00',
                                                 '#BB8FCE', '#F7DC6F', '#117A65',
                                                 '#CA6F1E', '#979A9A', '#34495E',
                                                 '#9A7D0A', '#0B5345', '#641E16']),
                                          int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])

            # If the points reside in a space with more than 2 dimensions, PCA is used to project them to 2-D
            if coords.shape[1] > 2:
                pca = PCA(n_components=2)
                coords_2d = pca.fit_transform(coords)
            else:
                coords_2d = coords

            plt.scatter(coords_2d[:, 0], coords_2d[:, 1], s=10, color=colors[y_pred])

            plt.xticks(())
            plt.yticks(())
            
            if len(np.unique(y)) > 1:
                ari_score = measure(y, y_pred)
                plt.text(.99, .01, ('%.2f' % ari_score).lstrip('0'),
                         transform=plt.gca().transAxes, size=15,
                         horizontalalignment='right')

            plot_num += 1
    plt.savefig(output_name, dpi=400)
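
A minimal usage sketch for compare_multi, adapted directly from the docstring example above; K_means, noisy_circle, and the hyper-parameter keys are project-local names taken from that example, and the output file name is arbitrary.

all_methods = [('K_means', K_means, {'required_format': 'data_vectors'})]
all_datasets = [(noisy_circle, {}, {'input_format': 'data_vectors'})]
compare_multi(all_methods, all_datasets, 'method_comparison.png',
              measure=adjusted_rand_score)
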
Example #2
def compare_timestamps(entitys, output_name,
                            fixed_window_size=None,
                            update_clusters=False,
                            batch_portion=0.5,
                            incremental_dist_threshold=0.5,
                            batch_dist_threshold=0.9):
    """For each entity, slides a daily batch-window start across the timeline, clusters
    the sessions inside the window both incrementally and with batch HAC, and plots the
    V-measure agreement between the two, averaged over all entities."""
    all_v_result = []
    all_inc_c = []
    all_inc_h = []
    sessions_count = []
    
    print(entitys.keys())
    for p_id in entitys:
        v_result = []
        x = []
        single_data = entitys[p_id]
        sim_map = single_data['similarity_graph']
        labels = single_data['id_to_cluster_map']
        print('number of sessions: ', len(labels))
        sessions_count.append(len(labels))
        dist_vals = np.array(list(sim_map.values()))
        y = [labels[key] for key in sorted(labels.keys())]

        time_stamps_orig = single_data['time_stamps']
        try:
            time_stamps = [np.datetime64(datetime.datetime.utcfromtimestamp(elem)) for elem in time_stamps_orig]
        except Exception:
            time_stamps = [np.datetime64(elem) for elem in time_stamps_orig]
        
        # np.timedelta64(86400000000, 'us') is one day expressed in microseconds.
        period = 1 + int((time_stamps[-1] - time_stamps[0]) / np.timedelta64(86400000000, 'us'))
        batch_period = np.timedelta64(int(np.ceil(period*batch_portion))* 86400000000, 'us')

        for start_date in range(int(np.ceil(period*batch_portion))):
            first_day = time_stamps[0] + np.timedelta64(start_date*86400000000, 'us')
            first_idx = 0
            last_idx = len(time_stamps)
            for i in range(len(time_stamps)):
                if time_stamps[i] < first_day:
                    first_idx += 1
                if time_stamps[i] > first_day + batch_period:
                    last_idx = i  # first session after the batch window
                    break
            
            sim_map_tmp = filter_sim_map(sim_map, last_idx-1)
            sim_map_new = filter_sim_map(sim_map_tmp, first_idx, min_case=True)

            inc_params = {'dist_threshold': incremental_dist_threshold,
                        'data_size': last_idx-first_idx,
                        'update_clusters': update_clusters,
                        'window_size': None}

            y_pred_inc = incremental_average_SI(sim_map_new, hyper_params=inc_params)

            batch_params = {'input_format': 'similarity_dict',
                    'required_format': 'dist_mat',
                    'distance_threshold': batch_dist_threshold,
                    'n_clusters': None,
                    'data_size': last_idx-first_idx}
            output_dict = clustering_method_call(sim_map_new, 
                                        HAC_average, 
                                        batch_params)
    
            y_pred_batch = output_dict['y_pred']

            v_result.append(v_measure_score(y_pred_batch, y_pred_inc))
            x.append(start_date)

        all_v_result.append(v_result)

    inc_result = np.average(all_v_result, axis=0)
    plt.plot(x, inc_result)
    plt.savefig(output_name + '.png')
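
A hypothetical call to compare_timestamps. The shape of entitys (an entity id mapped to a dict with 'similarity_graph', 'id_to_cluster_map', and 'time_stamps') follows how the function indexes it; the (i, j) pair keys of the similarity graph and the epoch-second timestamps are assumptions, not confirmed by this snippet.

# Six sessions, one per day: sessions 0-2 are mutually similar, as are 3-5.
toy_entity = {
    'similarity_graph': {(i, j): (1.0 if (i < 3) == (j < 3) else 0.1)
                         for i in range(6) for j in range(i + 1, 6)},
    'id_to_cluster_map': {i: (0 if i < 3 else 1) for i in range(6)},
    'time_stamps': [1609459200 + i * 86400 for i in range(6)],
}
compare_timestamps({'p0': toy_entity}, 'timestamp_comparison',
                   batch_portion=0.5,
                   incremental_dist_threshold=0.5)
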
def warm_start_inc_avg_SI(sim_map, hyper_params={}):
    """Clusters the first batch_portion of the data with batch HAC, then uses the
    resulting clusters to warm-start incremental clustering over the full data.
    Returns the incremental labels and the batch labels."""
    params = {'batch_portion': 0.5,
              'data_size': None,
              'dist_threshold': 0.1,
              'batch_threshold': 0.9,
              'time_stamps': None,
              'window_size': 15,
              'batch_params': {},
              'inc_params': {}}
    params.update(hyper_params)

    dist_threshold = params['dist_threshold']
    batch_threshold = params['batch_threshold']
    all_data_size = params['data_size']
    window_size = params['window_size']
    time_stamps_orig = params['time_stamps']
    batch_portion = params['batch_portion']
    if time_stamps_orig is not None:
        try:
            time_stamps = [np.datetime64(datetime.datetime.utcfromtimestamp(elem)) for elem in time_stamps_orig]
        except Exception:
            time_stamps = [np.datetime64(elem) for elem in time_stamps_orig]

        period = 1 + int((time_stamps[-1] - time_stamps[0]) / np.timedelta64(86400000000, 'us'))
        batch_period = np.timedelta64(int(np.ceil(period*batch_portion))* 86400000000, 'us')
        batch_size = 0
        for i in range(len(time_stamps)):
            if time_stamps[i] < time_stamps[0] + batch_period:
                batch_size += 1
            else:
                break
    else:
        time_stamps = time_stamps_orig
        batch_size = int(np.ceil(all_data_size * batch_portion))

    batch_sim_map = filter_sim_map(sim_map, batch_size-1)

    batch_params = {'input_format': 'similarity_dict',
                    'required_format': 'dist_mat',
                    'distance_threshold': batch_threshold,
                    'n_clusters': None,
                    'data_size': batch_size,
                    # 'top_eigenvals': 2,
                    # 'embedding_method': get_coords_mds_stress,
                    # 'embedding_hyper_params': {'eps': 0.0000001,
                    #                             'max_iter': 1000}
                    }
    batch_params.update(params['batch_params'])

    output_dict = clustering_method_call(batch_sim_map, 
                                        HAC_average, 
                                        batch_params)
    
    y_pred_batch = output_dict['y_pred']
    clusters_initial_tmp = output_dict['cluster_to_id_map'].values()
    clusters_initial = [set(cluster) for cluster in clusters_initial_tmp]

    inc_params = {'dist_threshold': dist_threshold,
                  'update_clusters': False,
                  'clusters': clusters_initial,
                  'data_size': all_data_size,
                  'time_stamps': time_stamps,
                  'window_size': window_size}
    inc_params.update(params['inc_params'])

    y_label_inc = incremental_average_SI(sim_map, hyper_params=inc_params)
    return y_label_inc, y_pred_batch
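
A sketch of calling warm_start_inc_avg_SI directly on a toy similarity dict, assuming the same (i, j) key format as above; filter_sim_map, HAC_average, and incremental_average_SI must be importable from the project for this to run.

toy_sim_map = {(i, j): (1.0 if (i < 3) == (j < 3) else 0.1)
               for i in range(6) for j in range(i + 1, 6)}
y_inc, y_batch = warm_start_inc_avg_SI(toy_sim_map,
                                       hyper_params={'data_size': 6,
                                                     'batch_portion': 0.5,
                                                     'dist_threshold': 0.5,
                                                     'batch_threshold': 0.9})
# y_batch covers only the first half of the sessions; y_inc labels all of them.
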
Example #4
def compare_incremental_multi(entitys, output_name,
                            warm_start=False,
                            weighted=False,
                            time_stamps_flag=False,
                            fixed_window_size=None,
                            batch_portion=0.5,
                            update_clusters=False,
                            second_half_batch=False,
                            linkage='centroid',
                            incremental_no_first_half=False,
                            batch_dist_threshold=0.9):
    """Sweeps the incremental distance threshold and, for each entity, scores the
    incremental clustering of the second half of the sessions against the ground-truth
    labels (or against a batch HAC clustering of the same sessions when
    second_half_batch is set). Plots the averaged V-measure with error bars."""
    all_inc_result = []
    all_inc_c = []
    all_inc_h = []
    sessions_count = []
    best_scores = []  # best V-measure per entity, for the histogram at the end

    for p_id in range(len(entitys)):
        single_data = entitys[p_id]
        sim_map = single_data['similarity_graph']
        labels = single_data['id_to_cluster_map']
        print('number of sessions: ', len(labels))
        sessions_count.append(len(labels))
        dist_vals = np.array(list(sim_map.values()))
        y = [labels[key] for key in sorted(labels.keys())]

        if time_stamps_flag:
            time_stamps_orig = single_data['time_stamps']
            try:
                time_stamps = [np.datetime64(datetime.datetime.utcfromtimestamp(elem)) for elem in time_stamps_orig]
            except Exception:
                time_stamps = [np.datetime64(elem) for elem in time_stamps_orig]
            
            period = 1 + int((time_stamps[-1] - time_stamps[0]) / np.timedelta64(86400000000, 'us'))
            batch_period = np.timedelta64(int(np.ceil(period*batch_portion))* 86400000000, 'us')

            batch_size = 0
            for i in range(len(time_stamps)):
                if time_stamps[i] < time_stamps[0] + batch_period:
                    batch_size += 1
                else:
                    break

        inc_result = []
        inc_c = []
        inc_h = []
        x = []
        for dist in np.arange(0.00, 1.01, 0.05):
            if not warm_start:
                if time_stamps_flag:
                    inc_params = {'dist_threshold': dist,
                                  'data_size': len(labels),
                                  'time_stamps': time_stamps,
                                  'window_size': fixed_window_size,
                                  'linkage': linkage,
                                  'update_clusters': update_clusters}

                else:
                    inc_params = {'dist_threshold': dist,
                                  'data_size': len(labels),
                                  'linkage': linkage,
                                  'update_clusters': update_clusters}
                    # batch_size = len(y)//2
                    batch_size = int(np.ceil(len(y)*batch_portion))

                if incremental_no_first_half:
                    inc_params.update({'data_size': len(labels)-batch_size})
                    new_sim_map = filter_sim_map(sim_map, batch_size, min_case=True)
                    y_label_inc = incremental_average_SI(new_sim_map, hyper_params=inc_params)
                else:
                    y_label_inc = incremental_average_SI(sim_map, hyper_params=inc_params)
                    y_label_inc = y_label_inc[batch_size:]

            else:
                if time_stamps_flag:
                    warm_start_params = {'dist_threshold': dist,
                                        'data_size': len(labels),
                                        'batch_portion': batch_portion,
                                        'time_stamps': time_stamps,
                                        'window_size': fixed_window_size,
                                        'inc_params': {'update_clusters': update_clusters, 'linkage': linkage},
                                        'batch_params': {'distance_threshold': batch_dist_threshold}}
                else:
                    warm_start_params = {'dist_threshold': dist,
                                        'data_size': len(labels),
                                        'batch_portion': batch_portion,
                                        'inc_params': {'update_clusters': update_clusters, 'linkage': linkage},
                                        'batch_params': {'distance_threshold': batch_dist_threshold}}
                y_label_inc, y_pred_batch = warm_start_inc_avg_SI(sim_map, hyper_params=warm_start_params)
                batch_size = len(y_pred_batch)
                y_label_inc = y_label_inc[batch_size:]

            y_half = y[batch_size:]

            if second_half_batch:
                second_sim_map = filter_sim_map(sim_map, batch_size, min_case=True)
                batch_params = {'input_format': 'similarity_dict',
                    'required_format': 'dist_mat',
                    'data_size': len(y_half),
                    'n_clusters': None,
                    'distance_threshold': batch_dist_threshold}
                output_dict = clustering_method_call(second_sim_map, HAC_average, hyper_params=batch_params)
                new_y_half = output_dict['y_pred']
                y_half = new_y_half

            inc_result.append(v_measure_score(y_half, y_label_inc))
            inc_c.append(completeness_score(y_half, y_label_inc))
            inc_h.append(homogeneity_score(y_half, y_label_inc))
            x.append(dist)
        
        print('second half: ', len(y_label_inc))
        best_scores.append(np.max(inc_result))

        all_inc_result.append(inc_result)
        all_inc_c.append(inc_c)
        all_inc_h.append(inc_h)

    if weighted:
        inc_result = np.average(all_inc_result, axis=0, weights=sessions_count)
        inc_c = np.average(all_inc_c, axis=0, weights=sessions_count)
        inc_h = np.average(all_inc_h, axis=0, weights=sessions_count)
    else:
        inc_result = np.average(all_inc_result, axis=0)
        inc_c = np.average(all_inc_c, axis=0) 
        inc_h = np.average(all_inc_h, axis=0) 


    inc_result_err = np.std(all_inc_result, axis=0)
    inc_c_err = np.std(all_inc_c, axis=0)
    inc_h_err = np.std(all_inc_h, axis=0)

    inc_result_err_lower = inc_result - inc_result_err
    inc_result_err_upper = inc_result + inc_result_err
    inc_result_err_tup = list(zip(inc_result_err_lower, inc_result_err_upper))
    inc_result_err_tup = np.array(inc_result_err_tup).T
    # print('err tup: ', inc_result_err_tup)

    # print(list(y_pred_batch))
    # print(y[:len(y_pred_batch)])
    # print(v_measure_score(y[:len(y_pred_batch)], y_pred_batch))
    # print(purity(y[:len(y_pred_batch)], y_pred_batch))

    x.reverse()
    plt.errorbar(x, inc_result, yerr=inc_result_err, fmt='-o')
    
    title = ""
    if warm_start:
        title += 'Warm start, '
    else:
        title += 'Cold start, '

    if update_clusters:
        title += 'with merging clusters, '
    else:
        title += 'no merging clusters, '

    if fixed_window_size is not None:
        title += str(fixed_window_size) + ' day window'
    else:
        title += 'no limited window'

    plt.title(title)
    plt.xlabel('Similarity Threshold')
    plt.ylabel('Score')
    # plt.legend()
    plt.ylim(0.0, 1.0)
    plt.xlim(0.0, 1.0)
    plt.savefig(output_name + '.png')

    plt.clf()
    plt.hist(best_scores)
    # plt.savefig(output_name + '_best_hist.png')
    plt.clf()

    return x, inc_result, inc_c, inc_h
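
The same kind of toy entity can drive compare_incremental_multi; this function indexes entities by position, so they are passed as a list here. As before, the (i, j) keys and the epoch-second timestamps are assumptions about the project's data format.

toy_entity = {'similarity_graph': {(i, j): (1.0 if (i < 3) == (j < 3) else 0.1)
                                   for i in range(6) for j in range(i + 1, 6)},
              'id_to_cluster_map': {i: (0 if i < 3 else 1) for i in range(6)},
              'time_stamps': [1609459200 + i * 86400 for i in range(6)]}
x, v_scores, c_scores, h_scores = compare_incremental_multi(
    [toy_entity], 'toy_threshold_sweep',
    time_stamps_flag=True, batch_portion=0.5)
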
def incremental_average_SI(sim_map, hyper_params={}):
    """Assigns each new item to the closest existing cluster (under the configured
    linkage) if that distance is below dist_threshold, and starts a new cluster
    otherwise. Optionally merges clusters after each assignment and expires items
    that fall outside a sliding time window. Returns one cluster label per item."""
    params = {'dist_threshold': 0.5,
              'update_clusters': False,
              'clusters': [set([0])],
              'time_stamps': None,
              'window_size': 15,
              'data_size': None,
              'linkage': 'centroid',
              'max_sim_value': 1.0,
              'only_final_clusters': False,
              'normalize_flag': False,
              'keep_timestamp_results': {'active': False, 'v_scores': [], 'batch_params': {},
                                         'complete_batch': [], 'v_scores_complete': [],
                                         'size_list': [], 'h_scores': [], 'h_scores_complete': [],
                                         'c_scores': [], 'c_scores_complete': [],
                                         'v_scores_limited': []},
              'dist_mat': False}
    params.update(hyper_params)

    data_size = params['data_size']
    max_sim_value = params['max_sim_value']
    normalize_flag = params['normalize_flag']
    linkage = params['linkage']
    keep_timestamp_results = params['keep_timestamp_results']

    if params['dist_mat']:
        dist_mat = sim_map
    else:
        sim_mat = dict_to_mat(sim_map, data_size=data_size, max_val=max_sim_value)[0]
        dist_mat = sim_dist_convert(sim_mat, normalize_flag=normalize_flag)

    dist_threshold = params['dist_threshold']
    update_clusters = params['update_clusters']
    clusters = deepcopy(params['clusters'])

    time_stamps_orig = params['time_stamps']
    window_size = params['window_size']
    windowed_flag = False

    if time_stamps_orig is not None and window_size is not None:
        windowed_flag = True
        try:
            time_stamps = [np.datetime64(datetime.datetime.utcfromtimestamp(elem)) for elem in time_stamps_orig]
        except Exception:
            time_stamps = [np.datetime64(elem) for elem in time_stamps_orig]

    outdated_set = set()
    already_clustered = set()
    for cluster in clusters:
        already_clustered |= cluster

    first_window_move_flag = True
    first_window_move_idx = 0
    outdate_index = 0  # points to position after the last outdated element.
    for i in range(len(dist_mat)):
        if i in already_clustered:
            continue

        if windowed_flag:
            while (time_stamps[i] - time_stamps[outdate_index]) / np.timedelta64(86400000000, 'us') >= window_size:
                print((time_stamps[i] - time_stamps[outdate_index]) / np.timedelta64(86400000000, 'us'))
                outdated_set.add(outdate_index)
                outdate_index += 1
                if first_window_move_flag:
                    first_window_move_flag = False
                    first_window_move_idx = i

        best_cluster = -1
        min_dist = float("inf")
        for idx, cluster in enumerate(clusters):
            new_dist = average_dist_SI(dist_mat, cluster, i, outdated_set=outdated_set, linkage=linkage)
            if new_dist < min_dist:
                min_dist = new_dist
                best_cluster = idx 

        update_necessary = False
        if min_dist < dist_threshold:
            changed_cluster = best_cluster
            clusters[best_cluster].add(i)
            update_necessary = True
        else:
            changed_cluster = len(clusters)
            clusters.append(set([i]))

        if update_clusters and update_necessary:
            remaining_flag = True
            while remaining_flag:
                remaining_flag = False
                for c in range(len(clusters)):
                    if c == changed_cluster:
                        continue
                    cc_dist = compute_new_dist_SI(dist_mat, clusters[changed_cluster], clusters[c], outdated_set=outdated_set, linkage=linkage)
                    if cc_dist < dist_threshold:
                        remaining_flag = True
                        clusters[changed_cluster] = clusters[changed_cluster] | clusters[c]
                        if c >= len(clusters)-1:
                            clusters = clusters[:c]
                        else:
                            clusters = clusters[:c] + clusters[c+1:]
                        if c < changed_cluster:
                            changed_cluster -= 1
                        break

        if keep_timestamp_results['active']:
            # print('-------------')
            keep_timestamp_results['first_window_move_idx'] = first_window_move_idx
            tmp_y_inc = [-1 for _ in range(i+1)]
            for idx, cluster in enumerate(clusters):
                for elem in list(cluster):
                    tmp_y_inc[elem] = idx
            tmp_y_inc = tmp_y_inc[outdate_index:]
            # print('inc: ', tmp_y_inc)

            tmp_sim_map = filter_sim_map(sim_map, i)
            
            ### if we do not want to outdate:
            keep_timestamp_results['batch_params']['data_size'] = i+1
            keep_timestamp_results['size_list'].append(len(tmp_y_inc))
            tmp_batch_output = clustering_method_call(tmp_sim_map, 
                                                    HAC_average, 
                                                    hyper_params=keep_timestamp_results['batch_params'])
            tmp_y_batch = tmp_batch_output['y_pred']
            tmp_y_batch = tmp_y_batch[-len(tmp_y_inc):]

            keep_timestamp_results['v_scores'].append(v_measure_score(tmp_y_batch, tmp_y_inc)) 
            keep_timestamp_results['h_scores'].append(homogeneity_score(tmp_y_batch, tmp_y_inc)) 
            keep_timestamp_results['c_scores'].append(completeness_score(tmp_y_batch, tmp_y_inc)) 
 
            ### if we want to outdate:
            tmp_sim_map = filter_sim_map(tmp_sim_map, outdate_index, min_case=True)
            keep_timestamp_results['batch_params']['data_size'] = len(tmp_y_inc)
            tmp_batch_output = clustering_method_call(tmp_sim_map, 
                                                    HAC_average, 
                                                    hyper_params=keep_timestamp_results['batch_params'])
            tmp_y_batch = tmp_batch_output['y_pred']

            keep_timestamp_results['v_scores_limited'].append(v_measure_score(tmp_y_batch, tmp_y_inc)) 


            complete_batch = keep_timestamp_results['complete_batch']
            if len(complete_batch) > 0:
                keep_timestamp_results['v_scores_complete'].append(v_measure_score(complete_batch[outdate_index: i+1], tmp_y_inc))
                keep_timestamp_results['h_scores_complete'].append(homogeneity_score(complete_batch[outdate_index: i+1], tmp_y_inc))
                keep_timestamp_results['c_scores_complete'].append(completeness_score(complete_batch[outdate_index: i+1], tmp_y_inc))

    first_index = 0
    if params['only_final_clusters']:
        first_index = outdate_index

    y = [-1 for i in range(len(dist_mat))]
    for idx, cluster in enumerate(clusters):
        for elem in list(cluster):
            if elem < first_index:
                continue
            y[elem] = idx
    return y[first_index:]
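
A sketch of incremental_average_SI on a small precomputed distance matrix (dist_mat=True skips the similarity-dict conversion). average_dist_SI and compute_new_dist_SI are assumed importable from the project; the grouping noted below assumes the default centroid/average distance over cluster members.

toy_dist_mat = np.array([[0.0, 0.1, 0.9, 0.8],
                         [0.1, 0.0, 0.9, 0.9],
                         [0.9, 0.9, 0.0, 0.2],
                         [0.8, 0.9, 0.2, 0.0]])
labels = incremental_average_SI(toy_dist_mat,
                                hyper_params={'dist_mat': True,
                                              'dist_threshold': 0.5})
# Item 1 joins the initial cluster {0}; items 2 and 3 form a second cluster,
# so labels is expected to be [0, 0, 1, 1].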