コード例 #1
0
def hac_clustering_experiment(datasets,
                              save_results_path=None,
                              preprocess=None,
                              verbose=True,
                              return_datasets=False):
    """Run agglomerative clustering over a grid of affinities and linkages.

    For every dataset, every affinity in
    ``['l1', 'l2', 'cosine', 'canberra', 'correlation', 'rbf']`` and every
    linkage in ``['complete', 'average']``, the data is clustered with
    ``AgglomerativeClustering`` and the prediction is scored against the
    ground-truth labels.

    Parameters
    ----------
    datasets : mapping
        Maps a dataset name to its entry. Unless a preprocessing callable is
        registered for that name, the entry is indexed with ``'data'`` and
        ``'labels'``.
    save_results_path : path-like or None
        When not ``None``, the (partially filled) results are written with
        ``np.savez_compressed`` after every affinity/linkage combination, so
        the file always holds the latest progress.
    preprocess : mapping or None
        Optional mapping from dataset name to a callable that receives the
        dataset entry and returns ``(data, labels)``.
    verbose : bool
        Print progress and scores.
    return_datasets : bool
        When true, the flattened data and labels actually used for each
        dataset are appended to the returned tuple.

    Returns
    -------
    tuple
        ``(result, affinity_names, linkage, dataset_names, n_clusters)`` and,
        if ``return_datasets`` is true, ``updated_datasets`` as a sixth item.
        ``result`` has shape
        ``(len(datasets), len(affinity_names), len(linkage), 3)``; the last
        axis holds the adjusted Rand score, the adjusted mutual information
        score and the Fowlkes-Mallows score, in that order. ``n_clusters`` is
        the cluster count of the last dataset processed, or ``None`` when
        ``datasets`` is empty.
    """
    affinity_names = ['l1', 'l2', 'cosine', 'canberra', 'correlation', 'rbf']
    linkage = ['complete', 'average']
    # Affinities that are handed to the estimator as a precomputed matrix.
    precomputed_affinities = {'canberra', 'correlation', 'rbf'}

    updated_datasets = {}
    dataset_names = list(datasets.keys())
    result = np.zeros(
        [len(dataset_names), len(affinity_names), len(linkage), 3])
    # Stays None when there is no dataset, so the return below never hits an
    # unbound local.
    n_clusters = None

    for i, dataset_name in enumerate(dataset_names):
        if verbose:
            print(f'\t\t\t {dataset_name}')

        if (preprocess is not None) and (dataset_name in preprocess):
            T, labels = preprocess[dataset_name](datasets[dataset_name])
            T = numpy_tools.flatten_np(T)
        else:
            T = numpy_tools.flatten_np(datasets[dataset_name]['data'])
            labels = datasets[dataset_name]['labels']
        if return_datasets:
            updated_datasets[dataset_name] = {'data': T, 'labels': labels}

        # One cluster per distinct ground-truth label.
        n_clusters = len(np.unique(labels))
        clustAlg = AgglomerativeClustering(n_clusters=n_clusters)
        # scikit-learn renamed ``affinity`` to ``metric`` (deprecated in 1.2,
        # removed in 1.4). Setting a non-existent attribute would be silently
        # ignored, so use whichever parameter this version exposes.
        if 'metric' in clustAlg.get_params(deep=False):
            metric_param = 'metric'
        else:
            metric_param = 'affinity'

        for k_affinity, current_affinity in enumerate(affinity_names):
            if verbose:
                print(f'\t\t Affinity: {current_affinity}')

            # The fit input depends only on the affinity, so build it once
            # here instead of once per linkage.
            if current_affinity in precomputed_affinities:
                fit_metric = 'precomputed'
                if current_affinity == 'rbf':
                    # Negated RBF similarity of the Euclidean distance:
                    # -exp(-d) grows with d, so it is used as a dissimilarity.
                    fit_input = pairwise_distances(T, metric='euclidean')
                    fit_input = -np.exp(-fit_input)
                else:
                    fit_input = pairwise_distances(T, metric=current_affinity)
            else:
                fit_metric = current_affinity
                fit_input = T

            for k_linkage, current_linkage in enumerate(linkage):
                if verbose:
                    print(f'\t Linkage: {current_linkage}')

                clustAlg.set_params(**{metric_param: fit_metric,
                                       'linkage': current_linkage})
                pred = clustAlg.fit_predict(fit_input)

                result[i, k_affinity, k_linkage, 0] = adjusted_rand_score(
                    labels, pred)
                result[i, k_affinity, k_linkage, 1] = (
                    adjusted_mutual_info_score(
                        labels, pred, average_method='arithmetic'))
                result[i, k_affinity, k_linkage, 2] = fowlkes_mallows_score(
                    labels, pred)

                if save_results_path is not None:
                    # Checkpoint after every combination; n_clusters is the
                    # value for the dataset currently being processed.
                    np.savez_compressed(save_results_path,
                                        result=result,
                                        affinity_names=affinity_names,
                                        linkage=linkage,
                                        dataset_names=dataset_names,
                                        n_clusters=n_clusters)
                if verbose:
                    print(f'ARI={result[i, k_affinity, k_linkage, 0]:.3f} '
                          f'AMI={result[i, k_affinity, k_linkage, 1]:.3f} '
                          f'FMI={result[i, k_affinity, k_linkage, 2]:.3f} ')

    if return_datasets:
        return (result, affinity_names, linkage, dataset_names, n_clusters,
                updated_datasets)
    return result, affinity_names, linkage, dataset_names, n_clusters