Ejemplo n.º 1
0
def test_discrete_clusters():
    """Check that every discrete clustering model yields label-pure clusters.

    Each model name is run through ``hyp.cluster`` twice: once on the stacked
    ``clusters`` dataset and once on the list ``[cluster1, cluster2]``. In both
    cases every estimated cluster must be dominated by a single true label.
    """

    def homogeneity_test(estimates, truth, threshold=0.95):
        """Assert each estimated cluster is at least ``threshold`` pure.

        ``estimates`` and ``truth`` are indexed positionally (``.values`` /
        ``.iloc``). Purity is measured against the true labels 0 and 1.
        """
        for label in np.unique(estimates.values):
            # -1 is skipped (presumably the "noise" label of density-based
            # models such as DBSCAN/OPTICS -- confirm against hypertools).
            if label == -1:
                continue

            members = np.where(estimates.values == label)[0]
            if len(members) == 0:
                continue

            frac_zeros = np.sum(truth.iloc[members].values == 0) / len(members)
            frac_ones = np.sum(truth.iloc[members].values == 1) / len(members)
            assert (frac_zeros >= threshold) or (frac_ones >= threshold)

    models = [
        'AffinityPropagation',
        'AgglomerativeClustering',
        'Birch',
        'DBSCAN',
        'OPTICS',
        'FeatureAgglomeration',
        'KMeans',
        'MiniBatchKMeans',
        'MeanShift',
        'SpectralClustering',
    ]

    for model_name in models:
        # single stacked dataset
        stacked_labels = hyp.cluster(clusters, model=model_name)
        homogeneity_test(stacked_labels, true_labels)

        # list of datasets: labels come back per dataset, so the true labels
        # are split at the boundary between cluster1 and cluster2
        per_dataset_labels = hyp.cluster([cluster1, cluster2], model=model_name)
        boundary = cluster1.shape[0]
        homogeneity_test(per_dataset_labels[0], true_labels.iloc[:boundary])
        homogeneity_test(per_dataset_labels[1], true_labels.iloc[boundary:])
Ejemplo n.º 2
0
def cluster(
    x,
    n_clusters=5
):
    """Split the rows of ``x`` into k-means clusters found by hypertools.

    Parameters
    ----------
    x : pandas.DataFrame
        One row per video and one column per timepoint/topic, e.g. each row is
        a flattened (raveled) topic trajectory.
    n_clusters : int, optional
        Number of clusters to request from ``hyp.cluster`` (default 5).

    Returns
    -------
    clusters : list of pandas.DataFrame
        One entry per distinct label, in sorted label order. ``clusters[0]`` is
        a (videos in that cluster) by (timepoints * topics) dataframe, and
        ``clusters[0].iloc[0]`` is the reshaped trajectory of its first video.
    clustered_labels
        The labels exactly as returned by ``hyp.cluster``.
    """
    # Honor the caller's n_clusters (it was previously hard-coded to 5).
    clustered_labels = hyp.cluster(x, cluster='KMeans', n_clusters=n_clusters)

    # Compare against an array: if hyp.cluster hands back a plain list,
    # ``list == k`` is a single False and every cluster would come out empty.
    label_array = np.asarray(clustered_labels)

    clusters = []
    for k in np.unique(label_array):
        inds = np.where(label_array == k)[0]
        clusters.append(x.iloc[inds].copy())
    return clusters, clustered_labels
Ejemplo n.º 3
0
def test_cluster_mixture():
    """Check that mixture models return valid per-sample mixture proportions.

    For each mixture model, ``hyp.cluster`` is called in
    ``'fit_predict_proba'`` mode and the result is expected to have one row
    per sample in ``clusters`` and one column per mixture component, with
    every row being a probability distribution.
    """
    n_components = 3
    mode = 'fit_predict_proba'
    models = ['GaussianMixture', 'BayesianGaussianMixture']

    for m in models:
        next_model = {'model': m, 'args': [], 'kwargs': {'n_components': n_components}}
        mixture_proportions = hyp.cluster(clusters, model=next_model, mode=mode)

        # one column per requested component (tied to n_components rather than
        # a separate literal, so the two cannot drift apart)
        assert mixture_proportions.shape == (clusters.shape[0], n_components)
        # every component receives some weight from at least one sample
        assert np.all(np.sum(np.abs(mixture_proportions), axis=0) > 0)
        # entries are probabilities...
        assert np.all(mixture_proportions >= 0)
        assert np.all(mixture_proportions <= 1)
        # ...and each sample's proportions sum to one
        assert np.allclose(np.sum(mixture_proportions, axis=1), 1)
Ejemplo n.º 4
0
# -*- coding: utf-8 -*-
"""
============================================
Using the cluster function to label clusters
============================================

Here is an example where we generate some synthetic data, and then use the
cluster function to get cluster labels, which we can then pass to the `group`
kwarg to color our points by cluster.
"""

# Code source: Andrew Heusser
# License: MIT

# import
import hypertools as hyp
import numpy as np
from scipy.stats import multivariate_normal

# Simulate two 3-dimensional Gaussian clusters of 100 points each, both with
# identity covariance; the second is centered at (3, 3, 3) instead of the origin.
cluster1 = np.random.multivariate_normal(mean=np.zeros(3), cov=np.eye(3), size=100)
cluster2 = np.random.multivariate_normal(mean=np.full(3, 3.0), cov=np.eye(3), size=100)
# Stack the clusters row-wise into a single (200, 3) dataset.
data = np.concatenate([cluster1, cluster2], axis=0)

# Ask hypertools for a two-cluster labeling of the combined data.
cluster_labels = hyp.cluster(data, n_clusters=2)

# Scatter-plot the points, colored by their assigned cluster label.
hyp.plot(data, '.', group=cluster_labels)
Ejemplo n.º 5
0
# Dimensionality reduction: plot the data once per reduction model, in this
# order. The trailing notes record what the original author observed.
for reduce_model in (
        'FastICA',
        'FactorAnalysis',
        'TruncatedSVD',  # same results as PCA
        'DictionaryLearning',  # took a long time to run
        'MiniBatchDictionaryLearning',
        'TSNE',  # takes a long time to run
        'Isomap',  # memory error
        'SpectralEmbedding',  # system hangs
        'LocallyLinearEmbedding',
        'MDS',  # memory error
):
    geo = hyp.plot(data, '.', reduce=reduce_model)

# PCA with whitening, passed as a model/params dictionary.
whitened_pca = {'model': 'PCA', 'params': {'whiten': True}}
geo = hyp.plot(data, '.', reduce=whitened_pca)

# Fit Birch on a random sample of 10000 row positions (np.random.choice draws
# with replacement by default), then apply the result to the full dataset.
sampled_rows = np.random.choice(len(data), 10000)
training_set = data.iloc[sampled_rows, :]
birch = hyp.cluster(training_set, cluster='Birch')
all_birch = birch.apply(data)

geo_cluster = hyp.plot(training_set, '.', cluster='HDBSCAN', n_clusters=6)

# Clustering

# default clustering model, 6 clusters
geo_cluster = hyp.plot(data, '.', n_clusters=6)

# explicit clustering models, 8 clusters each
for cluster_model in (
        'KMeans',
        'MiniBatchKMeans',
        'AgglomerativeClustering',  # memory error
):
    geo_cluster = hyp.plot(data, '.', cluster=cluster_model, n_clusters=8)
geo_cluster = hyp.plot(
    data, '.', cluster='Birch', n_clusters=8