Beispiel #1
0
    def fit(self, diags):
        """
        Calibration step: learn centers and inertias from the available diagrams.

        The diagrams are concatenated and clustered with k-means (the
        mini-batch variant when ``self.batch_size`` is truthy). Every point is
        then assigned to its nearest fitted center, and only the centers that
        received at least one point are kept.

        TODO (translated from the original, unfinished note): handle the case
        where ``diags`` is an Nx2 array.

        :param diags: list of diagrams from which to learn center locations and cluster spread
        :return: self, with ``centers``, ``inertias`` and ``r_centers`` updated
        """
        if self.n_calib > 0:
            # Calibrate on the first n_calib diagrams only.
            diags = diags[:self.n_calib]
        elif self.n_calib == 0:
            # No calibration data requested: use uniformly random points.
            diags = np.random.rand(self.n_centers, 2)

        sample_weights = self.weighting_method(diags)
        points = np.concatenate(diags)

        if self.batch_size:
            quantiser = MiniBatchKMeans(batch_size=self.batch_size)
        else:
            quantiser = KMeans()
        quantiser.n_clusters = self.r_centers
        quantiser.fit(points, sample_weight=sample_weights)

        # Nearest-center assignment; clusters that attract no point are dropped.
        point_to_center = pairwise.pairwise_distances(
            points, Y=quantiser.cluster_centers_)
        used = np.unique(np.argmin(point_to_center, axis=1))
        self.centers = quantiser.cluster_centers_[used, :]

        # Inertia of a center = half the distance to its closest other center.
        center_gaps = pairwise.pairwise_distances(self.centers)
        np.fill_diagonal(center_gaps, np.inf)
        self.inertias = center_gaps.min(axis=0) / 2
        self.r_centers = used.size
        return self
def estimate_clusters(data):
    """Score k-means over a range of cluster counts using cross-validation.

    ``data`` is unpacked as ``(features, _, labels)``; the middle element is
    ignored. For every cluster count from 1 up to (but not including) the
    number of feature columns, the mean cross-validated
    ``adjusted_rand_score`` is recorded.

    Returns a DataFrame with columns ``clusters``, ``score`` and ``algo``
    (the latter always ``'kmeans'``).
    """
    features, _, labels = data
    model = KMeans()
    upper_bound = features.shape[1]

    records = []
    for count in range(1, upper_bound):
        model.n_clusters = count
        fold_scores = cross_val_score(model, features, labels,
                                      scoring='adjusted_rand_score')
        records.append([count, np.mean(fold_scores)])

    table = pd.DataFrame.from_records(records, columns=['clusters', 'score'])
    table['algo'] = 'kmeans'
    return table
def estimate_clusters(data):
    """Tabulate the cross-validated k-means score for each cluster count.

    ``data`` must unpack into ``(features, _, labels)``. Cluster counts run
    from 1 to ``features.shape[1] - 1``; each one is scored with the mean of
    ``cross_val_score(..., scoring='adjusted_rand_score')``.

    Returns a DataFrame with one row per cluster count and the columns
    ``clusters``, ``score`` and ``algo`` (constant ``'kmeans'``).
    """
    features, _, labels = data
    estimator = KMeans()

    def mean_cv_score(n):
        # The same estimator instance is reused; only n_clusters changes.
        estimator.n_clusters = n
        return np.mean(
            cross_val_score(estimator,
                            features,
                            labels,
                            scoring='adjusted_rand_score'))

    rows = [[n, mean_cv_score(n)] for n in range(1, features.shape[1])]
    frame = pd.DataFrame.from_records(rows, columns=['clusters', 'score'])
    frame['algo'] = 'kmeans'
    return frame
Beispiel #4
0
def calculate_best_cluster_number(data, n_sequence, n_samples=5000):
    """
    Pick the number of clusters (hyper-parameter K) with the best silhouette score.

    A random sub-sample of ``data`` is clustered with k-means once per
    candidate value; the candidate whose labelling obtains the highest
    silhouette score is returned (the first one on ties, as ``np.argmax``
    returns the first maximum).

    :param data: input data; it is indexed with an integer index array, so it
        must support NumPy-style fancy indexing
    :param n_sequence: candidate cluster numbers. Any iterable is accepted
        (list, range, array, generator, ...); it is consumed exactly once.
    :param n_samples: number of rows drawn from ``data`` for the evaluation
    :return: best number of clusters
    :raises ValueError: if ``n_sequence`` yields no candidates
    """
    # Materialise first: the candidates are iterated and then indexed, which
    # would fail for one-shot iterables such as generators.
    candidates = list(n_sequence)
    if not candidates:
        raise ValueError('n_sequence must contain at least one cluster number')

    # np.random.choice samples with replacement by default, so n_samples may
    # exceed len(data) (rows are then repeated).
    idx = np.random.choice(len(data), size=n_samples)
    data_train = data[idx]

    model = KMeans(random_state=123)
    scores = []
    for cluster_number in candidates:
        model.n_clusters = cluster_number
        model.fit(data_train)
        scores.append(silhouette_score(data_train, model.labels_))

    return candidates[int(np.argmax(scores))]
def cell_assemblies(binned_counts, null_method = 'analytical', wts_method = 'pca', rm_diagonal=False):
    """
    Find significant correlation patterns ("assemblies") in binned counts.

    The counts are normalised with the external helper ``zsc`` (presumably a
    per-column z-score -- confirm against its definition), their correlation
    matrix is eigen-decomposed, and eigenvalues above a null cutoff are
    considered significant.

    Parameters
    ----------
    binned_counts : 2-D array of shape (T, nn)
        Presumably time bins x neurons -- TODO confirm with callers.
    null_method : {'analytical', 'bin_shuffle'}
        'analytical' uses the Marchenko-Pastur bounds
        ``(1 +/- sqrt(nn / T)) ** 2`` as cutoff; 'bin_shuffle' estimates the
        cutoff from the eigenvalue histogram of 100 column-wise shuffles.
    wts_method : {'pca', 'av'}
        'pca' projects onto the significant eigenvectors; 'av' builds
        assembly vectors by clustering (needs ``null_method='analytical'``).
        Any other value falls through and the function returns ``None``.
    rm_diagonal : bool
        If true, zero the diagonal of the correlation matrix before the
        eigen-decomposition.

    Returns
    -------
    'pca': ``(R, vect_sig, w[eig_sig], 0, 0)``
    'av' : ``(R, AV, w[w > lamb_max], dig_M, clusts)``
    where ``R`` is the (T, n_assemblies) activation-strength array.

    Raises
    ------
    Exception
        For the unsupported combination bin_shuffle + av, or when the shuffle
        procedure yields no cutoff.
    ValueError
        For an unknown ``null_method``.
    """
    if null_method == 'bin_shuffle' and wts_method == 'av':
        raise Exception('AV method needs analytical null_method')

    T, nn = binned_counts.shape

    zsc_binned_spikes = zsc(binned_counts)

    # Correlation matrix. np.asmatrix replaces the np.mat alias, which was
    # removed in NumPy 2; '*' below is therefore matrix multiplication.
    C = np.asmatrix(zsc_binned_spikes).T*np.asmatrix(zsc_binned_spikes) / float(zsc_binned_spikes.shape[0])

    # Analytical distribution of eigenvalues:
    if null_method == 'analytical':
        q = T/float(nn)
        sig_sq = 1 #Zscored
        lamb_max = sig_sq*(1+np.sqrt(1/q))**2
        lamb_min = sig_sq*(1-np.sqrt(1/q))**2
        null_cutoff = lamb_max

    # Time shuffle method:
    elif null_method == 'bin_shuffle':
        eigs = []
        bins = np.linspace(0., 1.5, 50)
        for sim in range(100):
            shuff_bin = np.zeros_like(zsc_binned_spikes)
            for col in range(nn):
                # Shuffle a COPY: shuffling the slice itself would permute
                # zsc_binned_spikes in place and corrupt the activation
                # strengths computed from it further down.
                tmp = zsc_binned_spikes[:, col].copy()
                np.random.shuffle(tmp)
                shuff_bin[:, col] = tmp
            C_shuf = (shuff_bin.T * np.asmatrix(shuff_bin))/float(T)
            e = np.linalg.eigvalsh(C_shuf)
            counts, _ = np.histogram(e, bins)
            print('sim:  %s' % sim)
            eigs.append(list(counts))
        tmp = np.cumsum(np.sum(np.vstack((eigs)), axis=0))
        tmp2 = tmp / float(np.max(tmp))
        null_ = np.nonzero(tmp2==1.)[0]
        if len(null_)> 0:
            null_cutoff = bins[null_[0]]
        else:
            raise Exception('No null cutoff, method: bin_shuffle')

    else:
        # Previously this surfaced later as a NameError on null_cutoff.
        raise ValueError('Unknown null_method: %r' % (null_method,))

    C_adj = C.copy()
    if rm_diagonal:
        np.fill_diagonal(C_adj, 0)

    w, v = np.linalg.eig(C_adj)
    eig_sig = w > null_cutoff
    vect_sig = v[:, eig_sig]

    ############# PCA METHOD ################
    if wts_method== 'pca':

        #Calculate activation strength
        n_sig = int(np.sum(eig_sig))
        R = np.zeros((T, n_sig))
        for t in range(T):
            for e in range(n_sig):
                R[t, e] = zsc_binned_spikes[t,:]*vect_sig[:,e]*vect_sig[:,e].T*np.asmatrix(zsc_binned_spikes[t,:]).T
        return R, vect_sig, w[eig_sig], 0, 0

    ############# AV METHOD ################
    elif wts_method == 'av':
        PAS = vect_sig*vect_sig.T
        N = PAS*C

        #Length of 'neuron vector':
        N_len = np.linalg.norm(N, axis=1)

        #Number of sig neurons:
        sig_ix = np.nonzero(np.logical_or(w > lamb_max, w < lamb_min))[0]
        nn_sig = len(sig_ix)

        M = np.zeros((nn, nn_sig))
        for i in range(nn):
            for ij, j in enumerate(sig_ix):
                M[i, ij] = N[:, i].T * N[:, j]

        from sklearn.cluster import KMeans
        # Split the entries of M into two groups (high / low) with k-means.
        M_col = M.reshape(1, -1).T
        KM = KMeans()
        KM.n_clusters = 2
        KM.fit(M_col)
        s = KM.predict(M_col)

        # Check which group holds the higher VALUES. The k-means label
        # numbers are arbitrary, so the comparison must use the clustered
        # data, not the labels themselves (mean of labels is always 1 > 0).
        ix1 = np.nonzero(s == 1)
        ix0 = np.nonzero(s == 0)
        if np.mean(M_col[ix1]) > np.mean(M_col[ix0]):
            ix_corr = ix1
        else:
            ix_corr = ix0

        m = np.zeros_like(M_col)
        m[ix_corr] = 1.
        dig_M = m.reshape(M.shape)
        clusts = clust_alg(dig_M)
        n_sig_ass = int(np.sum(w>lamb_max))
        AV = np.zeros((nn, n_sig_ass))
        for i in range(n_sig_ass):
            try:
                AV[:, i] = np.squeeze(np.sum(N[:, sig_ix[clusts[i]]], axis=1)*(1/float(len(clusts[i]))))
            except Exception:
                # Best effort: leave the column at zero when clust_alg
                # produced no usable cluster for this assembly.
                print('No cluster for : %s' % i)
        R = np.zeros((T, n_sig_ass))

        # NOTE(review): AV is a plain ndarray, so the first two '*' here are
        # element-wise, unlike the matrix products in the 'pca' branch --
        # confirm this is the intended activation strength.
        for t in range(T):
            for e in range(n_sig_ass):
                R[t, e] = zsc_binned_spikes[t, :]*AV[:, e]*AV[:,e].T*np.asmatrix(zsc_binned_spikes[t, :]).T
        return R, AV, w[w>lamb_max], dig_M, clusts
def clusterize_data(data, k=None, range_k=list(range(2,10)),algorithm='k-means'):
    """
    Cluster the data with the chosen algorithm and the specified value of 'k'. If 'k' is None, the "best" partition within
    the given range of cluster numbers is searched for: the clustering algorithm is run for every value in 'range_k',
    each partition is evaluated with the silhouette index, and the one with the highest index is returned.

    Parameters:

        data: array of floats [n_samples,n_features]

            The data to be clustered.

        k: integer, at least 2. default: None

            The number of clusters of the data. If this value is None, the number is chosen by the silhouette index.

        range_k: list of integers. default [2,...,9]

            The candidate numbers of clusters. The lowest value cannot be smaller than 2. The greatest value cannot be greater than n_samples - 1.
            Ignored when 'k' is given. The default list is never modified.

        algorithm: string, default: 'k-means'

            The clustering algorithm to be used. Allowed: ['k-means','hierarchical-average','hierarchical-complete','hierarchical-single']

    Returns:

        labels: array of integers [n_samples,]

            The cluster assigned to each sample.

    Raises:

        ValueError: unknown algorithm, k < 2, or no candidate number of clusters.
    """

    # Check the input algorithm
    allowed_algs = ['k-means','hierarchical-average','hierarchical-complete','hierarchical-single']
    if algorithm not in allowed_algs:
        raise ValueError('Algorithm not allowed: \'' + algorithm + '\'. Allowed ones: [' + ','.join(allowed_algs) + ']')

    # Resolve the candidate numbers of clusters
    if k is not None:
        if k < 2:
            raise ValueError('Invalid value of "k". It should be at least 2')
        candidate_ks = [k]
    else:
        candidate_ks = list(range_k)
    if not candidate_ks:
        raise ValueError('No candidate number of clusters: "range_k" is empty')

    # Set up the clusterer
    use_kmeans = algorithm == 'k-means'
    km = None
    Z = None
    if use_kmeans:
        km = KMeans()
    else:
        # linkage() expects the CONDENSED distance vector returned by pdist.
        # A square (squareform) matrix would be interpreted as n observations
        # with n features each and silently produce a different clustering.
        type_linkage = algorithm.split('-')[1]
        Z = hierarchy.linkage(pdist(data), type_linkage)

    labels_k = []
    silhouette_k = []

    # For each candidate, cluster the data and evaluate the silhouette.
    for n_clusters in candidate_ks:
        if use_kmeans:
            km.n_clusters = n_clusters
            l_k = km.fit_predict(data)
        else:
            l_k = hierarchy.fcluster(Z, n_clusters, criterion='maxclust')

        labels_k.append(l_k)
        silhouette_k.append(silhouette_score(data, l_k))

    # Return the labels with the best [maximum] silhouette index.
    return labels_k[int(np.argmax(silhouette_k))]
Beispiel #7
0
def build_vocab_from_params(k, c_vecs, inv_indices):
    """Assemble a ``KMeans`` object from precomputed parameters, without fitting.

    The returned estimator has ``n_clusters`` set to ``k``,
    ``cluster_centers_`` set to ``c_vecs`` and the extra attribute
    ``inv_indices_`` set to ``inv_indices``.
    """
    vocab = KMeans()
    params = (
        ('n_clusters', k),
        ('cluster_centers_', c_vecs),
        ('inv_indices_', inv_indices),
    )
    for attr_name, value in params:
        setattr(vocab, attr_name, value)
    return vocab
def test(board):
    """Fit an 8-cluster ``KMeans`` on ``norm(board)`` and return the fitted estimator."""
    clusterer = KMeans()
    clusterer.n_clusters = 8
    normalized_board = norm(board)
    clusterer.fit(normalized_board)
    return clusterer
Beispiel #9
0
    def _augment_core(
            self, X: np.ndarray, Y: Optional[np.ndarray]
    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """
        Quantize every series of ``X`` to a limited number of levels.

        ``X`` is unpacked as ``(N, T, C)``; each of the ``N * C`` series of
        length ``T`` is quantized independently. The input is never modified.

        The number of levels is taken from ``self.n_levels``:

        * an ``int``: the same number for every series;
        * a ``list``: drawn at random from the list;
        * otherwise a pair ``(low, high)``: drawn at random from
          ``range(low, high)``.

        With ``self.per_channel`` the draw is independent per channel,
        otherwise one draw per sample is shared by all its channels.

        ``self.how`` selects the quantization:

        * ``"uniform"``: equal-width bins between the series min and max,
          values replaced by the bin centers;
        * ``"quantile"``: equal-population bins, values replaced by the
          quantile at the middle of their bin;
        * anything else: 1-D k-means, values replaced by their cluster center
          (requires scikit-learn).

        :param X: array of shape (N, T, C)
        :param Y: optional labels, returned as an unmodified copy
        :return: ``(X_aug, Y_aug)``; ``Y_aug`` is ``None`` when ``Y`` is ``None``
        :raises ImportError: if k-means quantization is requested and
            scikit-learn is not installed
        """
        rand = np.random.RandomState(self.seed)
        N, T, C = X.shape

        # Resolve the number of levels into an (N, 1, C) integer array.
        if isinstance(self.n_levels, int):
            n_levels = (np.ones((N, 1, C)) * self.n_levels).astype(int)
        else:
            if isinstance(self.n_levels, list):
                choices = self.n_levels
            else:
                choices = range(self.n_levels[0], self.n_levels[1])
            if self.per_channel:
                n_levels = rand.choice(choices, size=(N, 1, C)).astype(int)
            else:
                n_levels = rand.choice(choices, size=(N, 1, 1)).astype(int)
                n_levels = np.repeat(n_levels, C, axis=2)

        if self.how == "uniform":
            series_min = X.min(axis=1, keepdims=True)
            series_max = X.max(axis=1, keepdims=True)
            series_range = series_max - series_min
            series_range[series_range == 0] = 1  # constant series: avoid 0/0
            X_aug = (X - series_min) / series_range
            # Bin index = floor(x * n). Rounding here would push the upper
            # half of every bin into the next one, which contradicts the
            # +0.5 bin-center offset applied below.
            X_aug = np.floor(X_aug * n_levels)
            X_aug = X_aug.clip(0, n_levels - 1)  # the maximum falls in the last bin
            X_aug = (X_aug + 0.5) / n_levels
            X_aug = X_aug * series_range + series_min
        elif self.how == "quantile":
            n_levels = n_levels.flatten()
            # One row per (sample, channel) series; built from a copy of X.
            X_aug = X.copy().swapaxes(1, 2).reshape((N * C, T))
            for row, levels in zip(X_aug, n_levels):
                # np.percentile expects percentages in [0, 100].
                edges = np.percentile(
                    row, np.arange(levels + 1) / levels * 100)
                centers = np.percentile(
                    row, np.arange(0.5, levels) / levels * 100)
                # digitize() is 1-based for values inside the first bin, so
                # shift to 0-based; the clip folds the maximum (which lands
                # past the last edge) into the last bin.
                bin_idx = (np.digitize(row, edges) - 1).clip(0, levels - 1)
                row[:] = centers[bin_idx]
            X_aug = X_aug.reshape(N, C, T).swapaxes(1, 2)
        else:
            try:
                from sklearn.cluster import KMeans
            except ImportError as err:
                raise ImportError(
                    "To use kmeans quantization, sklearn>=0.22 must be installed."
                ) from err
            n_levels = n_levels.flatten()
            # Reshape the COPY: reshaping X itself can yield a view (e.g. for
            # a single channel), and the in-place writes below would then
            # modify the caller's array.
            X_aug = X.copy().swapaxes(1, 2).reshape((N * C, T))
            # n_jobs is not passed: recent scikit-learn releases removed that
            # parameter from KMeans and raise TypeError when it is given.
            model = KMeans(n_clusters=2, random_state=self.seed)
            for row, levels in zip(X_aug, n_levels):
                model.n_clusters = int(levels)
                ind = model.fit_predict(row.reshape(-1, 1))
                row[:] = model.cluster_centers_[ind, :].flatten()
            X_aug = X_aug.reshape(N, C, T).swapaxes(1, 2)

        Y_aug = Y.copy() if Y is not None else None

        return X_aug, Y_aug