# Example #1
def generate_clustering(loom,
                        layername,
                        clustering_depth=3,
                        starting_clustering_depth=0,
                        max_clusters='sqrt_rule',
                        mode='pca',
                        silhouette_threshold=0.1,
                        clusteringcachedir='clusteringcachedir/'):
    """Iteratively subcluster the cells of a loom file.

    Writes one column attribute per iteration (``ClusteringIteration0``,
    ``ClusteringIteration1``, ...).  Iteration 0 clusters all cells on the
    precomputed PCA/NMF loading columns already stored in ``loom.ca``; each
    subsequent iteration re-embeds and re-clusters the cells of every cluster
    found at the previous iteration, extending the label path (e.g. parent
    ``'0'`` with child ``1`` becomes ``'0-1'``).

    Parameters
    ----------
    loom :
        LoomConnection whose ``ca``/``attrs`` hold the precomputed loadings
        and component counts for ``layername``.
    layername :
        Name of the loom layer to cluster.
    clustering_depth : int
        Total number of clustering iterations to compute.
        (Default value = 3)
    starting_clustering_depth : int
        Iteration to resume from; 0 recomputes from scratch.  When > 0,
        ``ClusteringIteration{starting_clustering_depth - 1}`` must already
        exist in ``loom.ca``.  (Default value = 0)
    max_clusters :
        Hard cap on clusters per subclustering run, or the string
        ``'sqrt_rule'`` meaning ``floor(sqrt(n_cells))``.
        (Default value = 'sqrt_rule')
    mode : str
        Embedding used for clustering, either ``'pca'`` or ``'nmf'``.
        (Default value = 'pca')
    silhouette_threshold : float
        Score threshold passed through to ``get_subclustering``.
        (Default value = 0.1)
    clusteringcachedir : str
        Cache directory passed through to ``get_subclustering``.
        (Default value = 'clusteringcachedir/')

    Returns
    -------
    None
        Results are written in place to ``loom.ca``.
    """
    # Validate up front; also reject negative starting depths, which would
    # otherwise crash later on a missing ClusteringIteration{n} key.
    if (not isinstance(clustering_depth, int) or clustering_depth < 1
            or not isinstance(starting_clustering_depth, int)
            or starting_clustering_depth < 0):
        raise Exception(
            "clustering_depth and starting_clustering_depth must be natural numbers."
        )
    if (starting_clustering_depth > 0) and (
            'ClusteringIteration{}'.format(starting_clustering_depth - 1)
            not in loom.ca.keys()):
        raise Exception(
            "starting_clustering_depth not yet computed; please run with lower starting_clustering_depth, or 0"
        )
    if mode not in ['pca', 'nmf']:
        raise Exception("Currently only implemented for modes:  pca and nmf")

    import numpy as np  # NOTE(review): presumably also imported at module level; local import is a safe hedge
    from time import time
    from sklearn.decomposition import IncrementalPCA
    from tqdm import tqdm
    from panopticon.analysis import get_subclustering
    if mode == 'pca':
        from sklearn.decomposition import PCA
    elif mode == 'nmf':
        from sklearn.decomposition import NMF

    def _subcluster(X):
        # Resolve the 'sqrt_rule' heuristic to a concrete cluster cap, then
        # delegate to get_subclustering (deduplicates the former repeated
        # if/else; the sqrt rule itself is still a heuristic — S Markson).
        if max_clusters == 'sqrt_rule':
            cap = int(np.floor(np.sqrt(X.shape[0])))
        else:
            cap = max_clusters
        return get_subclustering(X,
                                 silhouette_threshold,
                                 max_clusters=cap,
                                 clusteringcachedir=clusteringcachedir)

    if starting_clustering_depth == 0:
        # Iteration 0: assemble the (cells x components) matrix from the
        # precomputed per-component loading columns and cluster everything.
        if mode == 'nmf':
            n_components = loom.attrs['NumberNMFComponents']
            columns = [
                '{} NMF Loading Component {}'.format(layername, x)
                for x in range(1, n_components + 1)
            ]
        elif mode == 'pca':
            n_components = loom.attrs['NumberPrincipalComponents_{}'.format(
                layername)]
            columns = [
                '{} PC {} Loading'.format(layername, x)
                for x in range(1, n_components + 1)
            ]
        X = np.vstack([loom.ca[col] for col in columns]).T
        loom.ca['ClusteringIteration0'] = _subcluster(X)
        starting_clustering_depth = 1

    for subi in range(starting_clustering_depth, clustering_depth):
        # Unassigned cells are marked 'U'; they are skipped below.
        loom.ca['ClusteringIteration{}'.format(subi)] = ['U'] * len(
            loom.ca['ClusteringIteration{}'.format(subi - 1)])

        for cluster in set([
                x for x in loom.ca['ClusteringIteration{}'.format(subi - 1)]
                if x != 'U'
        ]):
            # Cells belonging to this parent cluster.
            mask = loom.ca['ClusteringIteration{}'.format(subi -
                                                          1)] == cluster
            start = time()
            data_c = loom[layername][:, mask]
            print("processing cluster", cluster, "; time to load: ",
                  time() - start, ", mask size: ", np.sum(mask))
            if mode == 'nmf':
                model = NMF(n_components=np.min([50, data_c.shape[1]]),
                            init='random',
                            random_state=0)
                X = model.fit_transform(data_c.T)
            elif mode == 'pca':
                data_c = data_c.T
                if data_c.shape[0] > 5000:
                    # Too many cells for one in-memory PCA fit: fit
                    # incrementally over ~512-row chunks instead.
                    model = IncrementalPCA(n_components=10)
                    for chunk in tqdm(
                            np.array_split(data_c,
                                           data_c.shape[0] // 512,
                                           axis=0),
                            desc='partial fitting over chunks of masked data'):
                        model.partial_fit(chunk)
                    X = model.transform(data_c)
                    print("EV", model.explained_variance_)
                    print("EVR", model.explained_variance_ratio_)
                else:
                    model = PCA(n_components=np.min([10, data_c.shape[0]]),
                                random_state=0)
                    X = model.fit_transform(data_c)
                    print("EV", model.explained_variance_)
                    print("EVR", model.explained_variance_ratio_)

            if max_clusters == 'sqrt_rule':
                print("xshape", X.shape)
            nopath_clustering = _subcluster(X)
            # Extend the label path: parent '0-1' + child 2 -> '0-1-2'.
            fullpath_clustering = [
                '{}-{}'.format(cluster, x) for x in nopath_clustering
            ]
            loom.ca['ClusteringIteration{}'.format(
                subi)][mask] = fullpath_clustering
        # Reassign the attribute to force loompy to flush changes to disk.
        loom.ca['ClusteringIteration{}'.format(subi)] = loom.ca[
            'ClusteringIteration{}'.format(subi)]