def generate_clustering(loom,
                        layername,
                        clustering_depth=3,
                        starting_clustering_depth=0,
                        max_clusters='sqrt_rule',
                        mode='pca',
                        silhouette_threshold=0.1,
                        clusteringcachedir='clusteringcachedir/'):
    """Iteratively cluster cells in ``loom``, one column attribute per round.

    Round ``i`` writes ``loom.ca['ClusteringIteration{i}']``.  Round 0 clusters
    the whole dataset on its precomputed PCA/NMF loadings; each later round
    re-embeds the cells of every cluster found in the previous round and
    sub-clusters them, labelling sub-clusters with '-'-joined paths
    (e.g. ``'0-2-1'``).  Cells left unassigned at a depth carry the label 'U'.

    Parameters
    ----------
    loom : LoomConnection
        Loom file with precomputed loadings in ``loom.ca`` (see Raises).
    layername : str
        Loom layer whose loadings (round 0) and expression (later rounds)
        are used.
    clustering_depth : int
        Number of clustering iterations to compute (Default value = 3).
    starting_clustering_depth : int
        Iteration to resume from; ``ClusteringIteration{d-1}`` must already
        exist in ``loom.ca`` when this is > 0 (Default value = 0).
    max_clusters : int or 'sqrt_rule'
        Upper bound on clusters per subclustering; ``'sqrt_rule'`` uses
        ``floor(sqrt(n_cells))`` (Default value = 'sqrt_rule').
    mode : {'pca', 'nmf'}
        Embedding used for clustering (Default value = 'pca').
    silhouette_threshold : float
        Silhouette-score threshold passed to ``get_subclustering``
        (Default value = 0.1).
    clusteringcachedir : str
        Directory for cached clustering results
        (Default value = 'clusteringcachedir/').

    Returns
    -------
    None
        Results are written in place to ``loom.ca['ClusteringIteration*']``.

    Raises
    ------
    Exception
        If the depth arguments are not natural numbers, if the requested
        starting depth has not been computed yet, or if ``mode`` is not
        one of 'pca'/'nmf'.
    """
    # Argument validation up front; isinstance instead of type(...) != int.
    if not isinstance(clustering_depth,
                      int) or clustering_depth < 1 or not isinstance(
                          starting_clustering_depth, int):
        raise Exception(
            "clustering_depth and starting_clustering_depth must be natural numbers."
        )
    if (starting_clustering_depth > 0) and (
            'ClusteringIteration{}'.format(starting_clustering_depth - 1)
            not in loom.ca.keys()):
        raise Exception(
            "starting_clustering_depth not yet computed; please run with lower starting_clustering depth, or 0"
        )
    if mode not in ['pca', 'nmf']:
        raise Exception("Currently only implemented for modes: pca and nmf")

    # Deferred imports: heavy dependencies only loaded when clustering runs.
    from time import time
    from sklearn.decomposition import IncrementalPCA
    from tqdm import tqdm
    from panopticon.analysis import get_subclustering
    if mode == 'pca':
        from sklearn.decomposition import PCA
    elif mode == 'nmf':
        from sklearn.decomposition import NMF

    def _subcluster(X):
        """Subcluster embedding X, resolving the 'sqrt_rule' cluster cap.

        Single point of truth for the cap resolution that was previously
        duplicated at three call sites.
        """
        if max_clusters == 'sqrt_rule':
            cap = int(np.floor(np.sqrt(X.shape[0])))
        else:
            cap = max_clusters
        return get_subclustering(X,
                                 silhouette_threshold,
                                 max_clusters=cap,
                                 clusteringcachedir=clusteringcachedir)

    if starting_clustering_depth == 0:
        # Top-level clustering over the whole-dataset embedding already
        # stored as per-component column attributes.
        if mode == 'nmf':
            n_nmf_cols = loom.attrs['NumberNMFComponents']
            X = np.vstack([
                loom.ca['{} NMF Loading Component {}'.format(layername, x)]
                for x in range(1, n_nmf_cols + 1)
            ]).T
        elif mode == 'pca':
            n_pca_cols = loom.attrs['NumberPrincipalComponents_{}'.format(
                layername)]
            X = np.vstack([
                loom.ca['{} PC {} Loading'.format(layername, x)]
                for x in range(1, n_pca_cols + 1)
            ]).T
        loom.ca['ClusteringIteration0'] = _subcluster(X)
        starting_clustering_depth = 1

    for subi in range(starting_clustering_depth, clustering_depth):
        parent_key = 'ClusteringIteration{}'.format(subi - 1)
        current_key = 'ClusteringIteration{}'.format(subi)
        # 'U' marks cells not (yet) assigned to a cluster at this depth.
        loom.ca[current_key] = ['U'] * len(loom.ca[parent_key])
        for cluster in {x for x in loom.ca[parent_key] if x != 'U'}:
            # Boolean mask selecting this parent cluster's cells.
            mask = loom.ca[parent_key] == cluster
            start = time()
            data_c = loom[layername][:, mask]
            print("processing cluster", cluster, "; time to load: ",
                  time() - start, ", mask size: ", np.sum(mask))
            if mode == 'nmf':
                # data_c is genes x cells; NMF is fit on cells x genes.
                model = NMF(n_components=np.min([50, data_c.shape[1]]),
                            init='random',
                            random_state=0)
                X = model.fit_transform(data_c.T)
            elif mode == 'pca':
                data_c = data_c.T
                if data_c.shape[0] > 5000:
                    # Too many cells for an in-memory PCA: fit incrementally
                    # over ~512-cell chunks.
                    model = IncrementalPCA(n_components=10)
                    for chunk in tqdm(
                            np.array_split(data_c,
                                           data_c.shape[0] // 512,
                                           axis=0),
                            desc='partial fitting over chunks of masked data'):
                        model.partial_fit(chunk)
                    X = model.transform(data_c)
                    print("EV", model.explained_variance_)
                    print("EVR", model.explained_variance_ratio_)
                else:
                    model = PCA(n_components=np.min([10, data_c.shape[0]]),
                                random_state=0)
                    X = model.fit_transform(data_c)
                    print("EV", model.explained_variance_)
                    print("EVR", model.explained_variance_ratio_)
            if max_clusters == 'sqrt_rule':
                print("xshape", X.shape)
            nopath_clustering = _subcluster(X)
            # Prefix child labels with the parent cluster's full path.
            fullpath_clustering = [
                '{}-{}'.format(cluster, x) for x in nopath_clustering
            ]
            loom.ca[current_key][mask] = fullpath_clustering
        # Re-assign the attribute to itself to force loompy to flush the
        # in-place masked writes to disk.
        loom.ca[current_key] = loom.ca[current_key]