Example no. 1
0
 def test_normalize_transform(self):
     """CLR-transforming a Batch should yield an OTU table that differs
     from the original (i.e. the transform is not a no-op)."""
     # Settings mirroring what the CLI would pass; the file path is
     # normalized to forward slashes so it is OS-agnostic.
     settings = dict(
         biom_file=None,
         cluster=['Affinity'],
         nclust=['4'],
         otu_meta=None,
         otu_table=['otu_bananas.txt'],
         prefix=None,
         sample_data=None,
         split='Rocket Science',
         tax_table=['tax_bananas.txt'],
         name=['test'],
         fp=os.path.dirname(massoc.__file__)[:-7].replace('\\', '/'),
     )
     source = Batch(testbiom, settings)
     transformed = source.normalize_transform(mode="clr")
     # The transformed table must not equal the untouched one.
     self.assertFalse(source.otu['test'] == transformed.otu['test'])
Example no. 2
0
 def generate_cluster_figures(self):
     """Generates figures for diagnostics canvas.

     Loads the biom file currently selected in the GUI, CLR-transforms
     its counts, projects samples to 2D with PCA, runs the user-selected
     clustering algorithm (testing 2-4 clusters where a count is
     required), tags each sample's metadata with its cluster label and
     draws a scatter plot on the preview canvas.

     All errors (including a ValueError raised when the best silhouette
     score is below 0.25) are caught and logged rather than propagated.
     """
     from massoc.scripts.batch import Batch
     from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, AffinityPropagation
     from sklearn.mixture import GaussianMixture
     from sklearn.metrics import silhouette_score
     from sklearn.decomposition import PCA
     nums = list(range(2, 5))  # candidate cluster counts: 2, 3, 4
     try:
         file = self.file_list.GetSelection()
         file = self.file_list.GetString(file)
         x = 'init'
         biomfile = {x: biom.load_table(file)}
         algo = self.cluster_choice.GetSelection()
         algo = self.cluster_choice.GetString(algo)
         inputs = {'biom_file': [file],
                   'cluster': [algo]}
         # CLR-transform counts before ordination/clustering.
         normbatch = Batch(biomfile, inputs)
         normbatch = normbatch.normalize_transform(mode='clr')
         norm_table = normbatch.otu[x]
         topscore = 0
         bestcluster = [1] * len(norm_table.ids())
         data = csr_matrix.todense(norm_table.matrix_data)
         data = np.matrix.transpose(data)  # samples as rows for sklearn
         data = PCA(n_components=2).fit_transform(data)
         # Random 2-way assignment serves as the silhouette baseline at
         # index 0; argmax over sh_score maps index i to (i + 1) clusters.
         randomclust = np.random.randint(2, size=len(data))
         sh_score = [silhouette_score(data, randomclust)]
         # K-means clustering, tests 2-4 clusters
         if inputs['cluster'][0] == 'K-means':
             for i in nums:
                 clusters = KMeans(i).fit_predict(data)
                 silhouette_avg = silhouette_score(data, clusters)
                 sh_score.append(silhouette_avg)
             topscore = int(np.argmax(sh_score) + 1)
             bestcluster = KMeans(topscore).fit_predict(data)
         # DBSCAN clustering, automatically finds optimal cluster size
         if inputs['cluster'][0] == 'DBSCAN':
             bestcluster = DBSCAN().fit_predict(data)
             # -1 marks noise points, so it is not counted as a cluster.
             topscore = len(set(bestcluster)) - (1 if -1 in bestcluster else 0)
         # Gaussian Mixture Model (gmm) probability distribution
         if inputs['cluster'][0] == 'Gaussian':
             for i in nums:
                 fit = GaussianMixture(i).fit(data)
                 clusters = fit.predict(data)
                 silhouette_avg = silhouette_score(data, clusters)
                 sh_score.append(silhouette_avg)
             topscore = int(np.argmax(sh_score) + 1)
             bestfit = GaussianMixture(topscore).fit(data)
             bestcluster = bestfit.predict(data)
         # Spectral Clustering
         if inputs['cluster'][0] == 'Spectral':
             for i in nums:
                 clusters = SpectralClustering(i).fit_predict(data)
                 silhouette_avg = silhouette_score(data, clusters)
                 sh_score.append(silhouette_avg)
             topscore = int(np.argmax(sh_score) + 1)
             bestcluster = SpectralClustering(topscore).fit_predict(data)
         # Affinity Propagation clustering
         # BUG FIX: the original compared the *list* inputs['cluster'] to
         # the string 'Affinity', which is never true, so this branch was
         # unreachable; compare the first element like the other branches.
         if inputs['cluster'][0] == 'Affinity':
             bestcluster = AffinityPropagation().fit_predict(data)
             topscore = len(set(bestcluster)) - (1 if -1 in bestcluster else 0)
         if max(sh_score) < 0.25:
             raise ValueError("Silhouette score too low: please try a different algorithm. "
                              "Your data may not be suitable for clustering.")
         # Write the winning cluster label into each sample's metadata.
         # NOTE(review): relies on biom.Table's private _sample_metadata
         # attribute — confirm it is populated for the loaded table.
         for i in range(topscore):
             mask, = np.where(bestcluster == i)
             for j in mask:
                 norm_table._sample_metadata[j]['cluster'] = inputs['cluster'][0] + '_' + str(i)
         x, y = zip(*data)
         self.prev.scatter(x, y, bestcluster)
         self.canvas1.draw()
     except Exception:
         logger.error("Failed to generate figures. ", exc_info=True)