class TestMetrics(unittest.TestCase):
    """Check hierarchy metrics against reference values.

    Dendrograms are produced by ``Paris`` and ``LouvainHierarchy`` and scored
    with ``dasgupta_cost``, ``dasgupta_score`` and ``tree_sampling_divergence``.
    """

    def setUp(self):
        """Instantiate the two hierarchical clustering estimators."""
        self.paris = Paris()
        self.louvain_hierarchy = LouvainHierarchy()

    def test_undirected(self):
        """Metrics on ``cyclic_graph(3)`` and on ``test_graph()``."""
        adjacency = cyclic_graph(3)
        dendrogram = self.paris.fit_transform(adjacency)
        self.assertAlmostEqual(
            dasgupta_cost(adjacency, dendrogram), 2.666, places=2)
        self.assertAlmostEqual(
            dasgupta_score(adjacency, dendrogram), 0.111, places=2)
        self.assertAlmostEqual(
            tree_sampling_divergence(adjacency, dendrogram), 0.0632, places=3)
        self.assertAlmostEqual(
            tree_sampling_divergence(adjacency, dendrogram, normalized=False),
            0.0256, places=3)

        adjacency = test_graph()
        # (estimator, expected cost, expected score, expected divergence)
        references = (
            (self.paris, 4.26, 0.573, 0.304),
            (self.louvain_hierarchy, 4.43, 0.555, 0.286),
        )
        for algorithm, cost, score, divergence in references:
            dendrogram = algorithm.fit_transform(adjacency)
            self.assertAlmostEqual(
                dasgupta_cost(adjacency, dendrogram), cost, places=2)
            self.assertAlmostEqual(
                dasgupta_score(adjacency, dendrogram), score, places=2)
            self.assertAlmostEqual(
                tree_sampling_divergence(adjacency, dendrogram),
                divergence, places=2)

    def test_directed(self):
        """Score and divergence on ``test_digraph()``."""
        adjacency = test_digraph()
        # (estimator, expected score, expected divergence)
        references = (
            (self.paris, 0.566, 0.318),
            (self.louvain_hierarchy, 0.55, 0.313),
        )
        for algorithm, score, divergence in references:
            dendrogram = algorithm.fit_transform(adjacency)
            self.assertAlmostEqual(
                dasgupta_score(adjacency, dendrogram), score, places=2)
            self.assertAlmostEqual(
                tree_sampling_divergence(adjacency, dendrogram),
                divergence, places=2)

    def test_disconnected(self):
        """Score and divergence on ``test_graph_disconnect()``."""
        adjacency = test_graph_disconnect()
        # (estimator, expected score, expected divergence)
        references = (
            (self.paris, 0.682, 0.464),
            (self.louvain_hierarchy, 0.670, 0.594),
        )
        for algorithm, score, divergence in references:
            dendrogram = algorithm.fit_transform(adjacency)
            self.assertAlmostEqual(
                dasgupta_score(adjacency, dendrogram), score, places=2)
            self.assertAlmostEqual(
                tree_sampling_divergence(adjacency, dendrogram),
                divergence, places=2)

    def test_options(self):
        """Exercise the ``weights`` and ``normalized`` keyword options."""
        adjacency = test_graph()
        dendrogram = self.paris.fit_transform(adjacency)

        degree_score = dasgupta_score(adjacency, dendrogram, weights='degree')
        self.assertAlmostEqual(degree_score, 0.573, places=2)

        uniform_divergence = tree_sampling_divergence(
            adjacency, dendrogram, weights='uniform')
        self.assertAlmostEqual(uniform_divergence, 0.271, places=2)

        raw_divergence = tree_sampling_divergence(
            adjacency, dendrogram, normalized=False)
        self.assertAlmostEqual(raw_divergence, 0.367, places=2)
class TestMetrics(unittest.TestCase):
    """Compare hierarchy metrics with stored reference values.

    ``Paris`` and ``LouvainHierarchy`` dendrograms are evaluated with
    ``dasgupta_cost``, ``dasgupta_score`` and ``tree_sampling_divergence``.
    """

    def setUp(self):
        """Build the estimators shared by all tests."""
        self.paris = Paris()
        self.louvain_hierarchy = LouvainHierarchy()

    def test_undirected(self):
        """Cost, score and divergence on ``test_graph()``."""
        adjacency = test_graph()
        # (estimator, expected cost, expected score, expected divergence)
        references = (
            (self.paris, 3.98, 0.602, 0.450),
            (self.louvain_hierarchy, 4.45, 0.555, 0.431),
        )
        for algorithm, cost, score, divergence in references:
            dendrogram = algorithm.fit_transform(adjacency)
            self.assertAlmostEqual(
                dasgupta_cost(adjacency, dendrogram), cost, places=2)
            self.assertAlmostEqual(
                dasgupta_score(adjacency, dendrogram), score, places=2)
            self.assertAlmostEqual(
                tree_sampling_divergence(adjacency, dendrogram),
                divergence, places=2)

    def test_directed(self):
        """Score and divergence on ``test_digraph()``."""
        adjacency = test_digraph()
        # (estimator, expected score, expected divergence)
        references = (
            (self.paris, 0.586, 0.376),
            (self.louvain_hierarchy, 0.56, 0.357),
        )
        for algorithm, score, divergence in references:
            dendrogram = algorithm.fit_transform(adjacency)
            self.assertAlmostEqual(
                dasgupta_score(adjacency, dendrogram), score, places=2)
            self.assertAlmostEqual(
                tree_sampling_divergence(adjacency, dendrogram),
                divergence, places=2)

    def test_disconnected(self):
        """Score and divergence on ``test_graph_disconnect()``."""
        adjacency = test_graph_disconnect()
        # (estimator, expected score, expected divergence)
        references = (
            (self.paris, 0.752, 0.627),
            (self.louvain_hierarchy, 0.691, 0.549),
        )
        for algorithm, score, divergence in references:
            dendrogram = algorithm.fit_transform(adjacency)
            self.assertAlmostEqual(
                dasgupta_score(adjacency, dendrogram), score, places=2)
            self.assertAlmostEqual(
                tree_sampling_divergence(adjacency, dendrogram),
                divergence, places=2)

    def test_options(self):
        """Exercise the ``weights`` and ``normalized`` keyword options."""
        adjacency = test_graph()
        dendrogram = self.paris.fit_transform(adjacency)

        degree_score = dasgupta_score(adjacency, dendrogram, weights='degree')
        self.assertAlmostEqual(degree_score, 0.602, places=2)

        uniform_divergence = tree_sampling_divergence(
            adjacency, dendrogram, weights='uniform')
        self.assertAlmostEqual(uniform_divergence, 0.307, places=2)

        raw_divergence = tree_sampling_divergence(
            adjacency, dendrogram, normalized=False)
        self.assertAlmostEqual(raw_divergence, 0.545, places=2)
def compute_rank(self, file_name):
    """Rank nodes with PageRank and write two SVG renderings to disk.

    A square sparse matrix is assembled from ``self.v`` (values) placed at
    rows ``self.b`` and columns ``self.a``; its side is
    ``len(self.destin_idx)``. The adjacency used downstream is the
    element-wise product of that matrix with its transpose.

    Two files are written: the scored graph to ``file_name`` and a Paris
    dendrogram to ``"dento_" + file_name``. Several intermediate values are
    printed to stdout.

    :param file_name: path of the graph SVG; also the suffix of the
        dendrogram file name.
    """
    size = len(self.destin_idx)
    x = csr_matrix(
        (self.v, (self.b, self.a)),
        shape=(size, size),
        dtype=float,
    )
    print(x)

    # Element-wise product with the transpose yields a symmetric matrix.
    adjacency = x.multiply(x.transpose())

    scores = PageRank().fit_transform(adjacency)
    graph_svg = svg_graph(
        adjacency,
        names=self.destin_names,
        scores=scores,
        display_node_weight=True,
        node_order=np.argsort(scores),
    )
    with open(file_name, "w") as graph_file:
        print(file_name)
        print(scores)
        graph_file.write(graph_svg)

    print(self.v)
    print(self.destin_names)

    dendrogram = Paris().fit_transform(adjacency)
    dendrogram_svg = svg_dendrogram(
        dendrogram, self.destin_names, n_clusters=5, rotate=True)
    with open("dento_" + file_name, "w") as dendrogram_file:
        dendrogram_file.write(dendrogram_svg)
def _cluster(self):
    """Fit a Paris hierarchy on ``self.knn_graph`` and store the dendrogram.

    When the instance has no ``knn_graph`` attribute yet,
    ``self.make_distance_matrix()`` is called first (presumably it creates
    that attribute; confirm against its definition). The result of
    ``Paris().fit_transform`` is stored in ``self.dendrogram``.
    """
    if not hasattr(self, 'knn_graph'):
        self.make_distance_matrix()

    self.dendrogram = Paris().fit_transform(self.knn_graph)
def test_aggregation(self):
    """Aggregate the karate-club dendrogram and check shape and counts.

    The aggregated dendrogram must have ``n_clusters - 1`` rows and the same
    number of columns as the full one, and the returned counts must sum to
    the number of rows of the full dendrogram plus one.
    """
    n_clusters = 5
    self.adjacency = karate_club()
    self.dendrogram = Paris().fit_transform(self.adjacency)

    aggregated, counts = aggregate_dendrogram(
        self.dendrogram, n_clusters, return_counts=True)

    n_rows, n_columns = self.dendrogram.shape[0], self.dendrogram.shape[1]
    self.assertEqual(aggregated.shape, (n_clusters - 1, n_columns))
    self.assertEqual(counts.sum(), n_rows + 1)
def test_undirected(self):
    """Render the karate-club dendrogram with several option combinations.

    Every ``svg_dendrogram`` output is checked to start with an ``svg`` tag
    (characters 1 to 3). ``svg_dendrogram_top`` is only called, its result is
    not inspected.
    """
    adjacency = karate_club()
    dendrogram = Paris().fit_transform(adjacency)

    image = svg_dendrogram(dendrogram)
    self.assertEqual(image[1:4], 'svg')

    n = adjacency.shape[0]
    # Keyword arguments common to all the customised renderings below.
    layout = dict(width=200, height=200, margin=10, margin_text=5, scale=3,
                  n_clusters=3, color='green', font_size=14)

    # Colors given as a list.
    image = svg_dendrogram(dendrogram, names=np.arange(n),
                           colors=['red', 'blue'],
                           reorder=True, rotate=True, **layout)
    self.assertEqual(image[1:4], 'svg')

    # Colors given as a dict.
    image = svg_dendrogram(dendrogram, names=np.arange(n),
                           colors={0: 'red', 1: 'blue'},
                           reorder=False, rotate=True, **layout)
    self.assertEqual(image[1:4], 'svg')

    # Colors given as a NumPy array.
    svg_dendrogram_top(dendrogram, names=np.arange(n),
                       colors=np.array(['red', 'black', 'blue']),
                       reorder=False, rotate_names=True, line_width=0.1,
                       **layout)
def test_directed(self):
    """Render the ``painters(True)`` dendrogram, with and without options.

    Both outputs are checked to start with an ``svg`` tag (characters 1 to 3).
    """
    graph = painters(True)
    adjacency, names = graph.adjacency, graph.names
    dendrogram = Paris().fit_transform(adjacency)

    # Default rendering.
    image = svg_dendrogram(dendrogram)
    self.assertEqual(image[1:4], 'svg')

    # Customised rendering with node names.
    options = dict(names=names, width=200, height=200, margin=10,
                   margin_text=5, scale=3, n_clusters=3, color='green',
                   font_size=14, reorder=True, rotate=True)
    image = svg_dendrogram(dendrogram, **options)
    self.assertEqual(image[1:4], 'svg')
def test_options(self):
    """Paris with ``weights='uniform'`` yields an ``(n - 1, 4)`` dendrogram."""
    algorithm = Paris(weights='uniform')
    adjacency = test_graph()
    dendrogram = algorithm.fit_transform(adjacency)

    n_nodes = adjacency.shape[0]
    expected_shape = (n_nodes - 1, 4)
    self.assertEqual(dendrogram.shape, expected_shape)
def setUp(self):
    """Fit Paris on the karate-club graph and keep the dendrogram."""
    self.dendrogram = Paris().fit_transform(karate_club())
def find_modules(
    adata,
    n_levels=100,
    level_start=2,
    level_end_size=2,
    n_pcs=100,
    layer=None,
    corr='pearson',
    corr_threshold=0,
    corr_power=2,
    method='complete',
    metric='correlation',
    smallest_module=3,
    key_added=None,
):
    """Build a hierarchy over variables and cut it into nested modules.

    A dendrogram over ``adata.var_names`` is built either with sknetwork's
    ``Paris`` on a correlation matrix (``method='paris'``) or with
    ``linkage`` (any other ``method``). It is then cut at several cluster
    counts; duplicate and too-small modules are dropped. Results are written
    into ``adata`` in place; nothing is returned.

    Written entries (``<sfx>`` is ``''`` or ``'_' + key_added``):

    * ``adata.varp['paris_corr_raw<sfx>']`` and ``adata.varp['paris_corr<sfx>']``
      (only for ``method='paris'``),
    * ``adata.varm['paris_partitions<sfx>']``: one categorical column per level,
    * ``adata.uns['paris<sfx>']``: ``'dendrogram'``, ``'params'`` and
      ``'module_dependencies'``; ``'module_dict'`` is read from this dict after
      ``_build_module_dict`` runs (presumably populated by it; confirm there).

    Parameters
    ----------
    adata
        Annotated data object exposing ``X``, ``layers``, ``var_names``,
        ``n_vars``, ``varp``, ``varm`` and ``uns``.
    n_levels
        Number of cut levels requested; duplicates produced by the integer
        spacing are dropped, so fewer levels may be used.
    level_start
        Number of clusters at the first level.
    level_end_size
        The last level has ``round(adata.n_vars / level_end_size)`` clusters.
    n_pcs
        If not ``None``, PCA is run with this many components and the
        transposed ``'PCs'`` matrix is used as data; otherwise ``adata.X`` or
        the requested ``layer`` is used.
    layer
        Layer to read when ``n_pcs`` is ``None``; ``None`` means ``adata.X``.
    corr
        ``'pearson'`` or ``'spearman'``; only used when ``method='paris'``.
    corr_threshold
        Correlations below this value are raised to it; ``None`` skips the step.
    corr_power
        Exponent applied element-wise to the correlations; ``None`` skips it.
    method
        ``'paris'`` or a method name forwarded to ``linkage``.
    metric
        Metric forwarded to ``linkage`` (ignored when ``method='paris'``).
    smallest_module
        Modules with fewer genes than this are removed.
    key_added
        Optional suffix for the keys written into ``adata``.

    Raises
    ------
    ValueError
        If ``method='paris'`` and ``corr`` is neither ``'pearson'`` nor
        ``'spearman'``.
    """
    if n_pcs is not None:
        print('Fitting PCA...')
        X = sc.pp.pca(adata, n_comps=n_pcs, copy=True,
                      use_highly_variable=False).varm['PCs'].T
    elif layer is None:
        X = adata.X
    else:
        X = adata.layers[layer]
    # ``.toarray()`` replaces the ``.A`` shortcut, which SciPy has deprecated
    # and removed from sparse containers.
    X = X.toarray() if sp.sparse.issparse(X) else X

    key_added = '' if key_added is None else '_' + key_added
    uns_key = f'paris{key_added}'
    partitions_key = f'paris_partitions{key_added}'
    # Helpers take the bare key (without the leading underscore) or None.
    paris_key = None if not key_added else key_added[1:]

    if method == 'paris':
        from sknetwork.hierarchy import Paris

        if corr == 'pearson':
            corr_df = np.corrcoef(X, rowvar=False)
        elif corr == 'spearman':
            corr_df = sp.stats.spearmanr(X)[0]
        else:
            raise ValueError('Unknown corr')
        corr_df = pd.DataFrame(corr_df, columns=adata.var_names,
                               index=adata.var_names)
        adata.varp[f'paris_corr_raw{key_added}'] = corr_df.copy()

        if corr_threshold is not None:
            corr_df[corr_df < corr_threshold] = corr_threshold
        if corr_power is not None:
            corr_df = corr_df.pow(corr_power)
        adata.varp[f'paris_corr{key_added}'] = corr_df.values

        print('Fitting the Paris model...')
        # TODO: consider BiParis on the tp10k matrix
        model = Paris()
        corr_mat = sp.sparse.csr_matrix(corr_df.values)
        dendro = model.fit_transform(corr_mat)
    else:
        print('Hierarchical clustering...')
        dendro = linkage(X.T, method=method, metric=metric)

    level_end = round(adata.n_vars / level_end_size)
    n_cl_list = np.linspace(level_start, level_end, n_levels, dtype=int)
    if len(set(n_cl_list)) != len(n_cl_list):
        # Integer spacing produced repeated cluster counts; keep the first
        # occurrence of each.
        n_cl_list = pd.Series(n_cl_list)
        n_cl_list = n_cl_list[~n_cl_list.duplicated()].tolist()
        print(
            f'Not enough clusters for n_levels={n_levels}, reducing to {len(n_cl_list)}...'
        )

    print('Cutting trees :( ...')
    cl = cut_tree(dendro, n_clusters=n_cl_list)
    dfs = pd.DataFrame(cl.astype(str), index=adata.var_names,
                       columns=[f'level_{i}' for i in range(len(n_cl_list))])
    # Sanity check: each level must contain exactly the requested number of
    # clusters.
    assert np.all(dfs.nunique().values == n_cl_list)
    for key in dfs.columns:
        dfs[key] = pd.Categorical(dfs[key],
                                  categories=natsorted(np.unique(dfs[key])))
    adata.varm[partitions_key] = dfs

    adata.uns[uns_key] = {
        'dendrogram': dendro,
        # Keys must be the parameter *names*; the previous ``{layer: layer,
        # ...}`` form used the argument values (e.g. ``None``) as keys.
        'params': {
            'layer': layer,
            'n_levels': n_levels,
            'corr': corr,
        },
    }
    _build_module_dict(adata, level=None, paris_key=paris_key)

    print('Removing duplicates and small modules...')
    # remove duplicate modules
    df = pd.DataFrame(adata.uns[uns_key]['module_dict']).reset_index()
    df = df.melt(id_vars=['index'], value_name='genes',
                 var_name='level').rename(columns={'index': 'module'})
    df = df[~df.genes.isnull()]
    df['size'] = [len(x) for x in df.genes]
    small_idx = df['size'] < smallest_module
    dups_idx = df.duplicated('genes')
    print(f'{dups_idx.sum()} duplicates found...')
    print(f'{small_idx.sum()} small modules found...')
    to_remove = df[small_idx | dups_idx].reset_index(drop=True)[['module', 'level']]
    for rm in to_remove.itertuples():
        del adata.uns[uns_key]['module_dict'][rm.level][rm.module]

    # remove empty levels
    empty_levels = [
        k for k, v in adata.uns[uns_key]['module_dict'].items()
        if len(v) == 0
    ]
    print(f'{len(empty_levels)} empty levels found...')
    newp = adata.varm[partitions_key].iloc[
        :, ~adata.varm[partitions_key].columns.isin(empty_levels)]
    adata.varm[partitions_key] = newp
    for level_name in empty_levels:
        del adata.uns[uns_key]['module_dict'][level_name]
    print(f'{len(df[~(small_idx | dups_idx)])} total modules found.')

    print('Calculating module dependencies...')
    deps = _calculate_module_dependencies(adata, paris_key=paris_key)
    adata.uns[uns_key]['module_dependencies'] = deps