Beispiel #1
0
class TestMetrics(unittest.TestCase):
    """Regression checks for hierarchy quality metrics.

    Dendrograms are fitted with Paris and LouvainHierarchy and the resulting
    Dasgupta cost/score and tree sampling divergence are compared against
    hard-coded expected values.
    """

    def setUp(self):
        """Create fresh hierarchical clustering estimators for every test."""
        self.paris = Paris()
        self.louvain_hierarchy = LouvainHierarchy()

    def test_undirected(self):
        """Check the metrics on a 3-node cyclic graph and on the test graph."""
        triangle = cyclic_graph(3)
        tree = self.paris.fit_transform(triangle)
        self.assertAlmostEqual(dasgupta_cost(triangle, tree), 2.666, places=2)
        self.assertAlmostEqual(dasgupta_score(triangle, tree), 0.111, places=2)
        divergence = tree_sampling_divergence(triangle, tree)
        self.assertAlmostEqual(divergence, 0.0632, places=3)
        raw_divergence = tree_sampling_divergence(triangle, tree,
                                                  normalized=False)
        self.assertAlmostEqual(raw_divergence, 0.0256, places=3)

        graph = test_graph()

        # Paris hierarchy on the test graph.
        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_cost(graph, tree), 4.26, places=2)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.573, places=2)
        divergence = tree_sampling_divergence(graph, tree)
        self.assertAlmostEqual(divergence, 0.304, places=2)

        # Louvain hierarchy on the same graph.
        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_cost(graph, tree), 4.43, places=2)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.555, places=2)
        divergence = tree_sampling_divergence(graph, tree)
        self.assertAlmostEqual(divergence, 0.286, places=2)

    def test_directed(self):
        """Check score and divergence for both estimators on the test digraph."""
        digraph = test_digraph()

        tree = self.paris.fit_transform(digraph)
        self.assertAlmostEqual(dasgupta_score(digraph, tree), 0.566, places=2)
        divergence = tree_sampling_divergence(digraph, tree)
        self.assertAlmostEqual(divergence, 0.318, places=2)

        tree = self.louvain_hierarchy.fit_transform(digraph)
        self.assertAlmostEqual(dasgupta_score(digraph, tree), 0.55, places=2)
        divergence = tree_sampling_divergence(digraph, tree)
        self.assertAlmostEqual(divergence, 0.313, places=2)

    def test_disconnected(self):
        """Check score and divergence on the disconnected test graph."""
        graph = test_graph_disconnect()

        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.682, places=2)
        divergence = tree_sampling_divergence(graph, tree)
        self.assertAlmostEqual(divergence, 0.464, places=2)

        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.670, places=2)
        divergence = tree_sampling_divergence(graph, tree)
        self.assertAlmostEqual(divergence, 0.594, places=2)

    def test_options(self):
        """Check the ``weights`` and ``normalized`` options of the metrics."""
        graph = test_graph()
        tree = self.paris.fit_transform(graph)

        degree_score = dasgupta_score(graph, tree, weights='degree')
        self.assertAlmostEqual(degree_score, 0.573, places=2)

        uniform_divergence = tree_sampling_divergence(graph, tree,
                                                      weights='uniform')
        self.assertAlmostEqual(uniform_divergence, 0.271, places=2)

        raw_divergence = tree_sampling_divergence(graph, tree,
                                                  normalized=False)
        self.assertAlmostEqual(raw_divergence, 0.367, places=2)
Beispiel #2
0
class TestMetrics(unittest.TestCase):
    """Regression checks for hierarchy quality metrics.

    Every test fits Paris and/or LouvainHierarchy on a reference graph and
    compares the metric values with hard-coded expected numbers (to two
    decimal places).
    """

    def setUp(self):
        """Create fresh hierarchical clustering estimators for every test."""
        self.paris = Paris()
        self.louvain_hierarchy = LouvainHierarchy()

    def test_undirected(self):
        """Check cost, score and divergence on the undirected test graph."""
        graph = test_graph()
        # (estimator, expected cost, expected score, expected divergence)
        expectations = (
            (self.paris, 3.98, 0.602, 0.450),
            (self.louvain_hierarchy, 4.45, 0.555, 0.431),
        )
        for model, cost, score, divergence in expectations:
            tree = model.fit_transform(graph)
            self.assertAlmostEqual(dasgupta_cost(graph, tree), cost, places=2)
            self.assertAlmostEqual(dasgupta_score(graph, tree), score,
                                   places=2)
            self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                                   divergence, places=2)

    def test_directed(self):
        """Check score and divergence on the directed test graph."""
        digraph = test_digraph()
        # (estimator, expected score, expected divergence)
        expectations = (
            (self.paris, 0.586, 0.376),
            (self.louvain_hierarchy, 0.56, 0.357),
        )
        for model, score, divergence in expectations:
            tree = model.fit_transform(digraph)
            self.assertAlmostEqual(dasgupta_score(digraph, tree), score,
                                   places=2)
            self.assertAlmostEqual(tree_sampling_divergence(digraph, tree),
                                   divergence, places=2)

    def test_disconnected(self):
        """Check score and divergence on the disconnected test graph."""
        graph = test_graph_disconnect()
        # (estimator, expected score, expected divergence)
        expectations = (
            (self.paris, 0.752, 0.627),
            (self.louvain_hierarchy, 0.691, 0.549),
        )
        for model, score, divergence in expectations:
            tree = model.fit_transform(graph)
            self.assertAlmostEqual(dasgupta_score(graph, tree), score,
                                   places=2)
            self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                                   divergence, places=2)

    def test_options(self):
        """Check the ``weights`` and ``normalized`` options of the metrics."""
        graph = test_graph()
        tree = self.paris.fit_transform(graph)

        degree_score = dasgupta_score(graph, tree, weights='degree')
        self.assertAlmostEqual(degree_score, 0.602, places=2)

        uniform_divergence = tree_sampling_divergence(graph, tree,
                                                      weights='uniform')
        self.assertAlmostEqual(uniform_divergence, 0.307, places=2)

        raw_divergence = tree_sampling_divergence(graph, tree,
                                                  normalized=False)
        self.assertAlmostEqual(raw_divergence, 0.545, places=2)
Beispiel #3
0
    def compute_rank(self, file_name):
        """Render a PageRank-weighted graph and a Paris dendrogram as SVG files.

        The graph image is written to ``file_name`` and the dendrogram image
        to ``"dento_" + file_name``. Intermediate values are printed to
        stdout along the way.

        Reads ``self.v``, ``self.a``, ``self.b``, ``self.destin_idx`` and
        ``self.destin_names``.

        :param file_name: path of the SVG file for the graph image.
        """
        size = len(self.destin_idx)
        x = csr_matrix((self.v, (self.b, self.a)),
                       shape=(size, size),
                       dtype=float)
        print(x)

        # Element-wise product with the transpose: entry (i, j) stays non-zero
        # only when both (i, j) and (j, i) are non-zero in x.
        adjacency = x.multiply(x.transpose())

        scores = PageRank().fit_transform(adjacency)
        graph_svg = svg_graph(adjacency,
                              names=self.destin_names,
                              scores=scores,
                              display_node_weight=True,
                              node_order=np.argsort(scores))
        with open(file_name, "w") as graph_file:
            print(file_name)
            print(scores)
            graph_file.write(graph_svg)

        print(self.v)
        print(self.destin_names)

        dendrogram = Paris().fit_transform(adjacency)
        dendrogram_svg = svg_dendrogram(dendrogram,
                                        self.destin_names,
                                        n_clusters=5,
                                        rotate=True)
        with open("dento_" + file_name, "w") as dendrogram_file:
            dendrogram_file.write(dendrogram_svg)
 def _cluster(self):
     """Fit a Paris hierarchy on ``self.knn_graph`` and store the dendrogram.

     When the instance has no ``knn_graph`` attribute yet,
     ``make_distance_matrix()`` is called first (presumably it builds the
     graph -- confirm against that method).
     """
     if not hasattr(self, 'knn_graph'):
         self.make_distance_matrix()
     self.dendrogram = Paris().fit_transform(self.knn_graph)
    def test_aggregation(self):
        """Aggregate the karate club dendrogram to 5 clusters and check sizes."""
        self.adjacency = karate_club()
        self.dendrogram = Paris().fit_transform(self.adjacency)

        n_clusters = 5
        aggregated, counts = aggregate_dendrogram(self.dendrogram,
                                                  n_clusters,
                                                  return_counts=True)

        # The aggregated dendrogram has one row per remaining merge and the
        # same number of columns as the full dendrogram.
        expected_shape = (n_clusters - 1, self.dendrogram.shape[1])
        self.assertEqual(aggregated.shape, expected_shape)

        # The counts must add up to the number of dendrogram rows plus one.
        self.assertEqual(counts.sum(), self.dendrogram.shape[0] + 1)
 def test_undirected(self):
     """Render the karate club dendrogram as SVG with various options."""
     adjacency = karate_club()
     dendrogram = Paris().fit_transform(adjacency)

     # Default rendering.
     self.assertEqual(svg_dendrogram(dendrogram)[1:4], 'svg')

     n = adjacency.shape[0]
     # Keyword arguments shared by every customised rendering below.
     layout = dict(width=200,
                   height=200,
                   margin=10,
                   margin_text=5,
                   scale=3,
                   n_clusters=3,
                   color='green',
                   font_size=14)

     # Colors given as a list (with reordering) and as a dict (without).
     variants = (
         (['red', 'blue'], True),
         ({0: 'red', 1: 'blue'}, False),
     )
     for colors, reorder in variants:
         image = svg_dendrogram(dendrogram,
                                names=np.arange(n),
                                colors=colors,
                                reorder=reorder,
                                rotate=True,
                                **layout)
         self.assertEqual(image[1:4], 'svg')

     # Top-down variant with colors as an array; only checks that the call
     # completes, the returned image is not inspected.
     svg_dendrogram_top(dendrogram,
                        names=np.arange(n),
                        colors=np.array(['red', 'black', 'blue']),
                        reorder=False,
                        rotate_names=True,
                        line_width=0.1,
                        **layout)
Beispiel #7
0
 def test_directed(self):
     """Render the dendrogram of the painters graph as SVG."""
     dataset = painters(True)
     adjacency, names = dataset.adjacency, dataset.names
     dendrogram = Paris().fit_transform(adjacency)

     # Default rendering.
     self.assertEqual(svg_dendrogram(dendrogram)[1:4], 'svg')

     # Customised rendering with node names.
     options = {
         'names': names,
         'width': 200,
         'height': 200,
         'margin': 10,
         'margin_text': 5,
         'scale': 3,
         'n_clusters': 3,
         'color': 'green',
         'font_size': 14,
         'reorder': True,
         'rotate': True,
     }
     image = svg_dendrogram(dendrogram, **options)
     self.assertEqual(image[1:4], 'svg')
Beispiel #8
0
 def test_options(self):
     """Check the dendrogram shape produced by Paris with uniform weights."""
     model = Paris(weights='uniform')
     graph = test_graph()
     tree = model.fit_transform(graph)
     # One row per merge (n - 1 for n nodes), four columns per row.
     expected_rows = graph.shape[0] - 1
     self.assertEqual(tree.shape, (expected_rows, 4))
Beispiel #9
0
 def setUp(self):
     """Fit Paris on the karate club graph and keep the dendrogram."""
     self.dendrogram = Paris().fit_transform(karate_club())
Beispiel #10
0
def find_modules(
    adata,
    n_levels=100,
    level_start=2,
    level_end_size=2,
    n_pcs=100,
    layer=None,
    corr='pearson',
    corr_threshold=0,
    corr_power=2,
    method='complete',
    metric='correlation',
    smallest_module=3,
    key_added=None,
):
    """Cluster variables hierarchically and store multi-level modules in ``adata``.

    A dendrogram over the variables is built either with the Paris algorithm
    on a correlation matrix (``method='paris'``) or with ``linkage``. The
    tree is cut at several numbers of clusters, the partitions are stored,
    and duplicate or too-small modules as well as empty levels are removed.

    Results are written to ``adata`` in place (``key_added`` is appended as
    ``_<key_added>`` when given):

    * ``adata.varp['paris_corr_raw...']`` and ``adata.varp['paris_corr...']``
      (only for ``method='paris'``),
    * ``adata.varm['paris_partitions...']``: one categorical column per level,
    * ``adata.uns['paris...']``: ``'dendrogram'``, ``'params'``, the
      ``'module_dict'`` read back after ``_build_module_dict`` and
      ``'module_dependencies'``.

    Parameters
    ----------
    adata
        Annotated data object (presumably an ``AnnData`` -- it is accessed
        through ``X``, ``layers``, ``var_names``, ``n_vars``, ``varp``,
        ``varm`` and ``uns``).
    n_levels
        Number of cut levels requested; reduced when the cluster counts
        are not all distinct.
    level_start
        Number of clusters at the coarsest level.
    level_end_size
        The finest level has ``round(adata.n_vars / level_end_size)`` clusters.
    n_pcs
        If not ``None``, run PCA with this many components and use the
        ``varm['PCs']`` matrix as features; otherwise use ``adata.X`` or
        ``layer``.
    layer
        Layer to use instead of ``adata.X``; only read when ``n_pcs`` is
        ``None``.
    corr
        ``'pearson'`` or ``'spearman'``; only used with ``method='paris'``.
    corr_threshold
        Correlations below this value are raised to it; ``None`` disables.
    corr_power
        Exponent applied element-wise to the correlations; ``None`` disables.
    method
        ``'paris'`` or any method accepted by ``linkage``.
    metric
        Distance metric passed to ``linkage``; unused with ``method='paris'``.
    smallest_module
        Modules with fewer genes than this are removed.
    key_added
        Optional suffix for all keys written to ``adata``.

    Raises
    ------
    ValueError
        If ``method='paris'`` and ``corr`` is not ``'pearson'``/``'spearman'``.
    """

    if n_pcs is not None:
        print('Fitting PCA...')
        X = sc.pp.pca(adata,
                      n_comps=n_pcs,
                      copy=True,
                      use_highly_variable=False).varm['PCs'].T
    else:
        if layer is None:
            X = adata.X
        else:
            X = adata.layers[layer]
        # ``.toarray()`` instead of the ``.A`` shortcut, which recent SciPy
        # releases no longer provide on sparse containers.
        X = X.toarray() if sp.sparse.issparse(X) else X

    key_added = '' if key_added is None else '_' + key_added
    # Key (without the leading underscore) expected by the module helpers.
    paris_key = None if not key_added else key_added[1:]

    if method == 'paris':
        from sknetwork.hierarchy import Paris

        if corr == 'pearson':
            corr_df = np.corrcoef(X, rowvar=False)
        elif corr == 'spearman':
            corr_df = sp.stats.spearmanr(X)[0]
        else:
            raise ValueError('Unknown corr')

        corr_df = pd.DataFrame(corr_df,
                               columns=adata.var_names,
                               index=adata.var_names)

        adata.varp[f'paris_corr_raw{key_added}'] = corr_df.copy()

        if corr_threshold is not None:
            corr_df[corr_df < corr_threshold] = corr_threshold

        if corr_power is not None:
            corr_df = corr_df.pow(corr_power)

        adata.varp[f'paris_corr{key_added}'] = corr_df.values

        print('Fitting the Paris model...')
        # TODO: consider BiParis on the tp10k matrix
        model = Paris()
        corr_mat = sp.sparse.csr_matrix(corr_df.values)
        dendro = model.fit_transform(corr_mat)

    else:
        print('Hierarchical clustering...')
        dendro = linkage(X.T, method=method, metric=metric)

    level_end = round(adata.n_vars / level_end_size)
    n_cl_list = np.linspace(level_start, level_end, n_levels, dtype=int)

    # Integer rounding in linspace can repeat cluster counts; keep each once.
    if len(set(n_cl_list)) != len(n_cl_list):
        n_cl_list = pd.Series(n_cl_list)
        n_cl_list = n_cl_list[~n_cl_list.duplicated()].tolist()
        print(
            f'Not enough clusters for n_levels={n_levels}, reducing to {len(n_cl_list)}...'
        )

    print('Cutting trees :( ...')
    cl = cut_tree(dendro, n_clusters=n_cl_list)
    dfs = pd.DataFrame(cl.astype(str),
                       index=adata.var_names,
                       columns=[f'level_{i}' for i in range(len(n_cl_list))])
    # Sanity check: every level has exactly the requested number of clusters.
    assert np.all(dfs.nunique().values == n_cl_list)

    for key in dfs.columns:
        dfs[key] = pd.Categorical(dfs[key],
                                  categories=natsorted(np.unique(dfs[key])))

    adata.varm[f'paris_partitions{key_added}'] = dfs
    adata.uns[f'paris{key_added}'] = {
        'dendrogram': dendro,
        # String keys: the original used the parameter *values* as keys
        # (e.g. ``{None: None, 100: 100}``), losing the parameter names.
        'params': {
            'layer': layer,
            'n_levels': n_levels,
            'corr': corr
        },
    }

    _build_module_dict(adata, level=None, paris_key=paris_key)

    print('Removing duplicates and small modules...')
    # remove duplicate modules
    df = pd.DataFrame(
        adata.uns[f'paris{key_added}']['module_dict']).reset_index()
    df = df.melt(id_vars=['index'], value_name='genes',
                 var_name='level').rename(columns={'index': 'module'})
    df = df[~df.genes.isnull()]
    df['size'] = [len(x) for x in df.genes]

    small_idx = df['size'] < smallest_module
    dups_idx = df.duplicated('genes')

    print(f'{dups_idx.sum()} duplicates found...')
    print(f'{small_idx.sum()} small modules found...')

    rms = list(df[small_idx
                  | dups_idx].reset_index(drop=True)[['module',
                                                      'level']].itertuples())
    for rm in rms:
        del adata.uns[f'paris{key_added}']['module_dict'][rm.level][rm.module]

    # remove empty levels
    empty_levels = [
        k for k, v in adata.uns[f'paris{key_added}']['module_dict'].items()
        if len(v) == 0
    ]
    print(f'{len(empty_levels)} empty levels found...')
    newp = adata.varm[f'paris_partitions{key_added}'].iloc[:, ~adata.varm[
        f'paris_partitions{key_added}'].columns.isin(empty_levels)]
    adata.varm[f'paris_partitions{key_added}'] = newp

    for level in empty_levels:
        del adata.uns[f'paris{key_added}']['module_dict'][level]

    print(f'{len(df[~(small_idx | dups_idx)])} total modules found.')
    print('Calculating module dependencies...')

    deps = _calculate_module_dependencies(adata, paris_key=paris_key)
    adata.uns[f'paris{key_added}']['module_dependencies'] = deps