Ejemplo n.º 1
0
    def compute_rank(self, file_name):
        """Score the destinations with PageRank and write two SVG drawings.

        A square sparse matrix is assembled from ``self.v`` (values),
        ``self.b`` (row indices) and ``self.a`` (column indices).  It is
        multiplied element-wise with its transpose and the product is used as
        the adjacency for both a PageRank graph drawing and a Paris
        dendrogram.

        Parameters
        ----------
        file_name : str
            Path of the SVG file that receives the PageRank drawing.  The
            dendrogram is written alongside it, with ``dento_`` prepended to
            the base name of the file.
        """
        import os

        n_destinations = len(self.destin_idx)
        x = csr_matrix((self.v, (self.b, self.a)),
                       shape=(n_destinations, n_destinations),
                       dtype=float)
        print(x)
        # Element-wise product with the transpose: an entry survives only
        # when both (i, j) and (j, i) are non-zero, so the result is symmetric.
        adjacency = x.multiply(x.transpose())

        pagerank = PageRank()
        scores = pagerank.fit_transform(adjacency)
        image = svg_graph(adjacency,
                          names=self.destin_names,
                          scores=scores,
                          display_node_weight=True,
                          node_order=np.argsort(scores))
        with open(file_name, "w") as text_file:
            print(file_name)
            print(scores)
            text_file.write(image)

        print(self.v)
        print(self.destin_names)

        paris = Paris()
        dendrogram = paris.fit_transform(adjacency)
        image = svg_dendrogram(dendrogram,
                               self.destin_names,
                               n_clusters=5,
                               rotate=True)

        # Prefix the base name only: prefixing the whole string would turn
        # "out/rank.svg" into "dento_out/rank.svg", i.e. another directory.
        directory, base_name = os.path.split(file_name)
        dendrogram_path = os.path.join(directory, "dento_" + base_name)
        with open(dendrogram_path, "w") as text_file:
            text_file.write(image)
Ejemplo n.º 2
0
class TestMetrics(unittest.TestCase):
    """Regression values for the dendrogram quality metrics."""

    def setUp(self):
        """Create the two hierarchical clustering algorithms under test."""
        self.paris = Paris()
        self.louvain_hierarchy = LouvainHierarchy()

    def test_undirected(self):
        """Check cost, score and divergence on undirected graphs."""
        # Cyclic graph on 3 nodes, Paris only.
        graph = cyclic_graph(3)
        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_cost(graph, tree), 2.666, places=2)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.111, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.0632,
                               places=3)
        self.assertAlmostEqual(tree_sampling_divergence(graph,
                                                        tree,
                                                        normalized=False),
                               0.0256,
                               places=3)

        # Test graph, Paris.
        graph = test_graph()
        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_cost(graph, tree), 4.26, places=2)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.573, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.304,
                               places=2)

        # Same graph, Louvain hierarchy.
        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_cost(graph, tree), 4.43, places=2)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.555, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.286,
                               places=2)

    def test_directed(self):
        """Check score and divergence on the directed test graph."""
        graph = test_digraph()

        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.566, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.318,
                               places=2)

        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.55, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.313,
                               places=2)

    def test_disconnected(self):
        """Check score and divergence on the disconnected test graph."""
        graph = test_graph_disconnect()

        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.682, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.464,
                               places=2)

        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.670, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.594,
                               places=2)

    def test_options(self):
        """Check the ``weights`` and ``normalized`` options of the metrics."""
        graph = test_graph()
        tree = self.paris.fit_transform(graph)

        degree_score = dasgupta_score(graph, tree, weights='degree')
        self.assertAlmostEqual(degree_score, 0.573, places=2)

        uniform_tsd = tree_sampling_divergence(graph, tree, weights='uniform')
        self.assertAlmostEqual(uniform_tsd, 0.271, places=2)

        raw_tsd = tree_sampling_divergence(graph, tree, normalized=False)
        self.assertAlmostEqual(raw_tsd, 0.367, places=2)
Ejemplo n.º 3
0
 def setUp(self):
     """Create the Python-engine models, plus numba ones when available.

     Without numba, requesting the numba engine is expected to raise
     ``ValueError``; that expectation is asserted here.
     """
     self.paris = Paris(engine='python')
     self.biparis = BiParis(engine='python')

     if not is_numba_available:
         with self.assertRaises(ValueError):
             Paris(engine='numba')
         return

     self.paris_numba = Paris(engine='numba')
     self.biparis_numba = BiParis(engine='numba')
Ejemplo n.º 4
0
    def fit_paris(self):
        """Fit a scikit-network Paris clusterer on ``self.adj``.

        Paris is a hierarchical method, so the outcome is a dendrogram rather
        than flat clusters; it is stored on ``self.dendrogram`` so that it
        can be cut later.

        See: "Hierarchical Graph Clustering using Node Pair Sampling",
        Bonald et al., https://arxiv.org/abs/1806.01664
        """
        if self.verbose:
            print('fitting PARIS hierarchical clustering')

        model = Paris()
        model.fit(self.adj)
        self.dendrogram = model.dendrogram_
 def _cluster(self):
     """Fit Paris on ``self.knn_graph`` and store the dendrogram.

     When the instance has no ``knn_graph`` attribute yet,
     ``make_distance_matrix`` is called first.
     """
     if not hasattr(self, 'knn_graph'):
         self.make_distance_matrix()
     self.dendrogram = Paris().fit_transform(self.knn_graph)
    def test_aggregation(self):
        """Aggregate the karate-club dendrogram and check the result sizes."""
        self.adjacency = karate_club()
        self.dendrogram = Paris().fit_transform(self.adjacency)

        n_clusters = 5
        aggregated, counts = aggregate_dendrogram(self.dendrogram,
                                                  n_clusters,
                                                  return_counts=True)

        # One row per remaining merge, same number of columns as the input.
        expected_shape = (n_clusters - 1, self.dendrogram.shape[1])
        self.assertEqual(aggregated.shape, expected_shape)

        # The counts must add up to the number of leaves (merges + 1).
        n_leaves = self.dendrogram.shape[0] + 1
        self.assertEqual(counts.sum(), n_leaves)
 def test_undirected(self):
     """Draw the karate-club Paris dendrogram with several option sets."""
     graph = karate_club()
     tree = Paris().fit_transform(graph)

     # Default options.
     self.assertEqual(svg_dendrogram(tree)[1:4], 'svg')

     n = graph.shape[0]
     # Layout options shared by every call below.
     layout = dict(width=200,
                   height=200,
                   margin=10,
                   margin_text=5,
                   scale=3,
                   n_clusters=3,
                   color='green',
                   font_size=14)

     # Colors given as a list, with reordering.
     drawing = svg_dendrogram(tree,
                              names=np.arange(n),
                              colors=['red', 'blue'],
                              reorder=True,
                              rotate=True,
                              **layout)
     self.assertEqual(drawing[1:4], 'svg')

     # Colors given as a dict, without reordering.
     drawing = svg_dendrogram(tree,
                              names=np.arange(n),
                              colors={
                                  0: 'red',
                                  1: 'blue'
                              },
                              reorder=False,
                              rotate=True,
                              **layout)
     self.assertEqual(drawing[1:4], 'svg')

     # Top-down variant with colors given as an array; only checked for
     # running without error.
     svg_dendrogram_top(tree,
                        names=np.arange(n),
                        colors=np.array(['red', 'black', 'blue']),
                        reorder=False,
                        rotate_names=True,
                        line_width=0.1,
                        **layout)
class ParisClusterer(object):
    """Cluster instances by fitting Paris on a nearest-neighbour graph.

    Typical use: ``buildAdjacency()`` -> ``fit()`` -> ``balanced_cut()``;
    the resulting labels are stored on ``labels_``.
    """

    def __init__(self, featureMatrix):
        """Store the feature matrix (one row per instance)."""
        self.featureMatrix = featureMatrix

    def buildAdjacency(self, type='pynndescent', nn=50, metric='dice'):
        """Build the weighted, directed similarity matrix ``self.wdAdj``.

        Parameters
        ----------
        type : str
            Nearest-neighbour backend; only ``'pynndescent'`` is supported.
        nn : int
            Number of neighbours requested from the index.
        metric : str
            Distance metric handed to the index.

        Raises
        ------
        ValueError
            If ``type`` is not a supported backend.
        """
        # Fail early and clearly: an unknown backend used to surface only
        # later, as an unbound-variable error in the loop below.
        if type != 'pynndescent':
            raise ValueError(
                "Unknown nearest neighbor backend: {!r}".format(type))

        print('Building nearest neighbor graph (slowest step)...')
        nn_index = NNDescent(self.featureMatrix,
                             n_neighbors=nn,
                             metric=metric)
        n, d = nn_index.neighbor_graph
        self.n = n
        self.d = d
        print('Done')

        print('Building weighted, directed adjacency matrix...')
        n_instances = self.featureMatrix.shape[0]
        wdAdj = sparse.dok_matrix((n_instances, n_instances), dtype=float)
        for neighbours, distances in tqdm(zip(n, d)):
            # NOTE(review): the first neighbour of each row is taken to be
            # the instance itself -- confirm against the index's output.
            instanceIndex = neighbours[0]
            for neighbourIndex, distance in zip(neighbours[1:], distances[1:]):
                # similarity = 1 - distance
                wdAdj[instanceIndex, neighbourIndex] += 1 - distance
        self.wdAdj = sparse.csr_matrix(wdAdj).astype(float)

    def fit(self):
        """Fit a numba-engine Paris model on ``self.wdAdj``."""
        self.paris = Paris(engine='numba')
        self.paris.fit(self.wdAdj)

    def balanced_cut(self, max_cluster_size):
        """Cut the dendrogram into clusters of at most ``max_cluster_size``.

        Merges are replayed in order.  Whenever a merge would exceed the
        size limit, each child that is itself within the limit is recorded
        as a finished cluster.  Labels are stored on ``self.labels_``;
        nodes that never end up in a finished cluster (for instance when
        the whole tree fits within the limit) keep label 0.

        Parameters
        ----------
        max_cluster_size : int
            Largest number of nodes allowed in one cluster.
        """
        dendrogram = self.paris.dendrogram_
        n_nodes = dendrogram.shape[0] + 1
        labels = np.zeros(n_nodes, dtype=int)
        cluster = {node: [node] for node in range(n_nodes)}
        completed_clusters = []

        for t in range(n_nodes - 1):
            left = cluster.pop(int(dendrogram[t][0]))
            right = cluster.pop(int(dendrogram[t][1]))
            if len(left) + len(right) > max_cluster_size:
                for clust in (left, right):
                    # "<=": a child of exactly the maximum size is a valid
                    # cluster.  A strict "<" left such nodes unlabelled, so
                    # they silently shared label 0 with the first cluster.
                    if len(clust) <= max_cluster_size:
                        completed_clusters.append(clust)
            # Merged nodes are numbered after the leaves, in merge order.
            cluster[n_nodes + t] = left + right

        for count, indices in enumerate(completed_clusters):
            labels[indices] = count

        self.labels_ = labels
Ejemplo n.º 9
0
 def test(self):
     """Check dendrogram shapes for each algorithm on each test input."""
     algorithms = [
         LouvainHierarchy(),
         LouvainHierarchy(resolution=2, depth=1),
         Paris(),
         Paris(weights='uniform', reorder=False),
     ]
     for algo in algorithms:
         for input_matrix in (test_graph(), test_digraph(), test_bigraph()):
             tree = algo.fit_transform(input_matrix)
             n_rows = input_matrix.shape[0]
             self.assertEqual(tree.shape, (n_rows - 1, 4))
             if algo.bipartite:
                 # The full dendrogram covers rows and columns together.
                 n_total = sum(input_matrix.shape)
                 self.assertEqual(algo.dendrogram_full_.shape,
                                  (n_total - 1, 4))
Ejemplo n.º 10
0
class TestMetrics(unittest.TestCase):
    """Regression values for the dendrogram quality metrics."""

    def setUp(self):
        """Create the two hierarchical clustering algorithms under test."""
        self.paris = Paris()
        self.louvain_hierarchy = LouvainHierarchy()

    def test_undirected(self):
        """Check cost, score and divergence on the undirected test graph."""
        graph = test_graph()

        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_cost(graph, tree), 3.98, places=2)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.602, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.450,
                               places=2)

        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_cost(graph, tree), 4.45, places=2)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.555, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.431,
                               places=2)

    def test_directed(self):
        """Check score and divergence on the directed test graph."""
        graph = test_digraph()

        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.586, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.376,
                               places=2)

        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.56, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.357,
                               places=2)

    def test_disconnected(self):
        """Check score and divergence on the disconnected test graph."""
        graph = test_graph_disconnect()

        tree = self.paris.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.752, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.627,
                               places=2)

        tree = self.louvain_hierarchy.fit_transform(graph)
        self.assertAlmostEqual(dasgupta_score(graph, tree), 0.691, places=2)
        self.assertAlmostEqual(tree_sampling_divergence(graph, tree),
                               0.549,
                               places=2)

    def test_options(self):
        """Check the ``weights`` and ``normalized`` options of the metrics."""
        graph = test_graph()
        tree = self.paris.fit_transform(graph)

        degree_score = dasgupta_score(graph, tree, weights='degree')
        self.assertAlmostEqual(degree_score, 0.602, places=2)

        uniform_tsd = tree_sampling_divergence(graph, tree, weights='uniform')
        self.assertAlmostEqual(uniform_tsd, 0.307, places=2)

        raw_tsd = tree_sampling_divergence(graph, tree, normalized=False)
        self.assertAlmostEqual(raw_tsd, 0.545, places=2)
Ejemplo n.º 11
0
 def test_directed(self):
     """Draw the Paris dendrogram of the ``painters(True)`` graph."""
     dataset = painters(True)
     tree = Paris().fit_transform(dataset.adjacency)

     # Default options.
     self.assertEqual(svg_dendrogram(tree)[1:4], 'svg')

     # Explicit names and layout options.
     drawing = svg_dendrogram(tree,
                              names=dataset.names,
                              width=200,
                              height=200,
                              margin=10,
                              margin_text=5,
                              scale=3,
                              n_clusters=3,
                              color='green',
                              font_size=14,
                              reorder=True,
                              rotate=True)
     self.assertEqual(drawing[1:4], 'svg')
Ejemplo n.º 12
0
class TestMetrics(unittest.TestCase):
    """Regression values for the normalized metrics on the karate club."""

    def setUp(self):
        """Create the Paris model and load the karate-club graph."""
        self.paris = Paris()
        self.karate_club_graph = karate_club_graph()

    def test_karate_club_graph(self):
        """Check normalized divergence and Dasgupta cost of the Paris tree."""
        graph = self.karate_club_graph
        tree = self.paris.fit(graph).dendrogram_

        divergence = tree_sampling_divergence(graph, tree, normalized=True)
        self.assertAlmostEqual(divergence, .65, places=2)

        cost = dasgupta_cost(graph, tree, normalized=True)
        self.assertAlmostEqual(cost, .33, places=2)
Ejemplo n.º 13
0
class TestParis(unittest.TestCase):
    """Tests for Paris and BiParis with the python and numba engines."""

    def setUp(self):
        """Create the Python-engine models, plus numba ones when available.

        Without numba, requesting the numba engine is expected to raise
        ``ValueError``; that expectation is asserted here.
        """
        self.paris = Paris(engine='python')
        self.biparis = BiParis(engine='python')

        if not is_numba_available:
            with self.assertRaises(ValueError):
                Paris(engine='numba')
            return

        self.paris_numba = Paris(engine='numba')
        self.biparis_numba = BiParis(engine='numba')

    # noinspection PyTypeChecker
    def test_unknown_types(self):
        """Fitting on ``sparse.identity(1)`` must raise ``TypeError``."""
        with self.assertRaises(TypeError):
            self.paris.fit(sparse.identity(1))

    # noinspection DuplicatedCode
    def test_undirected(self):
        """Check dendrogram size and cut labels on undirected graphs."""
        house_graph = house()
        expected_labels = np.array([0, 0, 1, 1, 0])

        if is_numba_available:
            self.paris_numba.fit(house_graph)
            self.assertEqual(self.paris_numba.dendrogram_.shape[0], 4)
            numba_labels = straight_cut(self.paris_numba.dendrogram_,
                                        sorted_clusters=True)
            self.assertTrue(np.array_equal(numba_labels, expected_labels))

        self.paris.fit(house_graph)
        self.assertEqual(self.paris.dendrogram_.shape[0], 4)
        python_labels = straight_cut(self.paris.dendrogram_,
                                     sorted_clusters=True)
        self.assertTrue(np.array_equal(python_labels, expected_labels))

        karate_club_graph = karate_club()
        self.paris.fit(karate_club_graph)
        self.assertEqual(self.paris.dendrogram_.shape[0], 33)
        # The default cut is expected to give labels 0 and 1 only.
        karate_labels = straight_cut(self.paris.dendrogram_)
        self.assertEqual(np.max(karate_labels), 1)

    def test_bipartite(self):
        """Check the BiParis dendrogram shape on the Star Wars graph."""
        star_wars_graph = star_wars_villains()

        self.biparis.fit(star_wars_graph)
        self.assertEqual(self.biparis.dendrogram_.shape, (6, 4))

        if is_numba_available:
            self.biparis_numba.fit(star_wars_graph)
            self.assertEqual(self.biparis_numba.dendrogram_.shape, (6, 4))
Ejemplo n.º 14
0
 def setUp(self):
     """Create the Paris model and load the two example graphs."""
     # Model under test.
     self.paris = Paris()
     # Fixtures shared by the test methods.
     self.house_graph = house_graph()
     self.karate_club_graph = karate_club_graph()
Ejemplo n.º 15
0
class TestParis(unittest.TestCase):
    """Tests for Paris input validation and clustering results."""

    def setUp(self):
        """Create the Paris model and load the two example graphs."""
        self.paris = Paris()
        self.house_graph = house_graph()
        self.karate_club_graph = karate_club_graph()

    def test_unknown_types(self):
        """Unsupported input or ``node_weights`` must raise ``TypeError``."""
        with self.assertRaises(TypeError):
            self.paris.fit(identity(1))

        csr_identity = identity(2, format='csr')
        with self.assertRaises(TypeError):
            self.paris.fit(csr_identity, node_weights=1)

    def test_unknown_options(self):
        """An unknown ``node_weights`` name must raise ``ValueError``."""
        csr_identity = identity(2, format='csr')
        with self.assertRaises(ValueError):
            self.paris.fit(csr_identity, node_weights='unknown')

    def test_house_graph(self):
        """Check dendrogram size and sorted cut labels on the house graph."""
        self.paris.fit(self.house_graph)
        self.assertEqual(self.paris.dendrogram_.shape[0], 4)

        predicted = self.paris.predict(sorted_clusters=True)
        expected = np.array([0, 0, 1, 1, 0])
        self.assertTrue(np.array_equal(predicted, expected))

    def test_karate_club_graph(self):
        """Check dendrogram size and default cut on the karate-club graph."""
        self.paris.fit(self.karate_club_graph)
        self.assertEqual(self.paris.dendrogram_.shape[0], 33)

        # The default prediction is expected to give labels 0 and 1 only.
        predicted = self.paris.predict()
        self.assertEqual(np.max(predicted), 1)
Ejemplo n.º 16
0
 def test_options(self):
     """Check the dendrogram shape when Paris uses uniform weights."""
     model = Paris(weights='uniform')
     graph = test_graph()
     tree = model.fit_transform(graph)

     # One merge fewer than nodes, four columns per merge.
     expected_shape = (graph.shape[0] - 1, 4)
     self.assertEqual(tree.shape, expected_shape)
Ejemplo n.º 17
0
 def setUp(self):
     """Create the two hierarchical clustering algorithms under test."""
     # Default-configured instances, rebuilt before each test method.
     self.paris = Paris()
     self.louvain_hierarchy = LouvainHierarchy()
Ejemplo n.º 18
0
def find_modules(
    adata,
    n_levels=100,
    level_start=2,
    level_end_size=2,
    n_pcs=100,
    layer=None,
    corr='pearson',
    corr_threshold=0,
    corr_power=2,
    method='complete',
    metric='correlation',
    smallest_module=3,
    key_added=None,
):
    """Build a multi-level hierarchy of variable modules and store it on adata.

    The variables are clustered hierarchically (Paris on a correlation
    graph, or a linkage on the feature matrix), the tree is cut at several
    cluster counts, and the partitions, the dendrogram, the module
    dictionary and the module dependencies are written into ``adata``.
    Nothing is returned.

    Results are stored under ``paris``-prefixed keys whatever ``method``
    is: ``adata.varm['paris_partitions<suffix>']`` and
    ``adata.uns['paris<suffix>']``; with ``method='paris'`` also
    ``adata.varp['paris_corr_raw<suffix>']`` and
    ``adata.varp['paris_corr<suffix>']``.

    Parameters
    ----------
    adata
        Annotated data object (presumably an ``AnnData`` -- it must provide
        ``X``, ``layers``, ``var_names``, ``n_vars``, ``varm``, ``varp`` and
        ``uns``).
    n_levels : int
        Number of cut levels requested; reduced when the requested cluster
        counts are not all distinct.
    level_start : int
        Smallest number of clusters.
    level_end_size : int or float
        The largest number of clusters is ``round(n_vars / level_end_size)``.
    n_pcs : int or None
        When not None, a PCA with this many components is fitted and its
        variable loadings are used as features.  Otherwise ``adata.X`` or
        ``adata.layers[layer]`` is used.
    layer : str or None
        Layer to read when ``n_pcs`` is None.
    corr : {'pearson', 'spearman'}
        Correlation used when ``method='paris'``.
    corr_threshold : float or None
        Correlations below this value are raised to it.
    corr_power : float or None
        Exponent applied to the thresholded correlations.
    method : str
        ``'paris'``, or a method name handed to ``linkage``.
    metric : str
        Metric handed to ``linkage`` (ignored for ``method='paris'``).
    smallest_module : int
        Modules with fewer members than this are removed.
    key_added : str or None
        Optional suffix appended (after an underscore) to the storage keys.

    Raises
    ------
    ValueError
        If ``method='paris'`` and ``corr`` is not a known correlation.
    """
    if n_pcs is not None:
        print('Fitting PCA...')
        X = sc.pp.pca(adata,
                      n_comps=n_pcs,
                      copy=True,
                      use_highly_variable=False).varm['PCs'].T
    else:
        X = adata.X if layer is None else adata.layers[layer]
        # ``.toarray()`` is the portable form of the ``.A`` shortcut, which
        # newer SciPy releases no longer provide.
        X = X.toarray() if sp.sparse.issparse(X) else X

    key_added = '' if key_added is None else '_' + key_added
    uns_key = f'paris{key_added}'
    partitions_key = f'paris_partitions{key_added}'
    # Key handed to the helpers, i.e. the suffix without its underscore.
    paris_key = None if not key_added else key_added[1:]

    if method == 'paris':
        from sknetwork.hierarchy import Paris

        if corr == 'pearson':
            corr_df = np.corrcoef(X, rowvar=False)
        elif corr == 'spearman':
            corr_df = sp.stats.spearmanr(X)[0]
        else:
            raise ValueError('Unknown corr')

        corr_df = pd.DataFrame(corr_df,
                               columns=adata.var_names,
                               index=adata.var_names)

        adata.varp[f'paris_corr_raw{key_added}'] = corr_df.copy()

        if corr_threshold is not None:
            corr_df[corr_df < corr_threshold] = corr_threshold

        if corr_power is not None:
            corr_df = corr_df.pow(corr_power)

        adata.varp[f'paris_corr{key_added}'] = corr_df.values

        print('Fitting the Paris model...')
        # TODO: consider BiParis on the tp10k matrix
        model = Paris()
        corr_mat = sp.sparse.csr_matrix(corr_df.values)
        dendro = model.fit_transform(corr_mat)

    else:
        print('Hierarchical clustering...')
        dendro = linkage(X.T, method=method, metric=metric)

    level_end = round(adata.n_vars / level_end_size)
    n_cl_list = np.linspace(level_start, level_end, n_levels, dtype=int)

    # Integer rounding can repeat cluster counts; keep the first of each.
    if len(set(n_cl_list)) != len(n_cl_list):
        n_cl_list = pd.Series(n_cl_list)
        n_cl_list = n_cl_list[~n_cl_list.duplicated()].tolist()
        print(
            f'Not enough clusters for n_levels={n_levels}, reducing to {len(n_cl_list)}...'
        )

    print('Cutting trees :( ...')
    cl = cut_tree(dendro, n_clusters=n_cl_list)
    dfs = pd.DataFrame(cl.astype(str),
                       index=adata.var_names,
                       columns=[f'level_{i}' for i in range(len(n_cl_list))])
    # Sanity check: each level has exactly the requested number of clusters.
    assert np.all(dfs.nunique().values == n_cl_list)

    for key in dfs.columns:
        dfs[key] = pd.Categorical(dfs[key],
                                  categories=natsorted(np.unique(dfs[key])))

    adata.varm[partitions_key] = dfs
    adata.uns[uns_key] = {
        'dendrogram': dendro,
        # String keys: the unquoted names used before produced keys equal
        # to the parameter *values* (e.g. None, 100, 'pearson').
        'params': {
            'layer': layer,
            'n_levels': n_levels,
            'corr': corr
        },
    }

    _build_module_dict(adata, level=None, paris_key=paris_key)
    module_dict = adata.uns[uns_key]['module_dict']

    print('Removing duplicates and small modules...')
    # remove duplicate modules
    df = pd.DataFrame(module_dict).reset_index()
    df = df.melt(id_vars=['index'], value_name='genes',
                 var_name='level').rename(columns={'index': 'module'})
    df = df[~df.genes.isnull()]
    df['size'] = [len(x) for x in df.genes]

    small_idx = df['size'] < smallest_module
    dups_idx = df.duplicated('genes')

    print(f'{dups_idx.sum()} duplicates found...')
    print(f'{small_idx.sum()} small modules found...')

    to_remove = df[small_idx | dups_idx].reset_index(drop=True)
    for rm in to_remove[['module', 'level']].itertuples():
        del module_dict[rm.level][rm.module]

    # remove empty levels
    empty_levels = [k for k, v in module_dict.items() if len(v) == 0]
    print(f'{len(empty_levels)} empty levels found...')
    partitions = adata.varm[partitions_key]
    adata.varm[partitions_key] = partitions.iloc[
        :, ~partitions.columns.isin(empty_levels)]

    for level in empty_levels:
        del module_dict[level]

    print(f'{len(df[~(small_idx | dups_idx)])} total modules found.')
    print('Calculating module dependencies...')

    deps = _calculate_module_dependencies(adata, paris_key=paris_key)
    adata.uns[uns_key]['module_dependencies'] = deps
 def fit(self):
     """Fit a numba-engine Paris model on ``self.wdAdj``.

     The fitted model is kept on ``self.paris``.
     """
     model = Paris(engine='numba')
     # Expose the model before fitting, as attribute readers may expect.
     self.paris = model
     model.fit(self.wdAdj)
Ejemplo n.º 20
0
 def setUp(self):
     """Fit Paris on the karate-club graph and keep the dendrogram."""
     model = Paris()
     graph = karate_club()
     # Shared fixture read by the test methods.
     self.dendrogram = model.fit_transform(graph)