Exemple #1
0
 def test_bilouvain(self):
     """Newman Louvain on the expanded undirected graph must match BiLouvain.

     Fitting the bipartite graph directly should give the same labels as
     fitting the equivalent undirected expansion, once row and column
     labels are stacked in order.
     """
     biadjacency = star_wars()
     expanded = bipartite2undirected(biadjacency)
     algo = Louvain(modularity='newman')
     labels_direct = algo.fit_transform(expanded)
     algo.fit(biadjacency)
     labels_stacked = np.concatenate((algo.labels_row_, algo.labels_col_))
     self.assertTrue((labels_direct == labels_stacked).all())
Exemple #2
0
 def setUp(self):
     """Build the Louvain/BiLouvain fixtures used by the other tests.

     Creates python-engine instances unconditionally; numba-engine
     instances only when numba is importable.  When numba is absent,
     also checks that requesting the numba engine raises ValueError.
     """
     self.louvain = Louvain(engine='python')
     self.bilouvain = BiLouvain(engine='python')
     if is_numba_available:
         self.louvain_numba = Louvain(engine='numba')
         self.bilouvain_numba = BiLouvain(engine='numba')
     else:
         # Without numba installed, asking for the numba engine must fail.
         with self.assertRaises(ValueError):
             Louvain(engine='numba')
    def test_modularity(self):
        """Dugue and Newman modularity variants agree on the karate club."""
        adjacency = karate_club()
        labels_dugue = Louvain(modularity='dugue').fit_transform(adjacency)
        labels_newman = Louvain(modularity='newman').fit_transform(adjacency)
        self.assertTrue((labels_dugue == labels_newman).all())

        # The Potts variant should at least run without raising.
        Louvain(modularity='potts').fit_transform(adjacency)
Exemple #4
0
 def louvain(cls, g, labels):
     """Cluster the region adjacency graph `g` with Louvain and relabel `labels`.

     Args:
         g: region adjacency graph, converted to an adjacency matrix via
            ``cls.ragToAdjacencyMatrix(g, 'similarity')``.
         labels: integer array of 1-based region ids (the original code
            iterated it as a 3-D volume, but any shape works here).

     Returns:
         Array of the same shape and dtype as ``labels`` where each region
         id is replaced by the Louvain community of its graph node.
     """
     adjacency = cls.ragToAdjacencyMatrix(g, 'similarity')
     node_labels = Louvain().fit_transform(adjacency)
     # Region ids are 1-based, so node k corresponds to label k + 1.
     # A single fancy-indexing lookup replaces the original triple
     # nested Python loop; copy+assign preserves `labels`' exact dtype.
     rep = labels.copy()
     rep[...] = node_labels[labels - 1]
     return rep
Exemple #5
0
 def test_shuffling(self):
     """Shuffling node order with different seeds keeps node 1 in community 1."""
     self.louvain_shuffle_first = Louvain(engine='python',
                                          shuffle_nodes=True,
                                          random_state=0)
     self.louvain_shuffle_second = Louvain(engine='python',
                                           shuffle_nodes=True,
                                           random_state=123)
     self.bow_tie = bow_tie()
     # Both random seeds must assign the same community to node 1.
     for algo in (self.louvain_shuffle_first, self.louvain_shuffle_second):
         algo.fit(self.bow_tie)
         self.assertEqual(algo.labels_[1], 1)
Exemple #6
0
 def test_undirected(self):
     """Louvain on the karate club: shape, modularity, and resolution effects."""
     self.louvain_high_resolution = Louvain(engine='python', resolution=2)
     self.louvain_null_resolution = Louvain(engine='python', resolution=0)
     self.karate_club = karate_club()

     def fit_and_check(algo, expected_modularity):
         # Shared assertions: 34 labels and the expected modularity value.
         algo.fit(self.karate_club)
         found = algo.labels_
         self.assertEqual(found.shape, (34,))
         self.assertAlmostEqual(modularity(self.karate_club, found),
                                expected_modularity, 2)

     fit_and_check(self.louvain, 0.42)
     if is_numba_available:
         fit_and_check(self.louvain_numba, 0.42)
     fit_and_check(self.louvain_high_resolution, 0.34)

     # Resolution 0 collapses everything into a single community.
     self.louvain_null_resolution.fit(self.karate_club)
     labels = self.louvain_null_resolution.labels_
     self.assertEqual(labels.shape, (34,))
     self.assertEqual(len(set(self.louvain_null_resolution.labels_)), 1)
    def test_options(self):
        """Exercise the main Louvain options on the karate club graph."""
        adjacency = karate_club()

        # resolution / tolerance / shuffling: check the cluster count.
        cases = [
            (Louvain(resolution=2), 7),
            (Louvain(resolution=2, tol_aggregation=0.1), 12),
            (Louvain(resolution=2, shuffle_nodes=True, random_state=42), 9),
        ]
        for algo, n_expected in cases:
            self.assertEqual(len(set(algo.fit_transform(adjacency))),
                             n_expected)

        # aggregate graph: its shape equals the number of clusters.
        algo = Louvain(return_aggregate=True)
        n_labels = len(set(algo.fit_transform(adjacency)))
        self.assertEqual(algo.adjacency_.shape, (n_labels, n_labels))

        # single aggregation step, unsorted clusters: must run cleanly.
        Louvain(n_aggregations=1, sort_clusters=False).fit(adjacency)
 def test_disconnected(self):
     """Every node of a disconnected graph still receives a label."""
     adjacency = test_graph_disconnect()
     labels = Louvain().fit_transform(adjacency)
     self.assertEqual(len(labels), adjacency.shape[0])
 def test_invalid(self):
     """An unknown modularity name must raise ValueError at fit time."""
     adjacency = karate_club()
     invalid = Louvain(modularity='toto')
     with self.assertRaises(ValueError):
         invalid.fit(adjacency)
Exemple #10
0
    def test_options_with_64_bit(self):
        """Same option coverage as test_options, but with 64-bit CSR indices."""
        adjacency = karate_club()
        # Force a 64-bit index to exercise the wide-index code path.
        adjacency.indices = adjacency.indices.astype(np.int64)
        adjacency.indptr = adjacency.indptr.astype(np.int64)

        # resolution / tolerance / shuffling: check the cluster count.
        cases = [
            (Louvain(resolution=2), 7),
            (Louvain(resolution=2, tol_aggregation=0.1), 12),
            (Louvain(resolution=2, shuffle_nodes=True, random_state=42), 9),
        ]
        for algo, n_expected in cases:
            self.assertEqual(len(set(algo.fit_transform(adjacency))),
                             n_expected)

        # aggregate graph: its shape equals the number of clusters.
        algo = Louvain(return_aggregate=True)
        labels = algo.fit_transform(adjacency)
        n_labels = len(set(labels))
        self.assertEqual(algo.aggregate_.shape, (n_labels, n_labels))

        # single aggregation step, unsorted clusters: must run cleanly.
        Louvain(n_aggregations=1, sort_clusters=False).fit(adjacency)

        # labels must come back 64-bit when the input index is 64-bit.
        self.assertEqual(labels.dtype, np.int64)
def clustering(adata,
               sequence_coordinates,
               cluster_chains,
               cluster_strength,
               cluster_labels_1,
               n_clusters=None,
               method=None,
               smoothing=False,
               distance='cosine'):
    """Second-stage clustering of averaged trajectory chains.

    Computes pairwise Hausdorff distances between the stage-1 average
    chains, clusters them (Louvain by default, KMeans when ``n_clusters``
    is given with ``method='kmeans'``), propagates the new labels back to
    the original trajectories, and collapses each final cluster into a
    single average trajectory via pairwise DTW alignment.

    Args:
        adata: unused here; kept for interface compatibility with callers.
        sequence_coordinates: array of all original trajectory chains.
        cluster_chains: list/array of stage-1 average chains.
        cluster_strength: per-stage-1-cluster sizes (converted to int).
        cluster_labels_1: stage-1 label of each original trajectory.
        n_clusters: if None, Louvain decides the number of clusters;
            otherwise KMeans with this many clusters is used.
        method: must be 'kmeans' when n_clusters is given.
        smoothing: if True, apply a width-4 moving average to each final
            trajectory (2-D coordinates assumed).
        distance: metric name passed to hausdorff_distance.

    Returns:
        (final_cluster, final_cluster_strength): one averaged trajectory
        per final cluster and the number of original trajectories in it.

    Raises:
        ValueError: if n_clusters is given with an unsupported method.
    """
    # Pairwise Hausdorff distance between stage-1 average chains.
    average_cl_d = np.zeros((len(cluster_chains), len(cluster_chains)))

    for i in tqdm(range(len(average_cl_d))):
        for j in range(len(average_cl_d)):
            average_cl_d[i, j] = hausdorff_distance(cluster_chains[i],
                                                    cluster_chains[j],
                                                    distance=distance)

    # Turn distances into a (row-scaled) affinity matrix for clustering.
    average_cl_d_affinity = -average_cl_d
    average_cl_d_affinity = sparse.csr_matrix(
        minmax_scale(average_cl_d_affinity, axis=1))

    print('Clustering using hausdorff distances')
    if n_clusters is None:
        #cluster_labels = OPTICS(metric="precomputed").fit_predict(average_cl_d)
        #cluster_labels = AffinityPropagation(affinity="precomputed", convergence_iter=100).fit_predict(average_cl_d_affinity)
        cluster_labels = Louvain(
            resolution=0.75).fit_transform(average_cl_d_affinity)
        clusters = np.unique(cluster_labels)
    else:
        if method == 'kmeans':
            cluster_labels = KMeans(
                n_clusters=n_clusters,
                precompute_distances=True).fit_predict(average_cl_d_affinity)
            clusters = np.unique(cluster_labels)
        else:
            # Fail fast: previously any other method left `cluster_labels`
            # undefined and crashed later with a confusing NameError.
            raise ValueError(
                "method must be 'kmeans' when n_clusters is given, got %r"
                % (method,))

    # Propagate the stage-2 label of each stage-1 cluster back to the
    # original trajectories (stage-1 labels are assumed to be 0..k-1).
    all_trajectories_labels = np.zeros((len(cluster_labels_1)))

    for i in range(len(cluster_labels)):
        all_trajectories_labels[np.where(
            cluster_labels_1 == i)] = cluster_labels[i]

    cluster_strength = np.asarray(cluster_strength, dtype=int)

    # Size of each final cluster in terms of original trajectories.
    final_cluster_strength = np.empty([len(clusters)], dtype=int)
    for i in range(len(clusters)):
        final_cluster_strength[i] = len(
            np.where(all_trajectories_labels == i)[0].astype(int))

    print('Forming trajectory by aligning clusters')

    # Pairwise alignment until only 1 avg. trajectory remains per cluster.
    final_cluster = []
    all_chains = sequence_coordinates
    for i in tqdm(range(len(clusters))):
        index_cluster = np.where(all_trajectories_labels == clusters[i])
        aver_tr_cluster = all_chains[index_cluster]
        if len(aver_tr_cluster) > 1:
            average_trajectory = aver_tr_cluster
            while len(average_trajectory) > 1:
                # Pair up the trajectories (last one pairs with its
                # predecessor when the count is odd).
                pair_range = range(len(average_trajectory))
                pairs = []
                for j in range(
                        int((len(average_trajectory) -
                             (len(average_trajectory) % 2)) / 2)):
                    pairs.append(
                        [pair_range[j * 2], pair_range[((j) * 2) + 1]])
                if (len(average_trajectory) % 2) != 0:
                    pairs.append([pair_range[-2], pair_range[-1]])

                average_traj_while = []
                for l in range(len(pairs)):
                    # DTW-align the pair, then average matched points.
                    alligned_trajectory = fastdtw(
                        average_trajectory[pairs[l][0]],
                        average_trajectory[pairs[l][1]])[1]
                    alligned_trajectory = np.asarray(alligned_trajectory)
                    alligned_tr = np.zeros((2, len(average_trajectory[0][0])))
                    alligned_av = np.zeros((len(alligned_trajectory),
                                            len(average_trajectory[0][0])))
                    for n in range(len(alligned_trajectory)):
                        alligned_tr[0, :] = average_trajectory[pairs[l][0]][
                            alligned_trajectory[n, 0]]
                        alligned_tr[1, :] = average_trajectory[pairs[l][1]][
                            alligned_trajectory[n, 1]]
                        alligned_av[n, :] = np.mean(alligned_tr, axis=0)
                    average_traj_while.append(alligned_av)
                average_trajectory = average_traj_while
            average_alligned_tr = average_trajectory[0]
        else:
            average_alligned_tr = aver_tr_cluster[0]

        final_cluster.append(average_alligned_tr)

    # TODO: Moving average concept needs development

    if smoothing:
        # Width-4 moving average on both coordinates via cumulative sums.
        for i in range(len(final_cluster)):
            window_width = 4
            cumsum_vec = np.cumsum(np.insert(final_cluster[i][:, 0], 0, 0))
            ma_vec_Y = (cumsum_vec[window_width:] -
                        cumsum_vec[:-window_width]) / window_width
            cumsum_vec = np.cumsum(np.insert(final_cluster[i][:, 1], 0, 0))
            ma_vec_X = (cumsum_vec[window_width:] -
                        cumsum_vec[:-window_width]) / window_width
            final_cluster[i] = np.zeros((len(ma_vec_X), 2))
            final_cluster[i][:, 0] = ma_vec_Y
            final_cluster[i][:, 1] = ma_vec_X

    return final_cluster, final_cluster_strength
Exemple #12
0
class TestLouvainClustering(unittest.TestCase):
    """Unit tests for Louvain and BiLouvain clustering."""

    def setUp(self):
        """Build python-engine fixtures; numba ones only when available."""
        self.louvain = Louvain(engine='python')
        self.bilouvain = BiLouvain(engine='python')
        if is_numba_available:
            self.louvain_numba = Louvain(engine='numba')
            self.bilouvain_numba = BiLouvain(engine='numba')
        else:
            # Without numba, requesting the numba engine must fail.
            with self.assertRaises(ValueError):
                Louvain(engine='numba')

    def test_unknown_types(self):
        """Non-CSR input (default identity format) raises TypeError."""
        with self.assertRaises(TypeError):
            self.louvain.fit(sparse.identity(1))

    def test_single_node_graph(self):
        """A single-node graph yields the single label 0."""
        self.assertEqual(
            self.louvain.fit_transform(sparse.identity(1, format='csr')), [0])

    def test_simple_graph(self):
        """Each of the 10 nodes of the simple graph receives a label."""
        self.simple_directed_graph = simple_directed_graph()
        self.louvain.fit(directed2undirected(self.simple_directed_graph))
        self.assertEqual(len(self.louvain.labels_), 10)

    def test_undirected(self):
        """Karate club: label shape, modularity, and resolution effects."""
        self.louvain_high_resolution = Louvain(engine='python', resolution=2)
        self.louvain_null_resolution = Louvain(engine='python', resolution=0)
        self.karate_club = karate_club()
        self.louvain.fit(self.karate_club)
        labels = self.louvain.labels_
        self.assertEqual(labels.shape, (34, ))
        self.assertAlmostEqual(modularity(self.karate_club, labels), 0.42, 2)
        if is_numba_available:
            self.louvain_numba.fit(self.karate_club)
            labels = self.louvain_numba.labels_
            self.assertEqual(labels.shape, (34, ))
            self.assertAlmostEqual(modularity(self.karate_club, labels), 0.42,
                                   2)
        # Higher resolution gives more, smaller communities (lower modularity).
        self.louvain_high_resolution.fit(self.karate_club)
        labels = self.louvain_high_resolution.labels_
        self.assertEqual(labels.shape, (34, ))
        self.assertAlmostEqual(modularity(self.karate_club, labels), 0.34, 2)
        # Resolution 0 collapses the graph into a single community.
        self.louvain_null_resolution.fit(self.karate_club)
        labels = self.louvain_null_resolution.labels_
        self.assertEqual(labels.shape, (34, ))
        self.assertEqual(len(set(self.louvain_null_resolution.labels_)), 1)

    def test_directed(self):
        """Louvain and BiLouvain both handle the painters digraph."""
        self.painters = painters(return_labels=False)

        self.louvain.fit(self.painters)
        labels = self.louvain.labels_
        self.assertEqual(labels.shape, (14, ))
        self.assertAlmostEqual(modularity(self.painters, labels), 0.32, 2)

        self.bilouvain.fit(self.painters)
        n1, n2 = self.painters.shape
        row_labels = self.bilouvain.row_labels_
        col_labels = self.bilouvain.col_labels_
        self.assertEqual(row_labels.shape, (n1, ))
        self.assertEqual(col_labels.shape, (n2, ))

    def test_bipartite(self):
        """BiLouvain labels both sides of the star wars bipartite graph."""
        star_wars_graph = star_wars_villains()
        self.bilouvain.fit(star_wars_graph)
        row_labels = self.bilouvain.row_labels_
        col_labels = self.bilouvain.col_labels_
        self.assertEqual(row_labels.shape, (4, ))
        self.assertEqual(col_labels.shape, (3, ))
        if is_numba_available:
            self.bilouvain_numba.fit(star_wars_graph)
            row_labels = self.bilouvain_numba.row_labels_
            col_labels = self.bilouvain_numba.col_labels_
            self.assertEqual(row_labels.shape, (4, ))
            self.assertEqual(col_labels.shape, (3, ))

    def test_shuffling(self):
        """Different shuffle seeds still put node 1 in community 1."""
        self.louvain_shuffle_first = Louvain(engine='python',
                                             shuffle_nodes=True,
                                             random_state=0)
        self.louvain_shuffle_second = Louvain(engine='python',
                                              shuffle_nodes=True,
                                              random_state=123)
        self.bow_tie = bow_tie()
        self.louvain_shuffle_first.fit(self.bow_tie)
        self.assertEqual(self.louvain_shuffle_first.labels_[1], 1)
        self.louvain_shuffle_second.fit(self.bow_tie)
        self.assertEqual(self.louvain_shuffle_second.labels_[1], 1)
# NOTE(review): this fragment relies on names defined earlier in the file
# (katzrank_org, pagerank_org, data, G, network_data) — not visible here.
katzrank_org = katzrank_org.rank(ascending=False)
katzrank_org = katzrank_org.to_frame()

# Mean in-degree rank per organization, exported then ranked descending.
indegreerank_org = data.groupby('Organization')['indegreerank'].mean()
indegreerank_org.to_csv('indegreerank_org.csv')
indegreerank_org = indegreerank_org.rank(ascending=False)
indegreerank_org = indegreerank_org.to_frame()

# Combine the three centrality rankings into one average score per org.
org_rank = pagerank_org.join(katzrank_org)
org_rank = org_rank.join(indegreerank_org)
org_rank = org_rank.mean(axis=1)
org_rank.to_csv('org_rank.csv')

#####pattern detection
# Louvain community detection on the graph's adjacency matrix.
adjacency = nx.adjacency_matrix(G)
louvain = Louvain()
labels = louvain.fit_transform(adjacency)
labels_unique, counts = np.unique(labels, return_counts=True)

optimal_modularity = modularity(adjacency, labels)

#####modularity of the attribute
# Compare Louvain's modularity with the modularity induced by attributes.
organization = network_data['Organization']
organization = organization.to_numpy()
organization_label = pd.factorize(organization)[0]

organization_modularity = modularity(adjacency, organization_label)

hireable = network_data['hireable']
hireable = hireable.to_numpy()
hireable_label = pd.factorize(hireable)[0]
def preclustering(adata,
                  all_seq_cluster,
                  sequence_coordinates,
                  basis="umap",
                  distance='cosine'):
    """Stage-1 clustering of trajectory chains by Hausdorff distance.

    Computes pairwise Hausdorff distances between all chains, clusters
    them with Louvain (resolution 1.25) on a row-scaled affinity matrix,
    and collapses each cluster into one average chain via pairwise DTW
    alignment.

    Args:
        adata: AnnData-like object; only used to select `map_state`
            (which is itself unused below — NOTE(review): dead code?).
        all_seq_cluster: unused here — TODO confirm with callers.
        sequence_coordinates: array of trajectory chains.
        basis: None -> adata.layers['Ms']; list -> subset of that layer;
            otherwise adata.obsm['X_' + basis].
        distance: metric name passed to hausdorff_distance.

    Returns:
        (cluster_chains, cluster_strength, cluster_labels): one averaged
        chain per cluster, the cluster sizes, and the per-chain labels.
    """

    # NOTE(review): `type(basis) == list` would miss list subclasses;
    # isinstance would be the usual idiom — confirm before changing.
    if basis is None:
        map_state = adata.layers['Ms']
    elif type(basis) == list:
        map_state = adata[:, basis].layers['Ms']
    else:
        map_state = adata.obsm['X_' + basis]

    all_chains = sequence_coordinates

    # Calculate hausdorff distance (symmetric, so only the upper
    # triangle is computed and mirrored).
    print('Calculating hausdorff distances')
    distances = np.zeros((len(all_chains), len(all_chains)))
    for i in tqdm(range(len(distances))):
        for j in range(len(distances) - i):
            haus = hausdorff_distance(all_chains[j + i],
                                      all_chains[i],
                                      distance=distance)
            distances[i, j + i] = haus
            distances[j + i, i] = haus

    # Row-scaled affinity: larger = more similar.
    affinity = -distances
    affinity = sparse.csr_matrix(minmax_scale(affinity, axis=1))

    # Perform clustering using hausdorff distance
    print('Clustering using hausdorff distances')

    #cluster_labels = AffinityPropagation(affinity="precomputed").fit_predict(affinity)
    cluster_labels = Louvain(resolution=1.25).fit_transform(affinity)
    clusters = np.unique(cluster_labels)

    # Pairwise alignment of chains within stage 1 clusters using DTW

    cluster_chains = []
    cluster_strength = []

    print('Forming trajectory by aligning clusters')
    # Pairwise allignment until only 1 avg. trajectory remains per cluster
    for i in tqdm(range(len(clusters))):

        index_cluster = np.where(cluster_labels == clusters[i])[0]
        aver_tr_cluster = all_chains[index_cluster]
        cluster_strength.append(len(index_cluster))

        if len(aver_tr_cluster) > 1:
            average_trajectory = aver_tr_cluster

            while len(average_trajectory) > 1:
                # Pair consecutive trajectories; with an odd count the
                # last one is paired with its predecessor.
                pair_range = range(len(average_trajectory))
                pairs = []
                for j in range(
                        int((len(average_trajectory) -
                             (len(average_trajectory) % 2)) / 2)):
                    pairs.append(
                        [pair_range[j * 2], pair_range[((j) * 2) + 1]])
                if (len(average_trajectory) % 2) != 0:
                    pairs.append([pair_range[-2], pair_range[-1]])

                average_traj_while = []
                for l in range(len(pairs)):
                    # DTW-align each pair, then average matched points.
                    alligned_trajectory = fastdtw(
                        average_trajectory[pairs[l][0]],
                        average_trajectory[pairs[l][1]])[1]
                    alligned_trajectory = np.asarray(alligned_trajectory)
                    alligned_tr = np.zeros((2, len(average_trajectory[0][0])))
                    alligned_av = np.zeros((len(alligned_trajectory),
                                            len(average_trajectory[0][0])))

                    for n in range(len(alligned_trajectory)):
                        alligned_tr[0, :] = average_trajectory[pairs[l][0]][
                            alligned_trajectory[n, 0]]
                        alligned_tr[1, :] = average_trajectory[pairs[l][1]][
                            alligned_trajectory[n, 1]]
                        alligned_av[n, :] = np.mean(alligned_tr, axis=0)
                    average_traj_while.append(alligned_av)
                average_trajectory = average_traj_while
            average_alligned_tr = average_trajectory[0]
        else:
            average_alligned_tr = aver_tr_cluster[0]

        cluster_chains.append(average_alligned_tr)

    return cluster_chains, cluster_strength, cluster_labels
Exemple #15
0
def make_structure_louvain_gismo_embedding(gismo,
                                           tree,
                                           keywords_indexes,
                                           root=True,
                                           depth=3):
    """
    Builds a tree structure from the Louvain clustering method.

    Recursively clusters the word embedding vectors with Louvain and
    attaches one child Node per cluster, down to `depth` levels.

    Args:
        gismo: the gismo built from the dataset
        tree: the empty node that will be filled in
        keywords_indexes: indexes of the keywords in the gismo embedding
        root: True on the first call (initialises members/centroid/title)
        depth: remaining recursion depth
    Returns:
        None, it fills in the empty node that is given at first recursively
    """

    # At the root, every word belongs to the cluster.
    if root:
        tree.members = [
            gismo.embedding.features[indice] for indice in keywords_indexes
        ]
        tree.centroid = sum([
            gismo.embedding.query_projection(member)[0]
            for member in tree.members
        ])
        # Title: the 10 highest-ranked member words (diteration order).
        tree.title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in tree.members
        ][:10])

    # Stop at max depth or when the cluster cannot be split further.
    if depth == 0 or len(tree.members) == 1:
        return None

    # Cluster the members on cosine similarity of their embedding rows.
    words_vectors = gismo.embedding.y[keywords_indexes, :]
    words_adjacency = cosine_similarity(words_vectors, dense_output=False)
    # words_adjacency.setdiag(scipy.zeros(len(keywords_indexes)))

    # NOTE(original): could be initialised once before the first call to
    # avoid re-creating it on every recursion?
    louvain = Louvain()
    labels = louvain.fit_transform(words_adjacency)
    labels_unique, counts = np.unique(labels, return_counts=True)

    # There are as many children as clusters.
    children = [Node() for i in range(len(labels_unique))]
    for l in labels_unique:  # fill in the members of each child
        words_indexes = keywords_indexes[np.where(labels == l)]
        words = [
            gismo.embedding.features[word_index]
            for word_index in words_indexes
        ]
        children[l].members = words
        children[l].centroid = sum(
            [gismo.embedding.query_projection(word)[0] for word in words])
        children[l].title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in words
        ][:10])
    tree.children = children

    # Recurse on each child with its own keyword indexes.
    for child in tree.children:
        make_structure_louvain_gismo_embedding(
            gismo,
            child,
            np.array([
                gismo.embedding.features.index(word) for word in child.members
            ]),
            root=False,
            depth=depth - 1)
 def test_modularity(self):
     """Default Louvain reaches modularity ~0.42 on the karate club."""
     adjacency = karate_club()
     labels = Louvain().fit_transform(adjacency)
     self.assertAlmostEqual(modularity(adjacency, labels), 0.42, places=2)
Exemple #17
0
def make_structure_louvain_W2V(
    keywords,
    words_vectors,
    tree,
    gismo,
    root=True,
    depth=3,
):
    """
    Builds a tree structure from the Louvain clustering method.

    Recursively clusters word vectors with Louvain and attaches one
    child Node per cluster, down to `depth` levels.

    Args:
        keywords: the words covered by this (sub)tree
        words_vectors: embedding vectors of those words, row-aligned
        tree: the empty node that will be filled in
        gismo: gismo object used for centroids and titles
        root: True on the first call (initialises members/centroid/title)
        depth: remaining recursion depth
    Returns:
        None, it fills in the empty node that is given at first recursively
    """

    # At the root, every word belongs to the cluster.
    if root:
        tree.members = keywords
        tree.centroid = sum([
            gismo.embedding.query_projection(member)[0]
            for member in tree.members
        ])
        tree.title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in tree.members
        ][:10])

    # Stop at max depth or when the cluster cannot be split further.
    if depth == 0 or len(tree.members) == 1:
        return None

    # Build the word adjacency matrix from pairwise distances, then
    # flip it into a similarity (max distance - distance).
    # words_adjacency = cosine_similarity(words_vectors, dense_output = False)
    words_adjacency = building_distances_matrix(words_vectors)
    max_vector = np.ones(np.shape(
        words_adjacency.data)) * np.max(words_adjacency)
    words_adjacency.data = max_vector - words_adjacency.data
    if sum([i for i in words_adjacency.data]) == 0:
        return None

    # Clustering
    louvain = Louvain()
    labels = louvain.fit_transform(words_adjacency)
    labels_unique, counts = np.unique(labels, return_counts=True)
    if len(labels_unique) == 1:
        return None

    # There are as many children as clusters.
    children = [Node() for l in labels_unique]
    children_members_indexes = [[] for child in children]
    print(labels_unique)
    print(keywords)
    for l in labels_unique:  # fill in the members of each child
        children_members_indexes[l] = np.where(labels == l)[0].tolist()
        try:
            words = [
                keywords[word_index]
                for word_index in children_members_indexes[l]
            ]
        except IndexError:
            # Narrowed from a bare `except:` — only indexing can fail here,
            # and a bare except also swallowed KeyboardInterrupt.
            print("plantage avec les mots clef : ", keywords,
                  " et les étiquettes : ", labels_unique)
            return None
        children[l].members = words
        children[l].centroid = sum(
            [gismo.embedding.query_projection(word)[0] for word in words])
        # Bug fix: this previously assigned `tree.title` on every pass,
        # leaving children untitled; the sibling gismo-embedding builder
        # sets the child's title here.
        children[l].title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in words
        ][:10])
    tree.children = children

    # Recurse on each child with the matching vector rows.
    for (l, child) in enumerate(tree.children):
        make_structure_louvain_W2V(
            keywords=child.members,
            words_vectors=words_vectors[
                children_members_indexes[l], :],
            gismo=gismo,
            tree=child,
            root=False,
            depth=depth - 1)
Exemple #18
0
# Build a sparse matrix for clustering ("creo matrice sparse per clustering").
### EXTRA: debug dump of the raw adjacency triples to CSV
with open(out_csv_path, 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerows(adj)
### EXTRA

# adj is (rows, cols, weights); build the symmetric-looking CSR graph.
leng = len(utgs)
network = sp.sparse.csr_matrix((adj[2], (adj[0], adj[1])), shape=(leng, leng))
#print(network.get_shape())

# modularity opt for community detection
logger.info('Louvain alg with optimization level = ' + str(opt_par))
opt_lev = opt_par  # 0.001
louvain = Louvain(random_state=0,
                  tol_aggregation=opt_lev,
                  tol_optimization=opt_lev)
out = louvain.fit_transform(network)

clusters, n_out = cu.get_clusters(out)
n_groups = len(clusters)
logger.info('Number of clusters: ' + str(n_groups))

### REPRESENTATIVES CHOICE ###

# Evaluate the degree of each utg: each edge contributes its weight
# to both endpoints.
deg = np.zeros(leng, dtype=np.uint32)
for i in range(len(adj[0])):
    deg[adj[0][i]] += adj[2][i]
    deg[adj[1][i]] += adj[2][i]