Esempio n. 1
0
    def test_options(self):
        """Run Louvain on the karate club graph under several constructor options."""
        graph = karate_club()

        def count_clusters(algorithm):
            # Number of distinct labels the algorithm assigns on the shared graph.
            return len(set(algorithm.fit_transform(graph)))

        # resolution
        self.assertEqual(count_clusters(Louvain(resolution=2)), 7)

        # resolution combined with an aggregation tolerance
        self.assertEqual(
            count_clusters(Louvain(resolution=2, tol_aggregation=0.1)), 12)

        # node shuffling with a fixed seed
        self.assertEqual(
            count_clusters(
                Louvain(resolution=2, shuffle_nodes=True, random_state=42)), 9)

        # aggregate graph: expected to have one row and one column per cluster
        aggregating = Louvain(return_aggregate=True)
        n_clusters = count_clusters(aggregating)
        self.assertEqual(aggregating.adjacency_.shape,
                         (n_clusters, n_clusters))

        # n_aggregations=1 with sort_clusters=False: only checks that fit runs
        Louvain(n_aggregations=1, sort_clusters=False).fit(graph)
Esempio n. 2
0
    def test_options_with_64_bit(self):
        """Repeat the option checks on a karate club graph with int64 index arrays."""
        graph = karate_club()
        # force 64-bit index
        graph.indices = graph.indices.astype(np.int64)
        graph.indptr = graph.indptr.astype(np.int64)

        # (constructor options, expected number of clusters), checked in order:
        # resolution, then tolerance, then shuffling
        expected_counts = (
            ({'resolution': 2}, 7),
            ({'resolution': 2, 'tol_aggregation': 0.1}, 12),
            ({'resolution': 2, 'shuffle_nodes': True, 'random_state': 42}, 9),
        )
        for options, n_expected in expected_counts:
            found = Louvain(**options).fit_transform(graph)
            self.assertEqual(len(set(found)), n_expected)

        # aggregate graph: expected to have one row and one column per cluster
        aggregating = Louvain(return_aggregate=True)
        labels = aggregating.fit_transform(graph)
        n_clusters = len(set(labels))
        self.assertEqual(aggregating.aggregate_.shape,
                         (n_clusters, n_clusters))

        # n_aggregations=1 with sort_clusters=False: only checks that fit runs
        Louvain(n_aggregations=1, sort_clusters=False).fit(graph)

        # the labels of the aggregate run above must be 64-bit
        self.assertEqual(labels.dtype, np.int64)
    def test_modularity(self):
        """Check 'dugue' and 'newman' give identical labels and that 'potts' runs."""
        graph = karate_club()

        dugue_labels = Louvain(modularity='dugue').fit_transform(graph)
        newman_labels = Louvain(modularity='newman').fit_transform(graph)
        same_everywhere = (dugue_labels == newman_labels).all()
        self.assertTrue(same_everywhere)

        # 'potts' is only exercised; its labels are not compared to anything.
        Louvain(modularity='potts').fit_transform(graph)
Esempio n. 4
0
 def test_bilouvain(self):
     """Fitting a biadjacency matrix must match fitting its undirected version."""
     bipartite = star_wars()
     undirected = bipartite2undirected(bipartite)
     algorithm = Louvain(modularity='newman')

     # Reference labels, obtained from the explicit undirected graph.
     expected = algorithm.fit_transform(undirected)

     # Refit on the biadjacency matrix; row labels are stacked before column labels.
     algorithm.fit(bipartite)
     stacked = np.concatenate(
         (algorithm.labels_row_, algorithm.labels_col_))
     self.assertTrue((expected == stacked).all())
Esempio n. 5
0
 def louvain(cls, g, labels):
     """Replace every entry of a label array by the Louvain cluster of its region.

     Args:
         g: graph handed to ``cls.ragToAdjacencyMatrix`` together with the
             ``'similarity'`` key to obtain the matrix that is clustered.
         labels: integer array of region labels. An entry ``v`` is looked up
             at position ``v - 1`` of the cluster vector, so region labels
             presumably start at 1 (confirm against the caller).

     Returns:
         A new array with the shape and dtype of ``labels`` holding, for each
         entry, the cluster label of the corresponding region.
     """
     similarity = cls.ragToAdjacencyMatrix(g, 'similarity')
     clusters = Louvain().fit_transform(similarity)
     # One integer-array lookup replaces the per-element Python loop; this
     # also works for any number of dimensions, not only 3-D arrays.
     return clusters[labels - 1].astype(labels.dtype)
Esempio n. 6
0
def make_structure_louvain_W2V(
    keywords,
    words_vectors,
    tree,
    gismo,
    root=True,
    depth=3,
):
    """
    Recursively fill ``tree`` with a hierarchy of Louvain clusters of keywords.

    Args:
        keywords: the words to cluster at this level; ``keywords[i]`` goes
            with row ``i`` of ``words_vectors``.
        words_vectors: one row per keyword; rows are selected with
            ``words_vectors[indexes, :]`` when recursing.
        tree: the node to fill in. On non-root calls its ``members`` must
            already be set (the parent call does this).
        gismo: object providing ``embedding.query_projection``,
            ``embedding.features`` and ``diteration.y_order``, used to build
            the centroid and the title of each node.
        root: True on the initial call only; initialises ``tree`` itself.
        depth: maximum number of further levels to build.
    Returns:
        None, it fills in the empty node that is given at first recursively
    """

    # At the root, every keyword belongs to the cluster.
    if root:
        tree.members = keywords
        tree.centroid = sum([
            gismo.embedding.query_projection(member)[0]
            for member in tree.members
        ])
        tree.title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in tree.members
        ][:10])

    if depth == 0 or len(tree.members) == 1:
        return None

    # Build the word matrix, then flip it: every stored value v becomes
    # (max - v), so the largest stored value maps to 0.
    # NOTE(review): building_distances_matrix presumably returns pairwise
    # distances, which this turns into similarities - confirm.
    words_adjacency = building_distances_matrix(words_vectors)
    max_vector = np.ones(np.shape(
        words_adjacency.data)) * np.max(words_adjacency)
    words_adjacency.data = max_vector - words_adjacency.data
    # Nothing to cluster when every stored weight is zero.
    if words_adjacency.data.sum() == 0:
        return None

    # Clustering
    louvain = Louvain()
    labels = louvain.fit_transform(words_adjacency)
    labels_unique = np.unique(labels)
    if len(labels_unique) == 1:
        return None

    # There are as many children as clusters; labels are used directly as
    # positions in the lists below.
    children = [Node() for _ in labels_unique]
    children_members_indexes = [[] for _ in children]
    print(labels_unique)
    print(keywords)
    for label in labels_unique:  # fill in the members of each child
        children_members_indexes[label] = np.where(
            labels == label)[0].tolist()
        try:
            words = [
                keywords[word_index]
                for word_index in children_members_indexes[label]
            ]
        except (LookupError, TypeError):
            # keywords and words_vectors are out of step: give up on this node.
            print("plantage avec les mots clef : ", keywords,
                  " et les étiquettes : ", labels_unique)
            return None
        children[label].members = words
        children[label].centroid = sum(
            [gismo.embedding.query_projection(word)[0] for word in words])
        # The title belongs to the child; the parent keeps its own title.
        children[label].title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in words
        ][:10])
    tree.children = children

    # Recurse into each child with the vectors of its own members only.
    for label, child in enumerate(tree.children):
        make_structure_louvain_W2V(
            keywords=child.members,
            words_vectors=words_vectors[children_members_indexes[label], :],
            gismo=gismo,
            tree=child,
            root=False,
            depth=depth - 1)
Esempio n. 7
0
def make_structure_louvain_gismo_embedding(gismo,
                                           tree,
                                           keywords_indexes,
                                           root=True,
                                           depth=3):
    """
    Recursively fill ``tree`` with a hierarchy of Louvain clusters of keywords.

    Args:
        gismo: the gismo built from the dataset; its ``embedding.features``,
            ``embedding.y``, ``embedding.query_projection`` and
            ``diteration.y_order`` are read.
        tree: the node to fill in. On non-root calls its ``members`` must
            already be set (the parent call does this).
        keywords_indexes: positions in ``gismo.embedding.features`` of the
            words to cluster at this level (array or any integer sequence).
        root: True on the initial call only; initialises ``tree`` itself.
        depth: maximum number of further levels to build.
    Returns:
        None, it fills in the empty node that is given at first recursively
    """

    # At the root, every keyword belongs to the cluster.
    if root:
        tree.members = [
            gismo.embedding.features[indice] for indice in keywords_indexes
        ]
        tree.centroid = sum([
            gismo.embedding.query_projection(member)[0]
            for member in tree.members
        ])
        tree.title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in tree.members
        ][:10])

    if depth == 0 or len(tree.members) == 1:
        return None

    # The per-cluster selection below indexes keywords_indexes with the
    # result of np.where, which a plain list does not support.
    keywords_indexes = np.asarray(keywords_indexes)

    # Cluster the members on the cosine similarity of their embedding rows.
    # NOTE: the diagonal (self-similarity) is left in place.
    words_vectors = gismo.embedding.y[keywords_indexes, :]
    words_adjacency = cosine_similarity(words_vectors, dense_output=False)

    # NOTE(review): could be created once before the first call instead of
    # at every recursion level.
    louvain = Louvain()
    labels = louvain.fit_transform(words_adjacency)
    labels_unique = np.unique(labels)

    # There are as many children as clusters; labels are used directly as
    # positions in the list below.
    children = [Node() for _ in range(len(labels_unique))]
    for label in labels_unique:  # fill in the members of each child
        words_indexes = keywords_indexes[np.where(labels == label)]
        words = [
            gismo.embedding.features[word_index]
            for word_index in words_indexes
        ]
        children[label].members = words
        children[label].centroid = sum(
            [gismo.embedding.query_projection(word)[0] for word in words])
        children[label].title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in words
        ][:10])
    tree.children = children

    # Recurse into each child, translating its words back to feature indexes.
    for child in tree.children:
        make_structure_louvain_gismo_embedding(
            gismo,
            child,
            np.array([
                gismo.embedding.features.index(word) for word in child.members
            ]),
            root=False,
            depth=depth - 1)
Esempio n. 8
0
class TestLouvainClustering(unittest.TestCase):
    """Tests for Louvain and BiLouvain with the python engine, and numba when available."""

    def setUp(self):
        """Create default estimators for every engine that can be used."""
        self.louvain = Louvain(engine='python')
        self.bilouvain = BiLouvain(engine='python')
        if not is_numba_available:
            # Without numba, asking for that engine must be rejected.
            with self.assertRaises(ValueError):
                Louvain(engine='numba')
            return
        self.louvain_numba = Louvain(engine='numba')
        self.bilouvain_numba = BiLouvain(engine='numba')

    def test_unknown_types(self):
        """fit must raise TypeError on the default output of sparse.identity."""
        unsupported = sparse.identity(1)
        with self.assertRaises(TypeError):
            self.louvain.fit(unsupported)

    def test_single_node_graph(self):
        """A one-node CSR graph gets the single label 0."""
        single_node = sparse.identity(1, format='csr')
        labels = self.louvain.fit_transform(single_node)
        self.assertEqual(labels, [0])

    def test_simple_graph(self):
        """The undirected version of the simple directed graph yields 10 labels."""
        undirected = directed2undirected(simple_directed_graph())
        self.louvain.fit(undirected)
        self.assertEqual(len(self.louvain.labels_), 10)

    def test_undirected(self):
        """Check label shape and modularity on the karate club graph."""
        graph = karate_club()

        def check(algorithm, expected_modularity):
            # Fit, then verify one label per node and the rounded modularity.
            algorithm.fit(graph)
            found = algorithm.labels_
            self.assertEqual(found.shape, (34, ))
            self.assertAlmostEqual(modularity(graph, found),
                                   expected_modularity, 2)

        check(self.louvain, 0.42)
        if is_numba_available:
            check(self.louvain_numba, 0.42)
        check(Louvain(engine='python', resolution=2), 0.34)

        # With a null resolution all nodes must end up in a single cluster.
        null_resolution = Louvain(engine='python', resolution=0)
        null_resolution.fit(graph)
        self.assertEqual(null_resolution.labels_.shape, (34, ))
        self.assertEqual(len(set(null_resolution.labels_)), 1)

    def test_directed(self):
        """Check Louvain and BiLouvain on the painters graph."""
        graph = painters(return_labels=False)

        self.louvain.fit(graph)
        found = self.louvain.labels_
        self.assertEqual(found.shape, (14, ))
        self.assertAlmostEqual(modularity(graph, found), 0.32, 2)

        # BiLouvain must label every row and every column of the matrix.
        self.bilouvain.fit(graph)
        n_rows, n_cols = graph.shape
        self.assertEqual(self.bilouvain.row_labels_.shape, (n_rows, ))
        self.assertEqual(self.bilouvain.col_labels_.shape, (n_cols, ))

    def test_bipartite(self):
        """BiLouvain labels the 4 rows and 3 columns of the star wars graph."""
        graph = star_wars_villains()
        algorithms = [self.bilouvain]
        if is_numba_available:
            algorithms.append(self.bilouvain_numba)
        for algorithm in algorithms:
            algorithm.fit(graph)
            self.assertEqual(algorithm.row_labels_.shape, (4, ))
            self.assertEqual(algorithm.col_labels_.shape, (3, ))

    def test_shuffling(self):
        """Node 1 of the bow tie graph gets label 1 for both shuffling seeds."""
        graph = bow_tie()
        for seed in (0, 123):
            shuffled = Louvain(engine='python',
                               shuffle_nodes=True,
                               random_state=seed)
            shuffled.fit(graph)
            self.assertEqual(shuffled.labels_[1], 1)
# Wrap the per-organization Katz rank in a DataFrame so it can be joined below.
# (katzrank_org is defined before this excerpt - presumably a Series.)
katzrank_org = katzrank_org.to_frame()

# Mean 'indegreerank' per organization. The raw means are saved to CSV first,
# then replaced by their rank (ascending=False: the largest mean gets rank 1).
indegreerank_org = data.groupby('Organization')['indegreerank'].mean()
indegreerank_org.to_csv('indegreerank_org.csv')
indegreerank_org = indegreerank_org.rank(ascending=False)
indegreerank_org = indegreerank_org.to_frame()

# Combine the three rankings on their shared index and average them row-wise
# into one overall score per organization, written to 'org_rank.csv'.
org_rank = pagerank_org.join(katzrank_org)
org_rank = org_rank.join(indegreerank_org)
org_rank = org_rank.mean(axis=1)
org_rank.to_csv('org_rank.csv')

##### Pattern detection: Louvain clustering of the graph G
adjacency = nx.adjacency_matrix(G)
louvain = Louvain()
labels = louvain.fit_transform(adjacency)
# Distinct cluster labels and the number of nodes carrying each of them.
labels_unique, counts = np.unique(labels, return_counts=True)

# Modularity of the Louvain partition, used as the reference value.
optimal_modularity = modularity(adjacency, labels)

##### Modularity of partitions induced by node attributes
# NOTE(review): this assumes the rows of network_data are in the same order as
# the nodes of G (i.e. the rows of `adjacency`) - confirm.
# NOTE(review): pd.factorize encodes missing values as -1; confirm that
# modularity() handles such a label if the columns can contain NaN.
organization = network_data['Organization']
organization = organization.to_numpy()
# factorize()[0] holds one integer code per row, one code per distinct value.
organization_label = pd.factorize(organization)[0]

organization_modularity = modularity(adjacency, organization_label)

# Same computation for the 'hireable' attribute.
hireable = network_data['hireable']
hireable = hireable.to_numpy()
hireable_label = pd.factorize(hireable)[0]
hireable_modularity = modularity(adjacency, hireable_label)
Esempio n. 10
0
# Dump `adj` to CSV: every top-level item of `adj` becomes one row.
with open(out_csv_path, 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerows(adj)
### ADDITIONAL STEPS

# Square sparse matrix with one row/column per utg, built from triplets:
# adj[0] = row indexes, adj[1] = column indexes, adj[2] = values.
# NOTE(review): the triplets are used as given; nothing here symmetrises
# the matrix - confirm that this is what Louvain should receive.
leng = len(utgs)
network = sp.sparse.csr_matrix((adj[2], (adj[0], adj[1])), shape=(leng, leng))
#print(network.get_shape())

# Modularity optimisation (Louvain) for community detection.
logger.info('Louvain alg with optimization level = ' + str(opt_par))
# The same value is used for both tolerances below.
opt_lev = opt_par  # 0.001
louvain = Louvain(random_state=0,
                  tol_aggregation=opt_lev,
                  tol_optimization=opt_lev)
# `out` holds the label returned for each row of `network`.
out = louvain.fit_transform(network)

# NOTE(review): cu.get_clusters is defined elsewhere; `clusters` is presumably
# a collection with one entry per cluster - confirm. `n_out` is not used here.
clusters, n_out = cu.get_clusters(out)
n_groups = len(clusters)
logger.info('Number of clusters: ' + str(n_groups))

### REPRESENTATIVES CHOICE ###

# Weighted degree of each utg: the value of every triplet is added to both of
# its endpoints. The accumulator is an unsigned 32-bit integer array.
deg = np.zeros(leng, dtype=np.uint32)
for i in range(len(adj[0])):
    deg[adj[0][i]] += adj[2][i]
    deg[adj[1][i]] += adj[2][i]

# Create representatives based on deg
#max_length_grouped = 10000