def test_bilouvain(self):
    biadjacency = star_wars()
    adjacency = bipartite2undirected(biadjacency)
    louvain = Louvain(modularity='newman')
    labels1 = louvain.fit_transform(adjacency)
    louvain.fit(biadjacency)
    labels2 = np.concatenate((louvain.labels_row_, louvain.labels_col_))
    self.assertTrue((labels1 == labels2).all())

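# For reference, a minimal sketch of the block structure that
# bipartite2undirected builds from a biadjacency matrix (the sknetwork helper
# returns the same [[0, B], [B.T, 0]] layout as scipy's bmat below; the toy
# biadjacency here is hypothetical).
import numpy as np
from scipy import sparse

biadjacency = sparse.csr_matrix(np.array([[1, 0], [1, 1], [0, 1]]))
adjacency = sparse.bmat([[None, biadjacency], [biadjacency.T, None]],
                        format='csr')
print(adjacency.toarray())  # 5x5 symmetric matrix: 3 row-nodes + 2 col-nodes
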
def test_modularity(self):
    adjacency = karate_club()
    louvain_d = Louvain(modularity='dugue')
    louvain_n = Louvain(modularity='newman')
    labels_d = louvain_d.fit_transform(adjacency)
    labels_n = louvain_n.fit_transform(adjacency)
    self.assertTrue((labels_d == labels_n).all())
    louvain_p = Louvain(modularity='potts')
    louvain_p.fit_transform(adjacency)

def louvain(cls, g, labels):
    x, y, z = labels.shape
    t = cls.ragToAdjacencyMatrix(g, 'similarity')
    louvain = Louvain()
    cluster_labels = louvain.fit_transform(t)
    rep = labels.copy()
    # map each (1-indexed) region label to its Louvain cluster label
    for k in range(x):
        for j in range(y):
            for i in range(z):
                rep[k, j, i] = cluster_labels[labels[k, j, i] - 1]
    return rep

def test_options(self):
    adjacency = karate_club()
    # resolution
    louvain = Louvain(resolution=2)
    labels = louvain.fit_transform(adjacency)
    self.assertEqual(len(set(labels)), 7)
    # tolerance
    louvain = Louvain(resolution=2, tol_aggregation=0.1)
    labels = louvain.fit_transform(adjacency)
    self.assertEqual(len(set(labels)), 12)
    # shuffling
    louvain = Louvain(resolution=2, shuffle_nodes=True, random_state=42)
    labels = louvain.fit_transform(adjacency)
    self.assertEqual(len(set(labels)), 9)
    # aggregate graph
    louvain = Louvain(return_aggregate=True)
    labels = louvain.fit_transform(adjacency)
    n_labels = len(set(labels))
    self.assertEqual(louvain.adjacency_.shape, (n_labels, n_labels))
    # number of aggregations and cluster sorting
    Louvain(n_aggregations=1, sort_clusters=False).fit(adjacency)

def test_disconnected(self):
    adjacency = test_graph_disconnect()
    n = adjacency.shape[0]
    labels = Louvain().fit_transform(adjacency)
    self.assertEqual(len(labels), n)

def test_invalid(self):
    adjacency = karate_club()
    louvain = Louvain(modularity='toto')
    with self.assertRaises(ValueError):
        louvain.fit(adjacency)

def test_options_with_64_bit(self):
    adjacency = karate_club()
    # force 64-bit indices
    adjacency.indices = adjacency.indices.astype(np.int64)
    adjacency.indptr = adjacency.indptr.astype(np.int64)
    # resolution
    louvain = Louvain(resolution=2)
    labels = louvain.fit_transform(adjacency)
    self.assertEqual(len(set(labels)), 7)
    # tolerance
    louvain = Louvain(resolution=2, tol_aggregation=0.1)
    labels = louvain.fit_transform(adjacency)
    self.assertEqual(len(set(labels)), 12)
    # shuffling
    louvain = Louvain(resolution=2, shuffle_nodes=True, random_state=42)
    labels = louvain.fit_transform(adjacency)
    self.assertEqual(len(set(labels)), 9)
    # aggregate graph
    louvain = Louvain(return_aggregate=True)
    labels = louvain.fit_transform(adjacency)
    n_labels = len(set(labels))
    self.assertEqual(louvain.aggregate_.shape, (n_labels, n_labels))
    # number of aggregations and cluster sorting
    Louvain(n_aggregations=1, sort_clusters=False).fit(adjacency)
    # check that the labels are 64-bit
    self.assertEqual(labels.dtype, np.int64)

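# A minimal, self-contained sketch of the Louvain API exercised by the tests
# above (assumes scikit-network; the toy graph is hypothetical, and attribute
# names such as aggregate_ vs adjacency_ vary across library versions).
import numpy as np
from scipy import sparse
from sknetwork.clustering import Louvain

# two triangles joined by a single bridge edge
edges = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]
row, col = zip(*edges)
adjacency = sparse.csr_matrix((np.ones(len(edges)), (row, col)), shape=(6, 6))
adjacency = adjacency + adjacency.T  # symmetrize

labels = Louvain(random_state=0).fit_transform(adjacency)
print(labels)  # expected: one label per triangle, e.g. [0 0 0 1 1 1]
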
def clustering(adata,
               sequence_coordinates,
               cluster_chains,
               cluster_strength,
               cluster_labels_1,
               n_clusters=None,
               method=None,
               smoothing=False,
               distance='cosine'):
    average_cl_d = np.zeros((len(cluster_chains), len(cluster_chains)))
    for i in tqdm(range(len(average_cl_d))):
        for j in range(len(average_cl_d)):
            average_cl_d[i, j] = hausdorff_distance(cluster_chains[i],
                                                    cluster_chains[j],
                                                    distance=distance)
    average_cl_d_affinity = -average_cl_d
    average_cl_d_affinity = sparse.csr_matrix(
        minmax_scale(average_cl_d_affinity, axis=1))
    print('Clustering using hausdorff distances')
    if n_clusters is None:
        # cluster_labels = OPTICS(metric="precomputed").fit_predict(average_cl_d)
        # cluster_labels = AffinityPropagation(affinity="precomputed", convergence_iter=100).fit_predict(average_cl_d_affinity)
        cluster_labels = Louvain(
            resolution=0.75).fit_transform(average_cl_d_affinity)
        clusters = np.unique(cluster_labels)
    else:
        if method == 'kmeans':
            cluster_labels = KMeans(
                n_clusters=n_clusters,
                precompute_distances=True).fit_predict(average_cl_d_affinity)
            clusters = np.unique(cluster_labels)
    all_trajectories_labels = np.zeros((len(cluster_labels_1)))
    for i in range(len(cluster_labels)):
        all_trajectories_labels[np.where(
            cluster_labels_1 == i)] = cluster_labels[i]
    cluster_strength = np.asarray(cluster_strength, dtype=int)
    final_cluster_strength = np.empty([len(clusters)], dtype=int)
    for i in range(len(clusters)):
        final_cluster_strength[i] = len(
            np.where(all_trajectories_labels == i)[0].astype(int))
    print('Forming trajectory by aligning clusters')
    # pairwise alignment until only 1 avg. trajectory remains per cluster
    final_cluster = []
    all_chains = sequence_coordinates
    for i in tqdm(range(len(clusters))):
        index_cluster = np.where(all_trajectories_labels == clusters[i])
        aver_tr_cluster = all_chains[index_cluster]
        if len(aver_tr_cluster) > 1:
            average_trajectory = aver_tr_cluster
            while len(average_trajectory) > 1:
                pair_range = range(len(average_trajectory))
                pairs = []
                for j in range(
                        int((len(average_trajectory) -
                             (len(average_trajectory) % 2)) / 2)):
                    pairs.append([pair_range[j * 2], pair_range[j * 2 + 1]])
                if (len(average_trajectory) % 2) != 0:
                    pairs.append([pair_range[-2], pair_range[-1]])
                average_traj_while = []
                for l in range(len(pairs)):
                    aligned_trajectory = fastdtw(
                        average_trajectory[pairs[l][0]],
                        average_trajectory[pairs[l][1]])[1]
                    aligned_trajectory = np.asarray(aligned_trajectory)
                    aligned_tr = np.zeros((2, len(average_trajectory[0][0])))
                    aligned_av = np.zeros((len(aligned_trajectory),
                                           len(average_trajectory[0][0])))
                    for n in range(len(aligned_trajectory)):
                        aligned_tr[0, :] = average_trajectory[pairs[l][0]][
                            aligned_trajectory[n, 0]]
                        aligned_tr[1, :] = average_trajectory[pairs[l][1]][
                            aligned_trajectory[n, 1]]
                        aligned_av[n, :] = np.mean(aligned_tr, axis=0)
                    average_traj_while.append(aligned_av)
                average_trajectory = average_traj_while
            average_aligned_tr = average_trajectory[0]
        else:
            average_aligned_tr = aver_tr_cluster[0]
        final_cluster.append(average_aligned_tr)
    # TODO: Moving average concept needs development
    if smoothing:
        for i in range(len(final_cluster)):
            window_width = 4
            cumsum_vec = np.cumsum(np.insert(final_cluster[i][:, 0], 0, 0))
            ma_vec_Y = (cumsum_vec[window_width:] -
                        cumsum_vec[:-window_width]) / window_width
            cumsum_vec = np.cumsum(np.insert(final_cluster[i][:, 1], 0, 0))
            ma_vec_X = (cumsum_vec[window_width:] -
                        cumsum_vec[:-window_width]) / window_width
            final_cluster[i] = np.zeros((len(ma_vec_X), 2))
            final_cluster[i][:, 0] = ma_vec_Y
            final_cluster[i][:, 1] = ma_vec_X
    return final_cluster, final_cluster_strength

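# Toy sketch of the distance -> affinity -> Louvain pattern used by
# clustering() above, with scipy's directed_hausdorff standing in for the
# hausdorff_distance helper (the chains below are hypothetical).
import numpy as np
from scipy import sparse
from scipy.spatial.distance import directed_hausdorff
from sklearn.preprocessing import minmax_scale
from sknetwork.clustering import Louvain

rng = np.random.default_rng(0)
chains = [rng.normal(loc=c, size=(20, 2)) for c in (0.0, 0.1, 5.0, 5.1)]

# symmetric pairwise Hausdorff distance matrix
n = len(chains)
d = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        d[i, j] = max(directed_hausdorff(chains[i], chains[j])[0],
                      directed_hausdorff(chains[j], chains[i])[0])

# negate and min-max scale each row so larger values mean "more similar"
affinity = sparse.csr_matrix(minmax_scale(-d, axis=1))
labels = Louvain(resolution=0.75).fit_transform(affinity)
print(labels)  # expected: the two nearby chains share a label
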
class TestLouvainClustering(unittest.TestCase):

    def setUp(self):
        self.louvain = Louvain(engine='python')
        self.bilouvain = BiLouvain(engine='python')
        if is_numba_available:
            self.louvain_numba = Louvain(engine='numba')
            self.bilouvain_numba = BiLouvain(engine='numba')
        else:
            with self.assertRaises(ValueError):
                Louvain(engine='numba')

    def test_unknown_types(self):
        with self.assertRaises(TypeError):
            self.louvain.fit(sparse.identity(1))

    def test_single_node_graph(self):
        self.assertEqual(
            self.louvain.fit_transform(sparse.identity(1, format='csr')), [0])

    def test_simple_graph(self):
        self.simple_directed_graph = simple_directed_graph()
        self.louvain.fit(directed2undirected(self.simple_directed_graph))
        self.assertEqual(len(self.louvain.labels_), 10)

    def test_undirected(self):
        self.louvain_high_resolution = Louvain(engine='python', resolution=2)
        self.louvain_null_resolution = Louvain(engine='python', resolution=0)
        self.karate_club = karate_club()
        self.louvain.fit(self.karate_club)
        labels = self.louvain.labels_
        self.assertEqual(labels.shape, (34,))
        self.assertAlmostEqual(modularity(self.karate_club, labels), 0.42, 2)
        if is_numba_available:
            self.louvain_numba.fit(self.karate_club)
            labels = self.louvain_numba.labels_
            self.assertEqual(labels.shape, (34,))
            self.assertAlmostEqual(modularity(self.karate_club, labels),
                                   0.42, 2)
        self.louvain_high_resolution.fit(self.karate_club)
        labels = self.louvain_high_resolution.labels_
        self.assertEqual(labels.shape, (34,))
        self.assertAlmostEqual(modularity(self.karate_club, labels), 0.34, 2)
        self.louvain_null_resolution.fit(self.karate_club)
        labels = self.louvain_null_resolution.labels_
        self.assertEqual(labels.shape, (34,))
        self.assertEqual(len(set(self.louvain_null_resolution.labels_)), 1)

    def test_directed(self):
        self.painters = painters(return_labels=False)
        self.louvain.fit(self.painters)
        labels = self.louvain.labels_
        self.assertEqual(labels.shape, (14,))
        self.assertAlmostEqual(modularity(self.painters, labels), 0.32, 2)
        self.bilouvain.fit(self.painters)
        n1, n2 = self.painters.shape
        row_labels = self.bilouvain.row_labels_
        col_labels = self.bilouvain.col_labels_
        self.assertEqual(row_labels.shape, (n1,))
        self.assertEqual(col_labels.shape, (n2,))

    def test_bipartite(self):
        star_wars_graph = star_wars_villains()
        self.bilouvain.fit(star_wars_graph)
        row_labels = self.bilouvain.row_labels_
        col_labels = self.bilouvain.col_labels_
        self.assertEqual(row_labels.shape, (4,))
        self.assertEqual(col_labels.shape, (3,))
        if is_numba_available:
            self.bilouvain_numba.fit(star_wars_graph)
            row_labels = self.bilouvain_numba.row_labels_
            col_labels = self.bilouvain_numba.col_labels_
            self.assertEqual(row_labels.shape, (4,))
            self.assertEqual(col_labels.shape, (3,))

    def test_shuffling(self):
        self.louvain_shuffle_first = Louvain(engine='python',
                                             shuffle_nodes=True,
                                             random_state=0)
        self.louvain_shuffle_second = Louvain(engine='python',
                                              shuffle_nodes=True,
                                              random_state=123)
        self.bow_tie = bow_tie()
        self.louvain_shuffle_first.fit(self.bow_tie)
        self.assertEqual(self.louvain_shuffle_first.labels_[1], 1)
        self.louvain_shuffle_second.fit(self.bow_tie)
        self.assertEqual(self.louvain_shuffle_second.labels_[1], 1)

katzrank_org = katzrank_org.rank(ascending=False)
katzrank_org = katzrank_org.to_frame()
indegreerank_org = data.groupby('Organization')['indegreerank'].mean()
indegreerank_org.to_csv('indegreerank_org.csv')
indegreerank_org = indegreerank_org.rank(ascending=False)
indegreerank_org = indegreerank_org.to_frame()
org_rank = pagerank_org.join(katzrank_org)
org_rank = org_rank.join(indegreerank_org)
org_rank = org_rank.mean(axis=1)
org_rank.to_csv('org_rank.csv')

##### pattern detection
adjacency = nx.adjacency_matrix(G)
louvain = Louvain()
labels = louvain.fit_transform(adjacency)
labels_unique, counts = np.unique(labels, return_counts=True)
optimal_modularity = modularity(adjacency, labels)

##### modularity of the attribute
organization = network_data['Organization']
organization = organization.to_numpy()
organization_label = pd.factorize(organization)[0]
organization_modularity = modularity(adjacency, organization_label)
hireable = network_data['hireable']
hireable = hireable.to_numpy()
hireable_label = pd.factorize(hireable)[0]

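# Hedged sketch of the attribute-modularity comparison above: factorize a node
# attribute into integer labels and score that partition with the same
# modularity function used for the Louvain labels (toy graph and attribute
# values are hypothetical; newer scikit-network versions rename the function
# to get_modularity).
import numpy as np
import pandas as pd
from scipy import sparse
from sknetwork.clustering import Louvain, modularity

adjacency = sparse.csr_matrix(
    np.array([[0, 1, 1, 0], [1, 0, 1, 0], [1, 1, 0, 1], [0, 0, 1, 0]]))
louvain_labels = Louvain().fit_transform(adjacency)
attribute = pd.Series(['acme', 'acme', 'globex', 'globex'])
attribute_labels = pd.factorize(attribute.to_numpy())[0]
print(modularity(adjacency, louvain_labels),
      modularity(adjacency, attribute_labels))
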
def preclustering(adata,
                  all_seq_cluster,
                  sequence_coordinates,
                  basis="umap",
                  distance='cosine'):
    if basis is None:
        map_state = adata.layers['Ms']
    elif type(basis) == list:
        map_state = adata[:, basis].layers['Ms']
    else:
        map_state = adata.obsm['X_' + basis]
    all_chains = sequence_coordinates

    # Calculate the Hausdorff distance
    print('Calculating hausdorff distances')
    distances = np.zeros((len(all_chains), len(all_chains)))
    for i in tqdm(range(len(distances))):
        for j in range(len(distances) - i):
            haus = hausdorff_distance(all_chains[j + i],
                                      all_chains[i],
                                      distance=distance)
            distances[i, j + i] = haus
            distances[j + i, i] = haus
    affinity = -distances
    affinity = sparse.csr_matrix(minmax_scale(affinity, axis=1))

    # Perform clustering using the hausdorff distance
    print('Clustering using hausdorff distances')
    # cluster_labels = AffinityPropagation(affinity="precomputed").fit_predict(affinity)
    cluster_labels = Louvain(resolution=1.25).fit_transform(affinity)
    clusters = np.unique(cluster_labels)

    # Pairwise alignment of chains within stage 1 clusters using DTW
    cluster_chains = []
    cluster_strength = []
    print('Forming trajectory by aligning clusters')
    # Pairwise alignment until only 1 avg. trajectory remains per cluster
    for i in tqdm(range(len(clusters))):
        index_cluster = np.where(cluster_labels == clusters[i])[0]
        aver_tr_cluster = all_chains[index_cluster]
        cluster_strength.append(len(index_cluster))
        if len(aver_tr_cluster) > 1:
            average_trajectory = aver_tr_cluster
            while len(average_trajectory) > 1:
                pair_range = range(len(average_trajectory))
                pairs = []
                for j in range(
                        int((len(average_trajectory) -
                             (len(average_trajectory) % 2)) / 2)):
                    pairs.append([pair_range[j * 2], pair_range[j * 2 + 1]])
                if (len(average_trajectory) % 2) != 0:
                    pairs.append([pair_range[-2], pair_range[-1]])
                average_traj_while = []
                for l in range(len(pairs)):
                    aligned_trajectory = fastdtw(
                        average_trajectory[pairs[l][0]],
                        average_trajectory[pairs[l][1]])[1]
                    aligned_trajectory = np.asarray(aligned_trajectory)
                    aligned_tr = np.zeros((2, len(average_trajectory[0][0])))
                    aligned_av = np.zeros((len(aligned_trajectory),
                                           len(average_trajectory[0][0])))
                    for n in range(len(aligned_trajectory)):
                        aligned_tr[0, :] = average_trajectory[pairs[l][0]][
                            aligned_trajectory[n, 0]]
                        aligned_tr[1, :] = average_trajectory[pairs[l][1]][
                            aligned_trajectory[n, 1]]
                        aligned_av[n, :] = np.mean(aligned_tr, axis=0)
                    average_traj_while.append(aligned_av)
                average_trajectory = average_traj_while
            average_aligned_tr = average_trajectory[0]
        else:
            average_aligned_tr = aver_tr_cluster[0]
        cluster_chains.append(average_aligned_tr)
    return cluster_chains, cluster_strength, cluster_labels

def make_structure_louvain_gismo_embedding(gismo,
                                           tree,
                                           keywords_indexes,
                                           root=True,
                                           depth=3):
    """
    Builds a tree structure using the Louvain clustering method.

    Args:
        gismo: the gismo built from the dataset
        tree: the empty node that the function fills in
        keywords_indexes: indices of the keywords to cluster
        root: whether this call is the root of the recursion
        depth: maximum recursion depth

    Returns:
        None, it fills in the empty node that is given at first, recursively
    """
    # At the root, all the words belong to the cluster
    if root:
        tree.members = [
            gismo.embedding.features[indice] for indice in keywords_indexes
        ]
        tree.centroid = sum([
            gismo.embedding.query_projection(member)[0]
            for member in tree.members
        ])
        tree.title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in tree.members
        ][:10])
    if depth == 0 or len(tree.members) == 1:
        return None
    # Cluster the members
    words_vectors = gismo.embedding.y[keywords_indexes, :]
    words_adjacency = cosine_similarity(words_vectors, dense_output=False)
    # words_adjacency.setdiag(scipy.zeros(len(keywords_indexes)))
    # initialize before the first function call so it is not redone several times?
    louvain = Louvain()
    labels = louvain.fit_transform(words_adjacency)
    labels_unique, counts = np.unique(labels, return_counts=True)
    # There are as many children as clusters
    children = [Node() for i in range(len(labels_unique))]
    for l in labels_unique:
        # fill in the members of each node
        words_indexes = keywords_indexes[np.where(labels == l)]
        words = [
            gismo.embedding.features[word_index]
            for word_index in words_indexes
        ]
        children[l].members = words
        children[l].centroid = sum(
            [gismo.embedding.query_projection(word)[0] for word in words])
        children[l].title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in words
        ][:10])
    tree.children = children
    for child in tree.children:
        make_structure_louvain_gismo_embedding(
            gismo,
            child,
            np.array([
                gismo.embedding.features.index(word)
                for word in child.members
            ]),
            root=False,
            depth=depth - 1)

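# Minimal sketch of the "similarity graph -> Louvain -> one child per label"
# step performed above, with hypothetical 2-D word vectors standing in for
# the gismo embedding.
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sknetwork.clustering import Louvain

word_vectors = np.array([[1.0, 0.0], [0.9, 0.1],   # two "x-like" words
                         [0.0, 1.0], [0.1, 0.9]])  # two "y-like" words
words_adjacency = sparse.csr_matrix(cosine_similarity(word_vectors))
labels = Louvain().fit_transform(words_adjacency)
for label in np.unique(labels):  # one child node per distinct label
    print(label, np.where(labels == label)[0])  # expected: two labels
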
def test_modularity(self):
    adjacency = karate_club()
    labels = Louvain().fit_transform(adjacency)
    self.assertAlmostEqual(0.42, modularity(adjacency, labels), places=2)

def make_structure_louvain_W2V(
        keywords,
        words_vectors,
        tree,
        gismo,
        root=True,
        depth=3,
):
    """
    Builds a tree structure using the Louvain clustering method.

    Args:
        keywords: the keywords to cluster
        words_vectors: the word vectors of the keywords
        tree: the empty node that the function fills in
        gismo: the gismo built from the dataset
        root: whether this call is the root of the recursion
        depth: maximum recursion depth

    Returns:
        None, it fills in the empty node that is given at first, recursively
    """
    # At the root, all the words belong to the cluster
    if root:
        tree.members = keywords
        tree.centroid = sum([
            gismo.embedding.query_projection(member)[0]
            for member in tree.members
        ])
        tree.title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in tree.members
        ][:10])
    if depth == 0 or len(tree.members) == 1:
        return None
    # Build the word matrix
    # words_adjacency = cosine_similarity(words_vectors, dense_output=False)
    words_adjacency = building_distances_matrix(words_vectors)
    max_vector = np.ones(np.shape(
        words_adjacency.data)) * np.max(words_adjacency)
    words_adjacency.data = max_vector - words_adjacency.data
    if sum([i for i in words_adjacency.data]) == 0:
        return None
    # Clustering
    louvain = Louvain()
    labels = louvain.fit_transform(words_adjacency)
    labels_unique, counts = np.unique(labels, return_counts=True)
    if len(labels_unique) == 1:
        return None
    # There are as many children as clusters
    children = [Node() for l in labels_unique]
    children_members_indexes = [[] for child in children]
    print(labels_unique)
    print(keywords)
    for l in labels_unique:
        # fill in the members of each node
        children_members_indexes[l] = np.where(labels == l)[0].tolist()
        try:
            words = [
                keywords[word_index]
                for word_index in children_members_indexes[l]
            ]
        except IndexError:
            print("crash with the keywords: ", keywords,
                  " and the labels: ", labels_unique)
            return None
        children[l].members = words
        children[l].centroid = sum(
            [gismo.embedding.query_projection(word)[0] for word in words])
        children[l].title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in words
        ][:10])
    tree.children = children
    for (l, child) in enumerate(tree.children):
        make_structure_louvain_W2V(
            keywords=child.members,
            words_vectors=words_vectors[children_members_indexes[l], :],  # to do
            gismo=gismo,
            tree=child,
            root=False,
            depth=depth - 1)

# build the sparse matrix for clustering
### EXTRA
with open(out_csv_path, 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerows(adj)
### EXTRA
leng = len(utgs)
network = sp.sparse.csr_matrix((adj[2], (adj[0], adj[1])), shape=(leng, leng))
# print(network.get_shape())

# modularity optimization for community detection
logger.info('Louvain alg with optimization level = ' + str(opt_par))
opt_lev = opt_par  # 0.001
louvain = Louvain(random_state=0,
                  tol_aggregation=opt_lev,
                  tol_optimization=opt_lev)
out = louvain.fit_transform(network)
clusters, n_out = cu.get_clusters(out)
n_groups = len(clusters)
logger.info('Number of clusters: ' + str(n_groups))

### REPRESENTATIVES CHOICE ###
# evaluate the degree of each utg
deg = np.zeros(leng, dtype=np.uint32)
for i in range(len(adj[0])):
    deg[adj[0][i]] += adj[2][i]
    deg[adj[1][i]] += adj[2][i]

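# Toy version of the triplet -> CSR -> Louvain pipeline above (hypothetical
# weighted edge list; tolerance values chosen as in the script).
import numpy as np
from scipy import sparse
from sknetwork.clustering import Louvain

rows = [0, 0, 1, 2, 3]
cols = [1, 2, 2, 3, 4]
weights = [3, 1, 2, 1, 4]
network = sparse.csr_matrix((weights, (rows, cols)), shape=(5, 5))

louvain = Louvain(random_state=0, tol_aggregation=0.001,
                  tol_optimization=0.001)
out = louvain.fit_transform(network)

# weighted degree accumulated from the triplets, as in the script above
deg = np.zeros(5, dtype=np.uint32)
for r, c, w in zip(rows, cols, weights):
    deg[r] += w
    deg[c] += w
print(out, deg)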