def test_allnodes(self):
    """Test that every node is assigned to a community."""
    g = nx.erdos_renyi_graph(50, 0.1)
    part = co.best_partition(g)
    for node in g.nodes():
        self.assertIn(node, part)  # assert_ is deprecated; assertIn is the idiomatic membership check
def test_girvan(self):
    """Test that the communities found are correct on the Girvan & Newman benchmark."""
    # Use a small zout: with a high zout the detected communities may differ from the planted ones.
    g = girvan_graphs(4)
    part = co.best_partition(g)
    for node, com in part.items():
        self.assertEqual(com, part[node % 4])
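# The girvan_graphs helper used by test_girvan is not defined in this excerpt. A minimal
# sketch of the classic Girvan & Newman benchmark it appears to assume: 128 nodes, 4 planted
# communities with node i belonging to community i % 4, an average of 16 edges per node of
# which zout are inter-community. The exact constants of the original helper may differ.
def girvan_graphs(zout):
    pout = float(zout) / 96.0           # probability of an inter-community edge
    pin = (16.0 - pout * 96.0) / 31.0   # probability of an intra-community edge
    graph = nx.Graph()
    graph.add_nodes_from(range(128))
    for x in graph.nodes():
        for y in graph.nodes():
            if x < y:
                val = random.random()
                if x % 4 == y % 4:      # same planted community
                    if val < pin:
                        graph.add_edge(x, y)
                elif val < pout:
                    graph.add_edge(x, y)
    return graph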
def modularity_community_detection_and_write1(graph, outfile_name):
    """Run Louvain twice, keep only nodes placed in the same community id in both runs,
    write each community to a file, and return the first five communities."""
    out_file_name = 'output/' + outfile_name + '.txt'
    list_com = []
    partition1 = community.best_partition(graph)
    partition2 = community.best_partition(graph)
    with open(out_file_name, 'w') as outfile:
        for com in set(partition1.values()):
            # Note: community ids are not guaranteed to be aligned between two independent
            # Louvain runs, so intersecting on the raw id assumes a stable labelling.
            list_nodes1 = [node for node in partition1 if partition1[node] == com]
            list_nodes2 = [node for node in partition2 if partition2[node] == com]
            list_nodes = list(set(list_nodes1) & set(list_nodes2))
            list_com.append(list_nodes)
            for node in list_nodes:
                outfile.write(str(node) + " ")
            outfile.write("\n")
    # Assumes the graph has at least five communities.
    return list_com[0], list_com[1], list_com[2], list_com[3], list_com[4]
def modularity_community_detection_and_write2(graph, outfile_name):
    """Run Louvain once, write each community to a file, and return all communities."""
    out_file_name = 'output/' + outfile_name + '.txt'
    list_com = []
    partition = community.best_partition(graph)
    with open(out_file_name, 'w') as outfile:
        for com in set(partition.values()):
            list_nodes = [node for node in partition if partition[node] == com]
            list_com.append(list_nodes)
            for node in list_nodes:
                outfile.write(str(node) + " ")
            outfile.write("\n")
    return list_com
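# A minimal usage sketch for the writer above; the graph and file name are placeholders,
# and the output/ directory must exist before the call.
g = nx.karate_club_graph()
all_communities = modularity_community_detection_and_write2(g, "karate_communities")
print(len(all_communities), "communities written")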
def cal_community(G):
    # Run the Louvain algorithm for community detection, using the 'MsgCount'
    # edge attribute as the weight.
    partition = community.best_partition(G, weight='MsgCount')

    # Build a dictionary like {community_number: "participant1 | participant2 | ..."}
    dict_nodes = {}
    for community_node, community_num in partition.items():
        if community_num in dict_nodes:
            dict_nodes[community_num] = dict_nodes[community_num] + ' | ' + str(community_node)
        else:
            dict_nodes[community_num] = str(community_node)

    # Create a dataframe from the dict and write the membership list to CSV.
    community_df = pd.DataFrame.from_dict(dict_nodes, orient='index', columns=['Members'])
    community_df.index.rename('Community_Num', inplace=True)
    community_df.to_csv('Community_List_snippet.csv')

    # Create a new graph with one node per community found by the Louvain algorithm.
    matplotlib.rcParams['figure.figsize'] = [12, 8]
    G_comm = nx.Graph()
    G_comm.add_nodes_from(dict_nodes)

    # Calculate modularity and the total number of communities.
    mod = community.modularity(partition, G)
    print("Modularity: ", mod)
    print("Total number of Communities =", len(G_comm.nodes()))

    # Draw the community graph, annotated with the modularity score.
    pos_louvain = nx.spring_layout(G_comm)
    nx.draw_networkx(G_comm, pos_louvain, with_labels=True, node_size=160, font_size=20,
                     label='Modularity =' + str(round(mod, 3)) +
                           '\nCommunities=' + str(len(G_comm.nodes())))
    plt.suptitle('Community structure (Louvain Algorithm)', fontsize=22, fontname='Arial')
    plt.box(on=None)
    plt.axis('off')
    plt.legend(bbox_to_anchor=(0, 1), loc='best', ncol=1)
    plt.show()
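# A hypothetical usage sketch: cal_community expects a graph whose edges carry a 'MsgCount'
# weight attribute; the toy participants and counts below are placeholders.
G_demo = nx.Graph()
G_demo.add_edge("alice", "bob", MsgCount=12)
G_demo.add_edge("bob", "carol", MsgCount=3)
G_demo.add_edge("alice", "carol", MsgCount=7)
cal_community(G_demo)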
def draw_community(G):
    # Clean the graph before partitioning: drop isolated nodes and self-loops, which only
    # add noise to the layout (the original removed them mid-drawing, after the node circles
    # had already been placed).
    G.remove_nodes_from(list(nx.isolates(G)))
    G.remove_edges_from(nx.selfloop_edges(G))

    # First compute the best partition with the Louvain algorithm.
    partition = community.best_partition(G)

    # Draw each community with its own grayscale shade.
    size = float(len(set(partition.values())))
    pos = nx.spring_layout(G)
    count = 0.0
    for com in set(partition.values()):
        count += 1.0
        list_nodes = [node for node in partition if partition[node] == com]
        nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20,
                               node_color=str(count / size))
    nx.draw_networkx_edges(G, pos, alpha=0.5)
    plt.show()
def test_ring(self):
    """Test that the communities found are correct on a ring of cliques."""
    for num_test in range(self.numtest):
        size_clique = random.randint(5, 20)
        num_clique = random.randint(5, 20)
        g = nx.Graph()
        for i in range(num_clique):
            clique_i = nx.complete_graph(size_clique)
            g = nx.union(g, clique_i, rename=("", str(i) + "_"))
            if i > 0:
                g.add_edge(str(i) + "_0", str(i - 1) + "_1")
        g.add_edge("0_0", str(num_clique - 1) + "_1")
        part = co.best_partition(g)
        for clique in range(num_clique):
            p = part[str(clique) + "_0"]
            for node in range(size_clique):
                self.assertEqual(p, part[str(clique) + "_" + str(node)])
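# The three test methods above assume module scaffolding along these lines; the class name
# and the numtest value are illustrative, not taken from the original test file.
import random
import unittest
import networkx as nx
import community as co

class BestPartitionTest(unittest.TestCase):
    numtest = 10  # number of randomized repetitions used by test_ring

if __name__ == '__main__':
    unittest.main()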
# (truncated fragment) The tail of the same community-drawing loop used elsewhere in this
# file; the loop head is reconstructed here from that pattern.
for com in set(partition.values()):
    count += 1.0
    list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20,
                           node_color=str(count / size))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()
# print(location)

### USING LOUVAIN COMMUNITY - NAIVE MODEL
partition = community.best_partition(G)
communities = extracting_communities(partition)
# print(extracting_communities(partition))

# echo_graph


def keywithmaxval(d):
    """Return the key with the largest value in d."""
    max_key = None
    max_value = None
    for p_key, p_value in d.items():
        if max_value is None or p_value >= max_value:
            max_value = p_value
            max_key = p_key
    return max_key  # the original returned the last iterated key instead of the tracked one
# Percolation method
# Fluid communities algorithm
# Girvan-Newman method
# When the number of communities to detect has to be specified as a parameter, the coverage
# metric is used to select the appropriate number (ranging from 2 to 5).
# Finally, for each community algorithm, an attribute is added to each node of the graph.
# The value of the attribute is the identifier of the community the node belongs to
# (ranging from 0 to nbCommunity - 1). A sketch of this selection-and-labelling step is
# given after this snippet.
# =============================================================================
# parts = community_louvain.best_partition(g)
# values = [parts.get(node) for node in g.nodes()]

# node list
print(g.nodes)
nodes = ['Myriel', 'Napoleon', 'MlleBaptistine', 'MmeMagloire', 'CountessDeLo', 'Geborand',
         'Champtercier', 'Cravatte', 'Count', 'OldMan', 'Labarre', 'Valjean', 'Marguerite',
         'MmeDeR', 'Isabeau', 'Gervais', 'Tholomyes', 'Listolier', 'Fameuil', 'Blacheville',
         'Favourite', 'Dahlia', 'Zephine', 'Fantine', 'MmeThenardier', 'Thenardier', 'Cosette',
         'Javert', 'Fauchelevent', 'Bamatabois', 'Perpetue', 'Simplice', 'Scaufflaire', 'Woman1',
         'Judge', 'Champmathieu', 'Brevet', 'Chenildieu', 'Cochepaille', 'Pontmercy',
         'Boulatruelle', 'Eponine', 'Anzelma', 'Woman2', 'MotherInnocent', 'Gribier', 'Jondrette',
         'MmeBurgon', 'Gavroche', 'Gillenormand', 'Magnon', 'MlleGillenormand', 'MmePontmercy',
         'MlleVaubois', 'LtGillenormand', 'Marius', 'BaronessT', 'Mabeuf', 'Enjolras',
         'Combeferre', 'Prouvaire', 'Feuilly', 'Courfeyrac', 'Bahorel', 'Bossuet', 'Joly',
         'Grantaire', 'MotherPlutarch', 'Gueulemer', 'Babet', 'Claquesous', 'Montparnasse',
         'Toussaint', 'Child1', 'Child2', 'Brujon', 'MmeHucheloup']

# Identifying the ideal number of communities: 5 communities identified.
import community
communities = community.best_partition(g)
for value in sorted(communities.values()):
    print(value)

# 1. Kernighan–Lin bipartition algorithm
bisectionID = nx.community.kernighan_lin_bisection(g)
bisectionID = [list(x) for x in bisectionID]
keys = []
values = []
for item in bisectionID[0]:
    keys.append(item)
    values.append(0)
for j in bisectionID[1]:
    keys.append(j)
    values.append(1)
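# A hedged sketch of the coverage-based selection described above, using the fluid
# communities algorithm as the example that needs the number of communities as a parameter.
# It follows the networkx 2.x API (nx.community.asyn_fluidc, nx.community.coverage); on
# newer networkx releases, coverage is obtained from nx.community.partition_quality instead.
# The attribute name "fluid_community" is illustrative.
def fluid_with_best_coverage(graph, k_min=2, k_max=5):
    best_k, best_cov, best_comms = None, -1.0, None
    for k in range(k_min, k_max + 1):
        comms = list(nx.community.asyn_fluidc(graph, k, seed=0))
        cov = nx.community.coverage(graph, comms)
        if cov > best_cov:
            best_k, best_cov, best_comms = k, cov, comms
    # Label each node with the identifier of its community (0 .. nbCommunity - 1).
    labels = {node: cid for cid, comm in enumerate(best_comms) for node in comm}
    nx.set_node_attributes(graph, labels, name="fluid_community")
    return best_k, best_cov, best_comms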
from EdgeWeightedNetworkBuilding import build_network_from_excel, build_network_from_df

# In[3]:
df_getNetwork = build_network_from_excel(
    file_path="/Users/iris/Documents/QMUL-2018/Individual_Project/coding/datasets/mcf7_ntera2_hl60_ksea.xlsm",
    key="MCF7",
    sheet_name="zScorenodes.edges",
    threshold=0.2)
cell_line = "MCF7"

# In[4]:
'''Next cells calculate network communities using different algorithms.'''

# Louvain
partition = community.best_partition(df_getNetwork, resolution=1.2)
print("Louvain Modularity: ", community.modularity(partition, df_getNetwork))
print("Louvain Partition: ", partition)

# In[5]:
# Drawing the Louvain partition
size = float(len(set(partition.values())))
pos = nx.spring_layout(df_getNetwork)
count = 0.0
for com in set(partition.values()):
    count += 1.0
    list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
current_idx = 0
for module in partition:
    ax.add_patch(patches.Rectangle((current_idx, current_idx),
                                   len(module),   # Width
                                   len(module),   # Height
                                   facecolor="none",
                                   edgecolor=color,
                                   linewidth=1))  # linewidth expects a number, not a string
    current_idx += len(module)

# In[14]:
# Run the Louvain community finding algorithm
louvain_community_dict = community.best_partition(G, resolution=1.2)
louvain_community_dict

# In[15]:
from collections import defaultdict

# In[17]:
# Run the Louvain community finding algorithm again with a different resolution
louvain_community_dict = community.best_partition(G, resolution=1.3)
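# A small illustrative sketch of how the resolution parameter of python-louvain's
# best_partition changes the number of detected communities on the same graph G used
# above; the resolution values are placeholders.
for res in (0.8, 1.0, 1.2, 1.3):
    part = community.best_partition(G, resolution=res)
    print("resolution={:.1f} -> {} communities, modularity={:.3f}".format(
        res, len(set(part.values())), community.modularity(part, G)))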
def run_xmap(dataset=None, n_neighbors=None, negative_sample_rate=None, seed=None,
             model_folder="models", return_step=STEP.DATA_CLEANED,
             learn_mode=LEARN.UNSUPERVISED, runall=False):
    global f, SEED, NNK, NS, cmap, NFEATURE
    # runall = True
    SEED = seed
    NNK = n_neighbors
    NS = negative_sample_rate
    # Define the distance measure to be used in UMAP. Many options are available in UMAP.
    distancem = "euclidean"
    np.random.seed(SEED)
    # Define the path used to store intermediate outputs.
    pathname = model_folder + "/" + "xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset
    current_step = STEP.INITIALIZING
    t0 = time.time()

    """ STEP 1: Loading the dataset """
    if runall or not os.path.exists(pathname + ".cleandata"):
        f = open("outputs/xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset + ".log", 'w')
        print_output("Loading Data ...")
        print_output("\tData set: {}".format(dataset))
        data = pd.read_csv("../data/{}.csv".format(dataset))
        feature_names = [c.replace("_", "ubar").replace(".", "dot") for c in data.columns][1:]
        target_name = data.columns[0]
        nfeatures = len(feature_names)
        NFEATURE = nfeatures
        data = data.values
        Y = data[:, 0].reshape(-1, 1)
        X = data[:, 1:]
        # Scale the dataset.
        scaler = MinMaxScaler()
        scaler.fit(X)
        X_norm = scaler.transform(X)
        last_step = STEP.DATA_CLEANED
        pickle.dump((X_norm, Y, scaler, nfeatures, feature_names, target_name),
                    open(pathname + ".cleandata", "wb"))
    else:
        f = open("outputs/xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset + ".log", 'w+')
        print_output("\tLoad cleaned data from " + pathname + ".cleandata")
        X_norm, Y, scaler, nfeatures, feature_names, target_name = pickle.load(
            open(pathname + ".cleandata", "rb"))
    current_step = STEP.DATA_CLEANED
    if current_step == return_step:
        return X_norm, Y, scaler, nfeatures, feature_names, target_name

    """ STEP 2: Learn the latent representation of the dataset (unsupervised or supervised) """
    print_output("Learning UMAP ...")
    print(learn_mode)
    if learn_mode == LEARN.UNSUPERVISED:
        umapname = ".unsupervised_umap"
    else:
        umapname = ".supervised_umap"
    if runall or not os.path.exists(pathname + umapname):
        reducer = umap.UMAP(random_state=SEED, n_neighbors=NNK,
                            negative_sample_rate=NS, metric=distancem)
        # reducer = PCA(n_components=2)
        if learn_mode == LEARN.UNSUPERVISED:
            reducer.fit(X_norm)
        else:
            reducer.fit(X_norm, y=Y)
        last_step = STEP.UMAP_TRAINED
        pickle.dump(reducer, open(pathname + umapname, "wb"))
    else:
        print_output("\tLoad trained umap from " + pathname + umapname)
        reducer = pickle.load(open(pathname + umapname, "rb"))
    embedding = reducer.transform(X_norm)
    current_step = STEP.UMAP_TRAINED
    if current_step == return_step:
        return embedding

    lnames = ["Negative", "Positive"]
    Y = Y.reshape(-1)
    cmap = ["blue", "red", "purple", "hotpink", "black", "green", "orange", "teal",
            "brown", "lightsteelblue", "gray", "lime", "coral", "plum", "gold", "c",
            "tomato", "blueviolet", "darkseagreen"]
    if return_step is None:
        plot_embedding_space(embedding, labels=Y, label_index=[0, 1], lnames=lnames,
                             data_name="gt_" + dataset)

    """ STEP 3: Summarise latent data using ATL and cluster data using the learned graph from ATL """
    # nepoch: number of times the data is passed to ATL.
    # age_max: maximum age of a connection; the age increases if the connection (different
    # from the second best) links to the BMU. If max_age is too small, topological
    # relationships will be prematurely destroyed. Meanwhile, if max_age is too large, some
    # useless connections may survive because of randomness or noise --> ATL needs to run
    # longer to get accurate results and more relationships will be preserved.
    # lamb: the number of steps (or number of processed inputs) before ATL checks and cleans
    # up the network. Lambda has a similar effect to max_age, i.e. a small lamb leads to an
    # unstable network (unable to establish topological relationships) while a large lamb
    # may lead to redundant nodes and connections.
    print_output("Learning topological relations and Determining contexts ....")
    if learn_mode == LEARN.UNSUPERVISED:
        soinnname = ".unsupervised_soinn"
    else:
        soinnname = ".supervised_soinn"
    if runall or not os.path.exists(pathname + soinnname):
        lamb = 200
        # if data.shape[0] < lamb:
        #     lamb = data.shape[0]
        nodes, connection, classes = atl.learning(input_data=embedding, max_nepoch=5,
                                                  spread_factor=1.0, lamb=lamb)
        classes = 0 * classes
        cmap = cmap * 10
        if return_step is None:
            plot_atl(nodes, connection)
        # Create a network representation of the learned ATL graph.
        G = nx.Graph()
        for i in range(0, nodes.shape[0]):
            for j in range(0, nodes.shape[0]):
                if connection[i, j] != 0:
                    G.add_edge(i, j, weight=1.0)
        # Use community detection algorithms to discover the subgraph communities.
        network_cd_alg = "best"
        n_components = nx.number_connected_components(G)
        max_context = int(n_components + np.sqrt(n_components))
        threshold = 0.2
        if network_cd_alg == "gn":
            from networkx.algorithms import community
            communities_generator = community.girvan_newman(G)
            while True:
                level_communities = next(communities_generator)
                size_com = [len(c) for c in level_communities if len(c) > 1]
                if min(size_com) < threshold * sum([len(c) for c in level_communities]):
                    break
            cms = sorted(map(sorted, level_communities))
        elif network_cd_alg == "best" or network_cd_alg == "dendo":
            import community
            if network_cd_alg == "best":
                cms = community.best_partition(G, resolution=0.5)
            else:
                dendrogram = community.generate_dendrogram(G)
                sized = len(dendrogram)
                cms = community.partition_at_level(dendrogram, sized - 2)
            # Convert the node -> community dict into a list of node lists.
            coms = set([cms[i] for i in cms])
            cdict = {}
            for k in coms:
                cdict[k] = []
            for i in cms:
                cdict[cms[i]].append(i)
            cms = []
            for k in coms:
                cms.append(list(cdict[k]))
        else:
            # cms = list(nx.connected_components(G))
            from networkx.algorithms import community
            communities_generator = community.girvan_newman(G)
            level_communities = next(communities_generator)
            cms = sorted(map(sorted, level_communities))
        components = [c for c in cms if len(c) > 1]
        # print(components)
        count = 1
        for comp in components:
            for n in comp:
                classes[n] = count
            count += 1
        # Each component or subgraph can be treated as a cluster, as ATL only links nodes
        # with similar patterns together. The lack of a connection between two nodes
        # indicates that those two nodes, and the data matching them, should not belong to
        # the same cluster.
        nclusters = len(components)
        nbrs = NearestNeighbors(n_neighbors=1).fit(nodes)
        distances, indices = nbrs.kneighbors(embedding)
        node_indices = list(indices.reshape(-1))
        indices = np.array([classes[node_indices[i]] for i in range(len(node_indices))])
        last_step = STEP.ATL_TRAINED
        pickle.dump((nodes, connection, classes, nclusters, node_indices, indices),
                    open(pathname + soinnname, "wb"))
    else:
        print_output("\tLoad trained ATL from " + pathname + soinnname)
        nodes, connection, classes, nclusters, node_indices, indices = pickle.load(
            open(pathname + soinnname, "rb"))
    current_step = STEP.ATL_TRAINED
    if current_step == return_step:
        return embedding, nodes, connection, classes, nclusters, node_indices, indices

    """ STEP 4: Try to explain each context/cluster using Context Description Approximation (CDA).
    Can work with original features or interaction terms (representing or/and relations). """
    cid = [c for c in range(nclusters + 1)]
    cid = cid + [100]
    if learn_mode == LEARN.UNSUPERVISED:
        explainnname = ".unsupervised_explain"
    else:
        explainnname = ".supervised_explain"
    if True or runall or not os.path.exists(pathname + explainnname):  # 'True or' forces this step to always recompute
        finteraction = False  # True if interaction terms are considered
        interactionAND = False  # True if the AND relation is used
        n_identity_feature = 5  # number of features/variables used to describe the cluster/context
        active_threshold = 0.01  # threshold to decide if a feature value can represent a given cluster/context
        cmap = 10 * ["red", "blue", "purple", "hotpink", "black", "green", "orange", "teal",
                     "brown", "lightsteelblue", "gray", "lime", "coral", "plum", "gold", "c",
                     "tomato", "blueviolet", "darkseagreen"]
        if return_step is None:
            plot_embedding_space(embedding, labels=indices, label_index=cid,
                                 lnames=["context__# " + str(c) for c in cid],
                                 data_name="pointcontext_" + dataset)
        cluster_sizes = plot_embedding_space(embedding, labels=indices, label_index=cid,
                                             lnames=["context__# " + str(c) for c in cid],
                                             data_name="highlightcontext_" + dataset)
        cluster_id_ranked_by_size = (-np.array(cluster_sizes)).argsort()
        poly = PolynomialFeatures(interaction_only=True, include_bias=False)
        cluster_explainer_dict = {}
        if nclusters > 1:
            print_output("Explaining contexts ...")
            xcluster_id = np.zeros(embedding.shape[0])
            xcluster_id_details = np.zeros((embedding.shape[0], nclusters))
            outputs = np.zeros((nclusters, len(feature_names)))
            cluster_characteristic_dict = {}
            feature_names_I = None
            XX = X_norm
            feature_names = [ff.replace("ubar", "_").replace("dot", ".") for ff in feature_names]
            if finteraction:
                if not interactionAND:
                    XX = poly.fit_transform(1 - X_norm)
                    XX = 1 - XX
                else:
                    XX = poly.fit_transform(X_norm)
                feature_names_I = str(poly.get_feature_names())
                for fi in range(nfeatures):
                    feature_names_I = feature_names_I.replace("'x" + str(fi) + "'", feature_names[fi])
                    feature_names_I = feature_names_I.replace("'x" + str(fi) + " ", feature_names[fi] + " ")
                    feature_names_I = feature_names_I.replace(" x" + str(fi) + "'", " " + feature_names[fi])
                if not interactionAND:
                    feature_names_I = feature_names_I.replace("[", "").replace("]", "").replace("'", "").replace(", ", ",").replace(" ", " or ")
                else:
                    feature_names_I = feature_names_I.replace("[", "").replace("]", "").replace("'", "").replace(", ", ",").replace(" ", " and ")
                feature_names_I = feature_names_I.split(",")
                feature_names = feature_names_I
                outputs = np.zeros((nclusters, len(feature_names)))
            # For each cluster/context, repetitive patterns are determined.
            for i in range(nclusters):
                cluster_id = i  # cluster_id_ranked_by_size[i]
                print_output("Context #" + str(cluster_id + 1))
                Xc = XX[indices == cluster_id + 1]
                from scipy.stats import iqr
                for fi in range(len(feature_names)):
                    outputs[cluster_id][fi] = min(np.sum(Xc[::, fi]) / len(Xc[::, fi]),
                                                  1 - np.sum(Xc[::, fi]) / len(Xc[::, fi]))
                true_features = []
                false_features = []
                numeric_features = []
                impure_features = []
                ranked_features = np.argsort(outputs[cluster_id])
                for fi in ranked_features:
                    if outputs[cluster_id][fi] <= active_threshold:
                        (values, counts) = np.unique(Xc[::, fi], return_counts=True)
                        ind = np.argmax(counts)
                        val = values[ind]
                        if val == 1.0:
                            true_features.append(fi)
                        elif val == 0.0:
                            false_features.append(fi)
                        else:
                            numeric_features.append(fi)
                    else:
                        impure_features.append((fi, np.min(Xc[::, fi]), np.max(Xc[::, fi]),
                                                np.average(Xc[::, fi])))
                nzeros = len(feature_names) - np.count_nonzero(outputs[cluster_id])
                mask = np.ones((embedding.shape[0],), dtype=bool)
                countf = 0
                print_output("\tTrue Features")
                count = 0
                filter_true = []
                for fi in true_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                    fmask = XX[::, fi] == 1.0
                    mask = mask & fmask
                    filter_true.append(fi)
                if count > 0:
                    print_output("\t\t" + str(sorted([feature_names[ii] for ii in true_features[:count]])))
                print_output("\tFalse Features")
                count = 0
                filter_false = []
                for fi in false_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                    fmask = XX[::, fi] == 0.0
                    mask = mask & fmask
                    filter_false.append(fi)
                true_features, false_features = filter_true, filter_false
                cluster_explainer_dict[cluster_id] = (finteraction, true_features, false_features,
                                                      numeric_features, impure_features)
                if count > 0:
                    print_output("\t\t" + str(sorted([feature_names[ii] for ii in false_features[:count]])))
                print_output("\tNumeric Features")
                count = 0
                for fi in numeric_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                if count > 0:
                    print_output("\t\t" + str([(feature_names[ii[0]], ii[1], ii[2])
                                               for ii in numeric_features[:count]]))
                xcluster_id_details[mask, cluster_id] = 1
                print_output("\t" + 20 * '-')
                print_output("\t" + 20 * '=')
                print_output("")
        last_step = STEP.CONTEXT_EXPLAINED
        pickle.dump(last_step, open(pathname + ".notes", "wb"))
        pickle.dump((cluster_explainer_dict, xcluster_id_details, feature_names),
                    open(pathname + explainnname, "wb"))
    else:
        print_output("\tLoad explainer from " + pathname + explainnname)
        cluster_explainer_dict, xcluster_id_details, feature_names = pickle.load(
            open(pathname + explainnname, "rb"))
    current_step = STEP.CONTEXT_EXPLAINED
    if current_step == return_step:
        return (embedding, nodes, connection, classes, nclusters, node_indices, indices,
                cluster_explainer_dict, xcluster_id_details, feature_names)
    run_time = time.time() - t0
    print_output('Run in %.3f s' % run_time)
    print_output("Complete!!!")
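# A hypothetical invocation sketch for run_xmap; the dataset name and parameter values are
# placeholders, not taken from the original project. It expects ../data/example_dataset.csv
# with the target variable in the first column, and the models/ and outputs/ folders to exist.
if __name__ == "__main__":
    results = run_xmap(dataset="example_dataset",
                       n_neighbors=15,
                       negative_sample_rate=5,
                       seed=42,
                       return_step=STEP.ATL_TRAINED,   # stop after the ATL / community detection stage
                       learn_mode=LEARN.UNSUPERVISED)
    embedding, nodes, connection, classes, nclusters, node_indices, indices = results
    print("Found {} clusters".format(nclusters))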
def findCommunity():
    global graph
    import community
    comm = community.best_partition(graph)
    return comm