Example #1
    def create_networks(self):

        print "creating networks"

        # WORDS
        print "-"*10+" Words"
        words_graph= self.create_network(self.words, self.words_to_words, self.limit_words, 1)

        if words_graph.order() != 0: 
            self.words_allowed=[self.words[int(w)] for w in words_graph.nodes()]
            print "%d words_allowed"%len(self.words_allowed)

            self.words_communities = community.best_partition(words_graph.to_undirected()) 
            print "Number of words partitions : ", len(set(self.words_communities.values()))

        # CITATIONS
        print
        print "-"*10+" Citations"
        citations_graph= self.create_network(self.cited, self.citations, self.limit_citations, 0)

        if citations_graph.order() != 0:
            self.cited_allowed=[self.cited[int(w)] for w in citations_graph.nodes()]
            print "%d cited_allowed"%len(self.cited_allowed)

            # Communities
            self.citations_communities = community.best_partition(citations_graph.to_undirected()) 
            print "Number of citations partitions : ", len(set(self.citations_communities.values()))
Example #2
def louvain(graph):
    """Computes clusters using the Louvain algorithm

    Parameters
    ----------
    graph : A NetworkX Graph to cluster. This object will also be modified.
            Nodes will gain a new attribute called 'cluster' indicating which
            cluster each node belongs to.

    Returns
    -------
    A dictionary mapping cluster ids to lists of nodes
    """

    try:
        clust = community.best_partition(graph)    # attempt louvain method
    except:
        clust = {}                # if clustering fails, assign all nodes to the same cluster
        for x in graph:
            clust[x] = 0

    clustDict = {}
    for x in clust:
        graph.node[x]['cluster'] = clust[x] # tag nodes by clusterID
        if clust[x] in clustDict: # rework dictionary
            clustDict[clust[x]].append(x)
        else:
            clustDict[clust[x]] = [x]

    return clustDict
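
A minimal usage sketch for the louvain() helper above, offered only as an assumption (it reuses the snippet's networkx/python-louvain imports and the pre-2.x `graph.node` attribute API):

# Hypothetical usage (not from the original project).
if __name__ == '__main__':
    toy = nx.karate_club_graph()
    clusters = louvain(toy)
    for cluster_id, members in sorted(clusters.items()):
        print("cluster %d: %d nodes" % (cluster_id, len(members)))
    # louvain() also tagged every node; this relies on the old graph.node API
    print(toy.node[0]['cluster'])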
Example #3
def groupGraph(G, userNodeId):
    """docstring for groupGraph"""
    G.node[userNodeId]['group'] = 0
    Gc = nx.Graph(G)
    Gc.remove_node(userNodeId)
    
    if len(Gc.edges()) < 1:
        partition = {}
        for n in Gc.nodes():
            partition[n] = 1
    else:
        partition = community.best_partition(Gc)
    
    for nodes in partition.keys():
        G.node[nodes]['group'] = partition[nodes] + 1
    
    #For Connected Sub Graphs
    #Gcc=nx.connected_component_subgraphs(Gc)
    Gcc = []
    for com in set(partition.values()) :
        list_nodes = [nodes for nodes in partition.keys()
                                    if partition[nodes] == com]
        Gcc.append(G.subgraph(list_nodes))
        
    
    for SG in Gcc:
        if len(SG.nodes()) > 3:
            bm, cm, dm = CentralityNoself(SG)
            G.node[bm]['central'] =  1
            #G.node[em]['central'] =  2
            G.node[cm]['central'] =  3
            G.node[dm]['central'] =  4
            
    return G, len(set(partition.values()))
Example #4
    def detect_communities(self):
        partition = community.best_partition(self.G)
        for n in partition:
            nx.set_node_attributes(self.G, 'community', {n: partition[n]})

        self.l.append("community")
        return self
Example #5
def prepare_network(df):
    df.set_index('yearID', inplace=True)

    # Create co-occurrence matrix
    cooc = df.dot(df.T) * (1 - np.eye(df.shape[0]))
    cooc.to_csv('cooc.csv')

    slicing = 3
    weights = cooc[cooc >= slicing]
    weights = weights.stack()
    weights = weights / weights.max()
    cd_network = weights.to_dict()
    cd_network = {key: float(value) for key, value in cd_network.items()}

    player_network = nx.Graph()
    player_network.add_edges_from(cd_network)
    nx.set_edge_attributes(player_network, 'weight', cd_network)

    partition = community.best_partition(player_network)
    nx.set_node_attributes(player_network, 'part', partition)

    if not os.path.isdir('results'):
        os.mkdir('results')

    with open('results/player_network.graphml', 'wb') as ofile:
        nx.write_graphml(player_network, ofile)
    return
Example #6
def get_topics_noun_phrases(num_news, draw=False, url='http://cnn.com'):

    texts = get_news(url, num_news)

    gb = NounPhraseGraphBuilder(text_processing.clean_punctuation_and_stopwords)
    gb.load_texts(texts)
    G = gb.create_graph()
    print "Graph built"

    partition = community.best_partition(G)
    words_by_part = get_words_by_partition(partition)

    print_topics_from_partitions(G, words_by_part, 10)

    mod = community.modularity(partition,G)
    print("modularity:", mod)

    #print_topics_from_partitions(G, words_by_part, 10)
    if draw:
        values = [partition.get(node) for node in G.nodes()]
        nx.draw_spring(G, cmap = plt.get_cmap('jet'), node_color = values, node_size=30, with_labels=False)
        plt.show()

    topics = get_topics_from_partitions(G, words_by_part, 10)

    return G, topics
Example #7
def add_metrics(g):
	"""
	Adds centrality metrics and community number attributes to each node in the given graph.
	Returns the graph with new node attributes.
	"""
	# Each function returns a dict keyed by node id with the computed metric as value
	deg_cent = nx.degree_centrality(g)
	close_cent = nx.closeness_centrality(g)
	between_cent = nx.betweenness_centrality(g)
	com = community.best_partition(g)
	# Only interested in communities with more than one member - get a list
	# of multimember communities, sorted by community number
	sorted_coms = get_sorted_multimember_coms(com)

	# Loop through nodes in the graph and give them new attributes
	for vertex in g.node.keys():
		g.node[vertex]["deg_cent"] = deg_cent[vertex]
		g.node[vertex]["close_cent"] = close_cent[vertex]
		g.node[vertex]["between_cent"] = between_cent[vertex]

		# Only nodes in a multimember community get a community number
		if com[vertex] in sorted_coms:
			# So community numbers start at 1, change community numbers to their position in the sorted_coms
			# list, plus 1
			# e.g. first multimember community number may be 3, this makes it 0 (position in list) + 1
			new_com_num = sorted_coms.index(com[vertex]) + 1
			g.node[vertex]["com"] = new_com_num
		# If node not in a multimember community, gets False as com number attribute
		else:
			g.node[vertex]["com"] = False

	return g
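
add_metrics relies on a get_sorted_multimember_coms helper that is not shown above; a plausible sketch of it, offered purely as an assumption about its behaviour:

# Hypothetical helper (not part of the original project): returns the
# community numbers that have more than one member, sorted ascending.
from collections import Counter

def get_sorted_multimember_coms(com):
	sizes = Counter(com.values())  # community number -> member count
	return sorted(c for c, size in sizes.items() if size > 1)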
Example #8
def _partitions(graph):
    """
    Internal use only.
    Finds partitions with louvain method and returns dict
    """
    # Get Partitions with Louvain method
    part = community.best_partition(graph)

    # Structure them for info purpose
    parts = {}
    for item in part:
        n = part[item]  # Partition number
        if n not in parts:  # Check if list for partition exists
            parts[n] = []

        # add node with degree in graph (for sorting) as tuples to list
        parts[n].append(item)  # ((item, graph.degree()[item]))

    # Use degree to find name for category
    names = {key: max(
        [(item, graph.degree(weight='weight')[item]) for item in parts[key]],
        key=itemgetter(1))
        [0] for key in parts}

    # New return dict. ToDo: make it like this right away
    res = {key: {'name': names[key], 'categories': parts[key]}
           for key in parts}

    return res
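
A tiny demonstration of the structure _partitions() returns, shown only as an assumption (the graph and its labels are made up):

# Hypothetical demo: two obvious communities in a small weighted graph;
# each entry is named after its highest weighted-degree member.
import networkx as nx

demo = nx.Graph()
demo.add_weighted_edges_from([('a', 'b', 3), ('a', 'c', 3), ('b', 'c', 1),
                              ('x', 'y', 2), ('x', 'z', 2), ('y', 'z', 2)])
for number, info in _partitions(demo).items():
    print(number, info['name'], info['categories'])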
Example #9
 def data(self, **kw):
     try:
         with closing(open('cache.json', 'r')) as data_file:
             print 'Reading from cache'
             return data_file.read()
     except IOError:
         print 'Fetching data'
         with closing(open('cache.json', 'w')) as data_file:
             foaf_graph = None
             try:
                 with closing(open('graph_cache.json', 'r')) as graph_file:
                     print 'Reading from graph cache'
                     foaf_graph = jg.load(graph_file)
             except IOError:
                 foaf_graph = retrieve_foaf(FBTOKEN)
             clusters = community.best_partition(foaf_graph)
             degree_distribution = get_histograms(foaf_graph)
             cluster_counts = get_cluster_counts(clusters)
             top10 = get_top_degree(foaf_graph, 10)
             foaf_json_graph = json.loads(jg.dumps(foaf_graph))
             ob = foaf_graph.degree()
             infos = {
                 'graph':foaf_json_graph,
                 'clusters':clusters,
                 'cluster_counts':cluster_counts,
                 'degree_distribution':degree_distribution,
                 'degree':foaf_graph.degree(),
                 'top10':top10
             }
             foaf_data = json.dumps(infos)
             data_file.write(foaf_data)
             return foaf_data
Example #10
    def detect_communities(self, graph, users, resolution, fraction):

        partitions = community.best_partition(graph.to_undirected(), resolution=resolution)

        counter = Counter(partitions.values())
        number_of_nodes = sum(counter.values())

        self._logger.info("Counter %s", counter)

        communities = [i for i in counter.items() if i[1] > fraction * number_of_nodes]

        self._logger.info("Number of nodes: %d", number_of_nodes)
        self._logger.info("Number of communities to map: %d", len(communities))
        self._logger.info("Communities: %s", communities)

        partitions_to_com = dict.fromkeys(set(partitions.values()), CommunityUser.UNCLASSIFIED)

        output = {}

        for com, _ in communities:
            com_nodes = [users[n].get_classification() for n in partitions.keys() if partitions[n] == com]
            com_classes = Counter(com_nodes)

            self._logger.info("%d: %s", com, com_classes)
            partitions_to_com[com] = com_classes.most_common(1)[0][0]
            output[com] = (partitions_to_com[com], com_classes)

        for node in graph.nodes():
            c = partitions[node]
            graph.node[node]["community"] = c
            graph.node[node]["classification"] = partitions_to_com[c]

        # json.dump(output, open("per_classification.txt", "w"))

        return graph
Example #11
def detect_communities(graph, verbose=False):
    graph = graph_from_csv(graph)
    partition = community.best_partition(graph)
    if verbose:
        print "%i partitions" % len(set(partition.values()))
    nx.set_node_attributes(graph, 'partition', partition)
    return graph, partition
Example #12
def detect_communities(graph, users, resolution=1.0, fraction=0.05):
    partitions = community.best_partition(graph.to_undirected(), resolution=resolution)

    counter = Counter(partitions.values())
    number_of_nodes = sum(counter.values())
    communities = [i for i in counter.items() if i[1] > fraction * number_of_nodes]

    partitions_to_com = dict.fromkeys(set(partitions.values()), CommunityUser.UNCLASSIFIED)

    output = {}

    for com, _ in communities:
        com_nodes = [users[n].get_classification() for n in partitions.keys() if partitions[n] == com]
        com_classes = Counter(com_nodes)
        partitions_to_com[com] = com_classes.most_common(1)[0][0]
        output[com] = (partitions_to_com[com], com_classes)

    for node in graph.nodes():
        c = partitions[node]
        graph.node[node]["community"] = c
        graph.node[node]["classification"] = partitions_to_com[c]
        if partitions_to_com[c] != "Unclassified" and users[node].get_classification() != "Unclassified":
            if partitions_to_com[c] != users[node].get_classification():
                print node, partitions_to_com[c], users[node].get_classification()
                wrongs.append({"user":node,"louvian": partitions_to_com[c], "class": users[node].get_classification()})
    return graph
Example #13
 def cluster(self):
     #first compute the best partition
     partition = community.best_partition(self.G)
     # print partition
     category = [(c,) for i, c in partition.items()]
     # print category
     return category
Example #14
def community_structure(G, candidates):
    partition = community.best_partition(G)
    to_return = {}
    candidates_found = 0
    for candidate in candidates:
        to_return[candidate] = {}
    candidate_to_pacs_in_community = {}
    candidate_to_community_size = {}
    nodes_so_far = 0
    candidates_per_community = []
    pacs_per_community = [] 
    community_size = []
    for com in set(partition.values()) :
        list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
        num_candidates = 0
        num_pacs = 0
        for node in list_nodes:
            if node[0:2] == 'C0': num_pacs += 1
            if node in candidates: num_candidates += 1
        for node in list_nodes:
            if node in candidates:
                to_return[node]['community_size'] = len(list_nodes)
                to_return[node]['pacs_in_community'] = num_pacs
                candidates_found += 1
                print 'found ', node
        if (candidates_found >= len(candidates)): break
    for candidate in candidates:
        if 'community_size' not in to_return[candidate]:
            to_return[candidate]['community_size']  = 0
            print "didn't find ", candidate
    for candidate in candidates:
        if 'pacs_in_community' not in to_return[candidate]:
            to_return[candidate]['pacs_in_community']  = 0
    return to_return
Example #15
    def _with_networkx(documents, threshold=1):
        G = nx.Graph()
        G.add_nodes_from(documents)
        nodes = G.nodes()
        for i, node in enumerate(nodes):
            for other in nodes[i+1:]:
                a = set(node.keywords)
                b = set(other.keywords)
                intersection = a.intersection(b)
                if len(intersection) > threshold:
                    G.add_edge(node, other)
                    G[node][other]['weight'] = len(intersection)

        # remove any isolated vertices before we perform community detection
        orphans = []
        for node in G.nodes():
            if not G.neighbors(node):
                G.remove_node(node)
                orphans.append(node)
        partition_lookup = community.best_partition(G).iteritems()
        G.add_nodes_from(orphans)
        partitions = {node.r_id: value for node, value in partition_lookup}
        as_json = json_graph.node_link_data(G)
        frontend_compatable = {}
        frontend_compatable['nodes'] = [node['id'] for node in as_json['nodes']]
        for node in frontend_compatable['nodes']:
            if G.neighbors(node):
                node.partition = partitions[node.r_id]
        frontend_compatable['nodes'] = [json.loads(node.to_json()) for node in frontend_compatable['nodes']]
        for node in frontend_compatable['nodes']:
            if node['_id'] in partitions:
                node['partition'] = partitions[node['_id']]
        frontend_compatable['edges'] = as_json['links']
        return frontend_compatable
Example #16
def louvain_method(G):
    partition = community.best_partition(G)
    print "Graph nodes:", len(G.nodes()), "edges:", len(G.edges())
    print "Partitions:", len(set(partition.values())),\
          "Modularity:", community.modularity(partition, G.to_undirected())
    print "\n\n"
    return partition
Example #17
def create_3comms_bipartite(n,m,p,No_isolates=True):
    
    import community as comm

    from networkx.algorithms import bipartite as bip
    u=0
    while  True:
        G=nx.bipartite_random_graph(n,m,p)
        list_of_isolates=nx.isolates(G)
        if No_isolates:
            G.remove_nodes_from(nx.isolates(G))
        partition=comm.best_partition(G)
        sel=max(partition.values())
        if sel==2 and nx.is_connected(G):
            break
        u+=1
        print u,sel
    ndlss=bip.sets(G)
    ndls=[list(i) for i in ndlss]
    slayer1=ndls[0]
    slayer2=ndls[1]
    layer1=[i for i,v in partition.items() if v==0]
    layer2=[i for i,v in partition.items() if v==1]
    layer3=[i for i,v in partition.items() if v==2]
    edgeList=[]
    for e in G.edges():
        if (e[0] in slayer1 and e[1] in slayer2) or (e[0] in slayer2 and e[1] in slayer1):
            edgeList.append(e)
    return G,layer1,layer2,layer3,slayer1,slayer2,edgeList,partition
Example #18
def printStats(filename):
	'''
	Converts a json adjacency list into a networkx graph to calculate and print the
	graph's
	  - average clustering coefficient
	  - overall clustering coefficient
	  - maximum diameter
	  - average diameter
	  - number of partitions using community.best_partition
	  - modularity of community.best_partition
	'''
	g = makeGraphFromJSON(filename)
	
	print "Average Clustering Coefficient: %f" % nx.average_clustering(g)
	print "Overall Clustering Coefficient: %f" % nx.transitivity(g)
	
	connected_subgraphs = list(nx.connected_component_subgraphs(g))
	largest = max(nx.connected_component_subgraphs(g), key=len)
	print "# Connected Components: %d" % len(connected_subgraphs)
	print "    Maximal Diameter: %d" % nx.diameter(largest)
	print "    Average Diameter: %f" % nx.average_shortest_path_length(largest)

	# Find partition that maximizes modularity using Louvain's algorithm
	part = community.best_partition(g)	
	print "# Partitions: %d" % (max(part.values()) + 1)
	print "Louvain Modularity: %f" % community.modularity(part, g)
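
makeGraphFromJSON is not shown in this example; one possible reading, offered only as an assumption about the expected file format (a JSON object mapping each node to its list of neighbours):

# Hypothetical implementation of makeGraphFromJSON (an assumption, not the
# original): expects {"node": ["neighbour", ...], ...} in the file.
import json
import networkx as nx

def makeGraphFromJSON(filename):
	with open(filename) as f:
		adjacency = json.load(f)
	g = nx.Graph()
	for node, neighbours in adjacency.items():
		g.add_node(node)
		for neighbour in neighbours:
			g.add_edge(node, neighbour)
	return g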
Example #19
def evaluate():

    texts = get_texts()
    gb = words_graph.SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords, stem_words=False)
    #gb = words_graph.SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    #gb = words_graph.WindowGraphBuilder(text_processing.clean_punctuation_and_stopwords, stem_words=False)
    #gb = words_graph.NounPhraseGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)

    gb.load_texts(texts)
    G = gb.create_graph()

    partition = community.best_partition(G)
    #words_by_part = topics.get_words_by_partition(partition)
    words_by_part = graph_cluster.get_overlap_clusters(G, 9, 1)

    computed_topics = topics.get_topics_from_partitions(G, words_by_part)

    #Word splitter
    # computed_topics2 = []
    # for topic in computed_topics:
    #     new_topic = []
    #     for phrase in topic:
    #         new_topic.extend(phrase.split(' '))
    #     computed_topics2.append(new_topic)

    print compute_score(computed_topics, true_topics)
Example #20
def graphmltojson(graphfile, outfile):
	"""
	Converts GraphML file to json while adding communities/modularity groups
	using python-louvain. JSON output is usable with D3 force layout.
	Usage:
	>>> python convert.py -i mygraph.graphml -o outfile.json
	"""
	
	G = nx.read_graphml(graphfile)
	G = nx.Graph(G)
	#G = nx.DiGraph.to_undirected(G)

	#karate = Nexus.get(G)
	#cl = karate.community_fastgreedy()
	#k = 57
	#cl.as_clustering(k).membership

	#finds best community using louvain
	partition = community.best_partition(G)
 
	#adds partition/community number as node attribute named 'group'
	for n,d in G.nodes_iter(data=True):
		d['group'] = partition[n]
 
	node_link = json_graph.node_link_data(G)
	json = json_graph.dumps(node_link)
	
	# Write to file
	fo = open(outfile, "w")
	fo.write(json);
	fo.close()
Example #21
    def test_karate(self):
        """"test modularity on Zachary's karate club"""
        graph = nx.karate_club_graph()
        part = co.best_partition(graph)
        self.assertTrue(co.modularity(part, graph) > 0.41)

        for e1, e2 in graph.edges_iter():
            graph[e1][e2]["test_weight"] = 1.

        part_weight = co.best_partition(graph, weight="test_weight")
        self.assertAlmostEqual(co.modularity(part, graph),
                               co.modularity(part_weight, graph, "test_weight"))

        part_res_low = co.best_partition(graph, resolution=0.1)
        self.assertTrue(
            len(set(part.values())) < len(set(part_res_low.values())))
Example #22
def plot_modularity(G):
	outFile = outDir + 'communities'
	modularity = collections.Counter()
	mod = community.best_partition(G)
	modList = mod.values()

	for i in np.arange(len(modList)):
		modularity[modList[i]] += 1

	mean = np.mean(modularity.values())
	std_dev = np.std(modularity.values())
	start = min(modularity.keys(), key=int)
	end = max(modularity.keys(), key=int)

	fig, ax = pylab.subplots()
	ax.scatter(modularity.keys(),modularity.values(),color=colors[0])
	pylab.axhline(mean,color=colors[2],label="mean")
	pylab.axhline(mean+std_dev,color=colors[1],label="standard deviation")
	pylab.axhline(mean-std_dev,color=colors[1])
	ax.set_ylabel('Number of LPs')
	ax.set_xlabel('Modularity Class')
	ax.ticklabel_format(useOffset=False)
	ya = ax.get_yaxis()
	ya.set_major_locator(pylab.MaxNLocator(integer=True))
	pylab.xticks(np.arange(start, end+1,10)) # change 10 to 1 (or smaller number) if # of communities is small
	pylab.title('Communities in LP Communication Graph')
	pylab.legend(loc='best', shadow=True)
	display_graph(outFile)
Example #23
def main():

    genedict = read_raw_csv(finalURL)

    filterGenes = json.load(open(filterURL, 'r'))

    tempdict = {}
    gtoidict = {}
    itogdict = {}

    for movie in genedict.keys():
        for gene in genedict[movie]['Genes']:
            if gene in filterGenes:
                continue
            gene = str(gene).lower()
            tempdict[gene] = 0

    count = 0
    for gene in tempdict.keys():
        gtoidict[gene] = count
        itogdict[count] = gene
        count += 1

    G = fill_Graph(genedict, count, gtoidict, itogdict, filterGenes)
    mat = NX.to_numpy_matrix(G)
    res = C.best_partition(G)

    get_components_of_G(mat, res, gtoidict, itogdict)
Example #24
def testGraph(fileName, r=1):
    G = nx.read_graphml(fileName)
    partitions = community.best_partition(G, resolution=r)

    inv_map = {}
    for k, v in partitions.iteritems():
        inv_map[v] = inv_map.get(v, [])
        inv_map[v].append(k)

    intra_community_distance = {}
    for c in inv_map:
        intra_community_distance[c] = get_inter_nodes_distance(G, inv_map[c])
        inter_community_distance = {}

    for c1, c2 in combinations(inv_map.keys(), 2):
        inter_community_distance[(c1, c2)] = get_intra_community_distance(G,
                                                                          inv_map[c1],
                                                                          inv_map[c2])

    inter_node_ratios = {}
    for c1, c2 in inter_community_distance:
        inter_node_ratios[(c1, c2)] = np.power(inter_community_distance[(c1, c2)], 2) / (
            intra_community_distance[c1] * intra_community_distance[c2]
        )

    pprint.pprint(intra_community_distance)
    pprint.pprint(inter_community_distance)

    # pprint.pprint(inter_node_ratios)

    return np.average(inter_node_ratios.values()), np.std(inter_node_ratios.values())
Example #25
def t_delta_partition(t_delta_matrix,sm,verbose=False):
    import community;
    g=nx.to_networkx_graph(t_delta_matrix+t_delta_matrix.T - np.diag(t_delta_matrix.diagonal()) ,create_using=nx.Graph()); 
    if verbose==True:
        plt.figure, plt.pcolor(np.array(nx.to_numpy_matrix(g))), plt.colorbar();
        plt.show()
    return community.best_partition(g);
Example #26
def assign_community(graph):
    g=nx.Graph(graph)
    partition=community.best_partition(g)
    print "Partition found: ",len(set(partition.values()))
    for n in g.nodes_iter():
        g.node[n]["partition"]=partition[n]
    return g
Example #27
def saveCluster(outPutDirectory, subGraphs, idToApps, edgeLimit, outputName ):
    counter = 0
    
    for graph in subGraphs:
        if graph.number_of_nodes() > 1:
            output = open(str(outPutDirectory) + "/subGraphNodesFiltered_" + outputName + "_" + str(int(edgeLimit)) + "_" + str(counter) + ".txt", 'w')
            for node in graph.nodes():
                output.write(str(idToApps[node]) + "\n")
            output.close()
            counter += 1
        if graph.number_of_nodes() > 50:
            partition = community.best_partition(graph)
            counter2 = 0
            processing = 1
            while(processing == 1):
                processing = 0
                for p in partition:
                    if partition[p] == counter2:
                        processing = 1
                if processing == 1:
                    output2 = open(str(outPutDirectory) + "/subGraphNodesFilteredBigCluster_"+ outputName + "_" + str(int(edgeLimit)) + "_" + str(counter2) + ".txt", 'w')
                    for p in partition:
                        if partition[p] == counter2:
                            output2.write(str(idToApps[p]) + "\n")
                    output2.close()
                counter2 += 1
Example #28
def InitClusterAnalysis(graph):
    global comMemClean
    global comMemNames
    global comsizeClean
    global partition
    print "starting best partition algorithm (will take a while)...."
    partition = community.best_partition(graph)
    modularity = community.modularity(partition, graph)
    LogPrint("the modularity is %f"%modularity)
    if partition !=None:
        for node in partition.iteritems():
            if comSize.has_key(node[1]):
                comSize[node[1]]= comSize[node[1]]+1
                comMem[node[1]].append(node[0])
            else:
                comSize[node[1]]=1
                comMem[node[1]]=[]
    for cSize in comSize.iteritems():
        if cSize[1] >1:
            print "cSize[1]=",cSize[1]
            comsizeClean[cSize[0]] =cSize[1]
            if len(comMem[cSize[0]])==1:
                print "why is this value only one member...",comMem[cSize[0]]
            comMemClean[cSize[0]] = comMem[cSize[0]]
    
    for memberIDs in comMemClean.iteritems():
        comMemNames[memberIDs[0]]=[]
        for member in memberIDs[1]:
            comMemNames[memberIDs[0]].append(utils.GetNodeName(member,graph))  
Example #29
def getRandomPageRanks(filename):
	Ga=nx.read_graphml(sys.argv[1])

	# create a copy of the graph and extract giant component
	# get component size distribution
	cc=nx.connected_components(Ga)
	cc_dict={}
	for x in range(0,len(cc)):
		try:
			cc_dict[len(cc[x])].append(x)
		except KeyError:
			cc_dict[len(cc[x])]=[]
			cc_dict[len(cc[x])].append(x)

	isolates=nx.isolates(Ga)

	rg=nx.fast_gnp_random_graph(Ga.number_of_nodes(),2.0*Ga.number_of_edges()/(Ga.number_of_nodes()*(Ga.number_of_nodes()-1)))
	c_rg=nx.average_clustering(rg)
	rg_cc=nx.connected_component_subgraphs(rg)[0]
	rg_asp=nx.algorithms.shortest_paths.generic.average_shortest_path_length(rg_cc)

	p_rg=community.best_partition(rg_cc)
	m_rg=community.modularity(p_rg,rg_cc)

	pageranks = nx.pagerank_numpy(rg)
	return pageranks
Example #30
def analyze_graph(G):
    #centralities and node metrics
    out_degrees = G.out_degree()
    in_degrees = G.in_degree()
    betweenness = nx.betweenness_centrality(G)
    eigenvector = nx.eigenvector_centrality_numpy(G)
    closeness = nx.closeness_centrality(G)
    pagerank = nx.pagerank(G)
    avg_neighbour_degree = nx.average_neighbor_degree(G)
    redundancy = bipartite.node_redundancy(G)
    load = nx.load_centrality(G)
    hits = nx.hits(G)
    vitality = nx.closeness_vitality(G)
    
    for name in G.nodes():
        G.node[name]['out_degree'] = out_degrees[name]
        G.node[name]['in_degree'] = in_degrees[name]
        G.node[name]['betweenness'] = betweenness[name]
        G.node[name]['eigenvector'] = eigenvector[name]
        G.node[name]['closeness'] = closeness[name]
        G.node[name]['pagerank'] = pagerank[name]
        G.node[name]['avg-neigh-degree'] = avg_neighbour_degree[name]
        G.node[name]['redundancy'] = redundancy[name]
        G.node[name]['load'] = load[name]
        G.node[name]['hits'] = hits[name]
        G.node[name]['vitality'] = vitality[name]
        
    #communities
    partitions = community.best_partition(G)
    for member, c in partitions.items():
        G.node[member]['community'] = c   
    
    return G
Example #31
        'eleinamazing', 'A_boy_and_his_boston', 'lebbe', 'GlobTrotters',
        'Nichchk', 'hellobutno', 'Moskau50', 'Turd111', 'RogueSexToy',
        'Blackhk', '22_hours_ago', 'humanity_is_doomed', 'ASketchyLlama',
        'leftrighttopdown', 'IronKanabo', 'ZWF0cHVzc3k', 'simian_ninja',
        'Eitoku_K', 'pomelopomelo'
    ]
    user_graph, weight_dict, id_user_dic = query_api("2019-07-01", 30, 10000,
                                                     july_karma_list)
    weight_edges = [(x, y, val) for (x, y), val in weight_dict.items()]
    user_graph.add_weighted_edges_from(weight_edges)
    july_final_karma = fake_list + july_karma_list
    if action == 'attack':
        run_attack(user_graph, july_final_karma, weight_dict)
    elif action == 'partition':
        run_partition(user_graph, july_final_karma, weight_dict)
    elif action == 'pruning':
        run_pruning(user_graph, july_final_karma, weight_dict)
    elif action == 'practical':
        # practical implementation
        for ele in july_karma_list:
            user_graph.remove_node(ele)
            if ele in weight_dict:
                del weight_dict[ele]
    # Community detection
    attack_graph = nx.read_gml("m3_oct_attack.gml")
    part3 = community.best_partition(attack_graph)
    origin_graph = nx.read_gml("m2_aug.gml")
    part1 = community.best_partition(origin_graph)
    defense_graph = nx.read_gml("m3_oct_defense.gml")
    part2 = community.best_partition(defense_graph)
Example #32
#python src\graph-construction\louvain.py -e data/normalized/points_delaunay_Chinese_edge_full.csv -d data/normalized/louvain_dict_edge.json -o data/normalized/louvain_edge.csv
parser = argparse.ArgumentParser(description='Detect communities with Louvain')
parser.add_argument('--edgefile',
                    '-f',
                    help='csv of edges',
                    default='data/graph_calgary_knn_20.csv')
parser.add_argument('--dictfile',
                    '-o',
                    help='outfile',
                    default="data/louvain_calgary_dict_knn_20.json")
args = parser.parse_args()

edge = pd.read_csv(args.edgefile, ' ', header=0)
graph = nx.from_pandas_edgelist(edge, source='r1', target='r2')

partition = community.best_partition(
    graph)  # dictionary of node_id --> community #

# assignments = {} #dict from community # --> list of nodes in community
# for part, idx in partition.iteritems():
# 	if idx not in assignments:
# 		assignments[idx] = [part]
# 	else:
# 		assignments[idx].append(part)
# # print assignments
# with open(args.outfile, "w+") as f:
# 	for idx, assignment in tqdm(assignments.iteritems()):
# 		print len(assignment)
# 		# print assignment
# 		f.write(", ".join(assignment))
# 		f.write("\n")
Example #33
def calculateModularity(network):
    bestPartition = community.best_partition(network)
    return community.modularity(bestPartition, network)
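
A minimal call of calculateModularity, shown only as an illustrative assumption:

# Hypothetical usage sketch (not from the original project).
import networkx as nx
print(calculateModularity(nx.karate_club_graph()))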
Example #34
        # create separate file to save the degree of each node
        degrees = cur_graph.degree()

        with open(
                infilename.replace(".csv", "") + "-degree-dist-pval" +
                str(alpha) + ".csv", "w") as out_file:
            w = csv.DictWriter(out_file, degrees.keys())
            w.writeheader()
            w.writerow(degrees)

        # calculate number of nodes w/ degree > 0 and total number of edges
        num_nodes = len(cur_graph.nodes())
        num_edges = len(cur_graph.edges())

        # calculate clustering coefficients
        avg_clustering_coeff = nx.average_clustering(cur_graph)

        # calculate shortest paths
        avg_shortest_path_length = nx.average_shortest_path_length(cur_graph)

        # calculate communities and modularity value
        partition = community.best_partition(cur_graph)
        num_communities = len(set(partition.values()))
        modularity_value = community.modularity(partition, cur_graph)

        network_outfile.write(
            str(alpha) + "," + str(num_nodes) + "," + str(num_edges) + "," +
            str(avg_clustering_coeff) + "," + str(avg_shortest_path_length) +
            "," + str(num_communities) + "," + str(modularity_value) + "\n")
Example #35
# time = datetime.now().strftime('%H:%M:%S')
# print "*** Starting to plot whole graph... @:" + time
# igraph.plot(g, "complete-graph.pdf", **visual_style)
# time = datetime.now().strftime('%H:%M:%S')
# print "*** Graph plotting ENDED ... @:" + time


# make the graph with the networkX library
gx = nx.Graph()  # this is undirected graph
# gx = nx.DiGraph()  # this is directed graph
gx.add_nodes_from(nodes)
gx.add_edges_from(edges)
# find communities
time = datetime.now().strftime('%H:%M:%S')
print "*** Starting community calculation... @:" + time
clusters = community.best_partition(gx)
time = datetime.now().strftime('%H:%M:%S')
print "*** Community calculation ENDED ... @:" + time

# time = datetime.now().strftime('%H:%M:%S')
# print "*** Starting community plotting... @:" + time
# nx.draw_spring(gx, cmap=plt.get_cmap('jet'), node_color='#A0CBE2',edge_color='#BB0000', node_size=25, with_labels=False)
# plt.savefig("communities-graph.pdf")
# time = datetime.now().strftime('%H:%M:%S')
# print "*** Community plotting ENDED ... @:" + time

# Find Communities with iGraph (had problem)
# clusters = g.community_multilevel(return_levels=True)
# igraph.plot(clusters, "communities-graph.png", mark_groups=True, **visual_style)

# an adjacency list is a collection of unordered lists used to represent a finite graph.
Example #36
    rows = edgecsv.read().split('\n')
    edges = [r.split(',')[:2] for r in rows[1:]]
    weights = [r.split(',')[-1] for r in rows[1:]]
    edge_tuples = [(e[0], e[1], int(weights[i])) for i, e in enumerate(edges)]

# Only get edges for the select nodes in the node csv.
edges = []
for e in edge_tuples:
    if all(x in list(node_ids) for x in e[:2]):
        edges.append(e)

# Initialize graph, add nodes and edges, calculate modularity and centrality.
G = nx.Graph()
G.add_nodes_from(list(node_ids))
G.add_weighted_edges_from(edges)
groups = community.best_partition(G)
degree = cn.degree_centrality(G)
betweenness = cn.betweenness_centrality(G, weight='weight')
eigenvector = cn.eigenvector_centrality(G, weight='weight')

# Add node attributes for name, modularity, and three types of centrality.
nx.set_node_attributes(G, 'name', node_dict)
nx.set_node_attributes(G, 'group', groups)
nx.set_node_attributes(G, 'degree', degree)
nx.set_node_attributes(G, 'betweenness', betweenness)
nx.set_node_attributes(G, 'eigenvector', eigenvector)

# Create json representation of the graph (for d3).
data = json_graph.node_link_data(G)

# You could create the needed json without NetworkX (but you would forfeit network metrics).
Example #37
import community
import networkx as nx
import matplotlib.pyplot as plt

#better with karate_graph() as defined in the networkx example.
#Erdos-Renyi graphs don't have true community structure
G = nx.erdos_renyi_graph(30, 0.05)

#first compute the best partition
partition = community.best_partition(G)

#drawing
size = float(len(set(partition.values())))
pos = nx.spring_layout(G)
count = 0.
for com in set(partition.values()):
    count = count + 1.
    list_nodes = [
        nodes for nodes in partition.keys() if partition[nodes] == com
    ]
    nx.draw_networkx_nodes(G,
                           pos,
                           list_nodes,
                           node_size=20,
                           node_color=str(count / size))

nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()
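
The same drawing can be done in one pass by letting a colormap map community ids to colours; a self-contained variant sketch, offered as an assumption rather than part of the snippet above:

# Variant sketch: colour every node by its community id in a single call.
import community
import matplotlib.pyplot as plt
import networkx as nx

G = nx.karate_club_graph()
partition = community.best_partition(G)
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=40,
                       cmap=plt.get_cmap('jet'),
                       node_color=[partition[n] for n in G.nodes()])
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()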
Example #38
def clustering_scores(args,
                      latent,
                      labels,
                      cells,
                      dataset,
                      suffix,
                      tlabels,
                      louvain_num=15,
                      prediction_algorithm="knn",
                      X_tf=None,
                      ensemble=False,
                      batch_indices=None,
                      save_cluster=False,
                      seed=42):
    from scipy.spatial import distance

    vec = latent
    mat = kneighbors_graph(latent,
                           louvain_num,
                           mode='distance',
                           include_self=True).todense()
    print('mat', mat.shape)

    alg = 'louvain'
    if alg == 'louvain':
        labels_pred = []
        G = nx.from_numpy_matrix(mat)
        partition = community.best_partition(G, random_state=seed)
        for i in range(mat.shape[0]):
            labels_pred.append(partition[i])
    elif alg == 'leiden':
        vcount = max(mat.shape)
        sources, targets = mat.nonzero()
        edgelist = zip(sources.tolist(), targets.tolist())
        g = ig.Graph(vcount, edgelist)
        partition = leidenalg.find_partition(
            g, leidenalg.ModularityVertexPartition)
        print(partition.membership)

        labels_pred = partition.membership

    labels_pred = np.array(labels_pred)

    if args.plot == 'tsne':
        embedding = TSNE(random_state=seed, perplexity=50).fit_transform(vec)
    elif args.plot == 'umap':
        embedding = umap.UMAP(random_state=42).fit_transform(vec)

    print('pred labels is', labels_pred.shape, np.max(labels_pred), vec[0, :5],
          embedding[:5], labels_pred[:100])
    print('labels is', np.array(labels).shape)
    show_tsne(
        embedding, labels_pred, 'result/%s/%s-GMVAE-%s-%s-pred.png' %
        (dataset, suffix, 'alpha-gan', ensemble))
    np.savetxt('result/%s/labels.txt' % (dataset), labels_pred)

    #if labels is not None:
    result_filename = 'result/%s-%d-%d-%d-cluster_result.csv' % (
        dataset, args.n_hidden, args.n_latent, louvain_num)
    if len(labels) == 0:
        with open(result_filename, 'w') as f:
            f.write('cell,predicted label,tsne-1,tsne-2\n')
            for cell, pred, t in zip(cells, labels_pred, embedding):
                f.write('%s,%d,%f,%f\n' % (cell, pred, t[0], t[1]))
        if batch_indices is not None:
            print('batch', batch_indices)
            show_tsne(embedding,
                      batch_indices,
                      'result/%s/%s-%s-batch.png' %
                      (dataset, suffix, 'alpha-gan'),
                      tlabels=batch_indices)
    else:
        show_tsne(embedding,
                  labels,
                  'result/%s/%s-GMVAE-%s-%s-true.png' %
                  (dataset, suffix, 'alpha-gan', ensemble),
                  tlabels=tlabels)
        if batch_indices is None:
            with open(result_filename, 'w') as f:
                f.write('cell,tlabel id,label,predicted label,tsne-1,tsne-2\n')
                for cell, label, tlabel, pred, t in zip(
                        cells, labels, tlabels, labels_pred, embedding):
                    f.write('%s,%d,%s,%d,%f,%f\n' %
                            (cell, label, tlabel, pred, t[0], t[1]))
        else:
            with open(result_filename, 'w') as f:
                f.write(
                    'cell,tlabel id,label,predicted label,tsne-1,tsne-2,batch\n'
                )
                for cell, label, tlabel, pred, t, batch in zip(
                        cells, labels, tlabels, labels_pred, embedding,
                        batch_indices):
                    f.write('%s,%d,%s,%d,%f,%f,%d\n' %
                            (cell, label, tlabel, pred, t[0], t[1], batch))

        #print(labels, labels_pred, latent)
        #asw_score = silhouette_score(latent, labels)
        asw_score = 0
        nmi_score = NMI(labels, labels_pred)
        ari_score = ARI(labels, labels_pred)
        homo_score = homogeneity_score(labels, labels_pred)
        #uca_score = unsupervised_clustering_accuracy(labels, labels_pred)
        print(
            "Clustering Scores:\nHOMO: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f" %
            (homo_score, nmi_score, ari_score, 0))

        if batch_indices is not None:
            show_tsne(embedding,
                      batch_indices,
                      'result/%s/%s-%s-batch.png' %
                      (dataset, suffix, 'alpha-gan'),
                      tlabels=batch_indices)
        return asw_score, nmi_score, ari_score, 0
Example #39
File: test6.py Project: lee818/test
 print c_g
 print "##########now is the degree sequence########## "
 d_g = graph_utils.get_degrees(A)
 d_b = graph_utils.get_degrees(B_matrix)
 print "this is the degree  of the original Graph G and new graph B"
 print d_b
 print d_g
 cov = nm.cov(d_b, d_g, ddof=0)[0][1]
 standard_d_b = nm.std(d_b, ddof=0)
 standard_d_g = nm.std(d_g, ddof=0)
 ppcc = cov / (standard_d_b * standard_d_g)
 print(str(ppcc))
 d_pcc.append(ppcc)
 print(str(cov))
 print "##########now is the partition##########"
 p_g = community.best_partition(G)
 p_b = community.best_partition(B)
 c_a_g = graph_utils.average_clustering(A)
 c_a_b = graph_utils.average_clustering(B_matrix)
 print "this is the partition  of the original Graph G and new graph B"
 print p_b
 print p_g
 list1 = list(p_b.values())
 list2 = list(p_g.values())
 temp = 0
 for i in range(len(list1)):
     if list1[i] == list2[i]:
         temp += 1
 print float(temp) / len(list1)
 partiton_ratio.append(float(temp) / len(list1))
 print "this is the avg clustering of the original Graph G and new graph B"
Example #40
 def rodando_louvain(self, porcentagem_do_sample):
     self.criando_matriz_de_similaridade(
         porcentagem_do_sample=porcentagem_do_sample)
     self.clusters = (community.best_partition(
         self.G, weight='weight', randomize=True))
Example #41
def get_data_v3(cuda=True):

    # Here the data is obtained from pytorch-geometric to eliminate unnecessary shuffling done in Kipf's code
    edge_index = pk.load(open("graph.pkl", "rb"))
    row, col = edge_index
    edges = [(int(u), int(v)) for u, v in zip(row.tolist(), col.tolist())]
    g = nx.Graph()
    g.add_edges_from(edges)
    print("Graph Read ")

    nnodes = nx.number_of_nodes(g)
    nodes = nx.nodes(g)
    #print(nodes)
    cr = dict(nx.core_number(g))
    cr_vals = set(v for v in cr.values())
    cr_dict = {}
    for d in cr_vals:
        tmp = []
        for k, v in cr.items():
            if v == d:
                tmp.append(k)
        cr_dict[d] = tmp
    print("core numbers of original graph", len(cr_vals))

    print("number of nodes--", nnodes)
    cut = int(0.1 * nnodes)
    print("cut value--", cut)
    #print("number of nodes,edges ",g.number_of_nodes(),g.number_of_edges())
    adj = np.zeros(
        (torch.max(edge_index).item() + 1, torch.max(edge_index).item() + 1))
    for u, v in list(g.edges()):
        adj[u, v] = 1
        adj[v, u] = 1
    adj = nx.to_numpy_array(g, dtype=np.float)
    adj = adj + np.eye(adj.shape[0])
    adj = sp.sparse.coo_matrix(adj)
    print("Adjacency Made")

    adj = torch.FloatTensor(adj.todense())
    features = pk.load(open("feature.pkl", "rb"))
    features = normalize_features(features.numpy())
    features = torch.FloatTensor(features)
    print("Features Normalized ")

    labels = pk.load(open("label.pkl", "rb"))
    lb = labels.numpy()
    ground_dict = Counter(lb)
    classes = len(ground_dict)

    #community detection --Infomap
    info = infomap.Infomap("--two-level --silent -s 8")
    for e in list(g.edges()):
        info.addLink(*e)
    info.run()
    c = info.getModules()  #node:community
    z = defaultdict(list)
    for u in c:
        z[c[u]].append(u)  #community:[nodes]
    #print("number of communities detected")
    #print (len(z))
    com_size = {}
    for k, v in z.items():
        com_size[k] = len(v)
    #print(com_size)

    #community detection-- Louvain
    partition = community.best_partition(g)  #node:community
    com = defaultdict(list)
    for p in partition:
        com[partition[p]].append(p)
    print("number of communities detected")
    print(len(com))

    a = set()
    a_wt = []
    for te in edges:
        u = te[0]
        v = te[1]
        com_u = partition[u]
        com_v = partition[v]
        t = (com_u, com_v)
        a.add(t)
        if com_u > com_v:
            m = (com_v, com_u)
            a_wt.append(m)
        else:
            a_wt.append(t)

    edge_wt = Counter(a_wt)
    #print(edge_wt)

    meta_wt_edge = {}
    #print(len(a))
    meta_nodes = list(com.keys())
    #print (len(meta_nodes))
    per = list(permutations(meta_nodes, 2))
    b = set()
    for cc in per:
        b.add(cc)
    meta_edge = a.intersection(b)

    for k, v in edge_wt.items():
        if k in meta_edge:
            meta_wt_edge[k] = v

    #print("meta edges")
    #print(meta_wt_edge)

    meta_net = nx.Graph()
    meta_net.add_nodes_from(meta_nodes)
    meta_net.add_edges_from(meta_edge)
    print("meta graph formed")

    m_nodes = nx.number_of_nodes(meta_net)
    print("number of meta nodes", m_nodes)

    m_edges = meta_net.number_of_edges()
    print("number of meta edges", m_edges)

    train_ids = []

    edge_set = set(edges)
    for m in meta_nodes:
        coms = com[m]
        perm = set(permutations(coms, 2))
        in_edges = edge_set.intersection(perm)
        #print(in_edges)
        in_net = nx.Graph()
        in_net.add_edges_from(in_edges)
        #print(in_net.edges())
        in_clus = nx.clustering(in_net)
        #print("clustering",in_clus)
        h = max(in_clus.items(), key=operator.itemgetter(1))[0]
        train_ids.append(h)

    #meta_edgelist = list(meta_net.edges())
    '''cores = dict(nx.core_number(meta_net))

    mst = nx.minimum_spanning_tree(meta_net, algorithm='prim')
    #print("tree edges",mst.edges())
    mst_edgelist = list(sorted(mst.edges()))
    mst_nodes =  list(mst.nodes())
    mst_adj = {}
    for s in mst_nodes:
        mst_l = []
        for e in mst_edgelist:
            if s == e[0] :
                mst_l.append(e[1])
        mst_adj[s] = mst_l

    #print(mst_adj)
    #print(mst_edgelist)
    core_vals = set(v for v in cores.values())
    core_dict = {}
    for d in core_vals:
        tmp = []
        for k,v in cores.items():
            if v == d:
                tmp.append(k)
        core_dict[d] = tmp'''

    #print(core_dict)
    #print ("number of cores in meta network:", len(core_dict))
    '''core_class = {}
    for k,v in core_dict.items():
        cls = []
        for m in v:
            nd = z[m]
            for x in nd:
                cl = lb[x]
                cls.append(cl)
        core_lb = Counter(cls)
        mm =  max(v for k,v in core_lb.items())
        for k1,v1 in core_lb.items():
            if v1 == mm:
                core_class[k]=k1
    print("class information per core--")
    print(core_class)   #The class information/core is printed

    com_class = {}
    for mn in meta_nodes:
        cls = []
        nd = z[mn]
        for x in nd:
            cl = lb[x]
            cls.append(cl)
        com_lb = Counter(cls)
        mm = max(v for k,v in com_lb.items())
        for k1,v1 in com_lb.items():
            if v1 == mm :
                com_class[mn] = k1

    print("class information per community--")
    #print(com_class) #The class information/community is printed

    com_cls = []
    for k,v in com_class.items():
        com_cls.append(v)
    print(Counter(com_cls))

    sorted_core = dict(OrderedDict(sorted(core_dict.items(),reverse=True)))

    reverse_core = dict(OrderedDict(sorted(sorted_core.items())))'''

    '''t_n = []
    for v in sorted_core[25]:
        for t in z[v]:
            t_n.append(t)
    t_lb = []
    for t in t_n:
        t_lb.append(lb[t])''' #for checking the class labels distribution in each core

    #build 2nd order network--
    '''meta_info = infomap.Infomap("--two-level --silent -s 8")
    for e in list(meta_net.edges()):
        meta_info.addLink(*e)
    meta_info.run()
    cc = meta_info.getModules() #node:community
    zz = defaultdict(list)
    for u in cc:
        zz[cc[u]].append(u) #community:[nodes]
    print("number of meta communities detected")
    print (len(zz))

    meta_coms = {}
    for k,v in zz.items():
        cls = []
        for b in v:
            lbl = com_class[b]
            cls.append(lbl)
        metacom_lb = Counter(cls)
        meta_coms[k] = metacom_lb

    print("class information of meta communities of 2nd order network--")
    print(meta_coms)

    meta_cr = dict(nx.core_number(meta_net))
    meta_cr_vals = set(v for v in meta_cr.values())
    meta_cr_dict = {}
    for d in meta_cr_vals:
        tmp = []
        for k,v in meta_cr.items():
            if v == d:
                tmp.append(k)
        meta_cr_dict[d] = tmp
    print("cores in 2nd order network--")
    print(meta_cr_dict)

    #Selection of training nodes
    core_window =3

    t_cores = []
    cnt = 0
    for cr,coms in sorted_core.items():
        t_cores.append(cr)
        cnt += 1
        if cnt == core_window:
            break
    print("t_cores--",t_cores)
    #print("t_coms--",len(t_coms))
   
    #build adjacency matrix of edges--
    t_coms = core_dict[7]
    p = len(t_coms)
    rows,cols = (p,p)
    adje = [[0]*cols]*rows
    for me in meta_edgelist:
        u = me[0]
        v = me[1]
        if u in t_coms:
            if v in t_coms:
                #h += 1
                ui = t_coms.index(u)
                vi = t_coms.index(v)
                adje[ui][vi] += 1
    #print(adje)'''
    '''for me in meta_edge:
        u = me[0]
        if u == 5:
            print(me)'''
    '''t_arr = []
    for i in range(core_window):
        t_arr.append(0)

    
    tr_dict = {}
    for cls in range(classes):
        tr_nodes = []
        fl  = 0
        ar = 0
        cnt_cls = int(0.1*(ground_dict[cls]))
        print("cls and count--",cls,cnt_cls)
        while(True):
            for cr in t_cores:
                coms = core_dict[cr]
                j = t_arr[ar]
                cm = coms[j]
                j = (j+1)%len(coms)
                t_arr[ar] = j
                ar += 1
                #cm = int(np.random.choice(coms,1))
                nn = z[cm]
                n = int(np.random.choice(nodes,1))
                l = lb[n]
                if l == cls and n not in tr_nodes:
                    tr_nodes.append(n)
                    if len(tr_nodes) == cnt_cls:
                        fl = 1
                        break
                if ar == core_window:
                    ar = 0
            if fl == 1:
                tr_dict[cls] = tr_nodes
                break



    t_lbls = []
    for k,v in tr_dict.items():
        for t in v:
            lbl = lb[t]
            t_lbls.append(lbl)
            
    print("class level distribution--training labels",Counter(t_lbls))

    train_ids = []
    val_ids = []
    test_ids = []
    test_mask_ids = []
    for k,v in tr_dict.items():
        for t in v:
            train_ids.append(t)
    #for n in nodes2:
        #train_ids.append(n)
    f = 0
    while True:
        if len(train_ids)<cut:
            r = int(np.random.choice(nodes,1,replace = False))
            if r not in train_ids:
                train_ids.append(r)
                if len(train_ids)==cut:
                    f = 1
        if f == 1:
            break
    #print("train ids--",len(train_ids))'''
    #sorted_core = dict(OrderedDict(sorted(core_dict.items(),reverse=True)))
    #print(sorted_core)

    #c_meta_nodes = sorted_core[7]
    #y = int(np.random.choice(c_meta_nodes,1))

    #train_ids = []
    #train_coms = bfs(mst_adj,y)
    #print(train_coms)
    '''f = 0
    while True:
        for tc in train_coms:
            yy = z[tc]
            x = int(np.random.choice(yy,1))
            train_ids.append(x)
            if len(train_ids) == cut :
                f = 1
                break
        if f == 1:
            break
        else:
            continue'''

    #print(train_ids)

    #train-test nodes choice
    '''for m in meta_nodes:
        f_nodes = z[m]
        x = int(np.random.choice(f_nodes,1,replace=False))
        train_ids.append(x)'''

    val_ids = []
    test_ids = []
    rm_ids = []

    for n in nodes:
        if n not in train_ids:
            #if n not in nodes2:
            rm_ids.append(n)
    #print ("test ids--",len(test_ids))

    #val_ids.extend(rm_ids[0:int(0.1*len(nodes))])
    val_ids = np.random.choice(rm_ids, len(train_ids), replace=False)

    r_ids = []
    for n in rm_ids:
        if n not in val_ids:
            r_ids.append(n)
    #val_ids= np.random.choice(test_ids,int(0.1*len(nodes)),replace= False)
    test_ids = np.random.choice(r_ids, 1084, replace=False)
    #val_ids = np.random.choice(test_ids,int(0.1*len(nodes)),replace= False)
    #test_mask_ids = np.random.choice(test_ids,1084,replace = False)

    with open("test_labels_infomap.txt", 'wb') as fp:
        pk.dump(test_ids, fp)

    with open("training_labels_infomap.txt", "wb") as fp:
        pk.dump(train_ids, fp)

    idx_train = np.array(train_ids)
    idx_val = np.array(val_ids)
    idx_test = np.array(test_ids)
    print("Train Validation Test ", len(idx_train), len(idx_val),
          len(idx_test))

    if cuda:
        features = features.cuda()
        adj = adj.cuda()
        labels = labels.cuda()
        #idx_train = idx_train.cuda()
        #idx_val = idx_val.cuda()
        #idx_test = idx_test.cuda()
    #return g,adj,features,labels,idx_train,idx_val,idx_test
    return idx_train, idx_test, idx_val
Example #42
g = nx.read_graphml(sys.argv[1])

# no self-loops
g.remove_edges_from(g.selfloop_edges())

if g.order() == 0:
    print 'The graph contains no nodes'
    sys.exit(1)
else:
    try:
        # determine Louvain modularity score of entire graph
        if g.size() == 0:
            # no edges, modularity is 1
            mod = 1.0
        else:
            part = com.best_partition(g)
            mod = com.modularity(part, g)
    except Exception as e:
        print 'An exception occurred during modularity analysis'
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print exc_type, fname, exc_tb.tb_lineno
        sys.exit(1)

# count isolates
n_iso = len(nx.isolates(g))

# mean degree
mean_deg = np.mean(nx.degree(g).values())

# median degree
Example #43
def plot_graph(in_dir, item):
    '''plot the communities graph whose nodes and edges are extracted from
    the json file coocnetworks.json built by the function describe_corpus:

        {
        "nodes":[
            {"type":"AU","name":0,"item":"Abanades S","size":8},
            ...................................................
        ],
        "links":[
            {"type":"AU","source":0,"target":8,"Ncooc":5},
            ..............................................
        ]
        }

    where type = "AU", "S", "I", "CU", "S2", "IK", "AK", "TK", "R", "RJ"
    
    Returns the graph G.
    '''

    # Standard library imports
    import json
    import pprint
    from pathlib import Path

    # 3rd party imports
    import community as community_louvain
    import matplotlib.cm as cm
    import matplotlib.pyplot as plt
    import networkx as nx
    import numpy as np
    import pandas as pd

    # Local imports
    from .BiblioSpecificGlobals import LABEL_MEANING
    from .BiblioSpecificGlobals import VALID_LABEL_GRAPH


    assert (item in VALID_LABEL_GRAPH),\
            f'unknown type {item}: should be {", ".join(VALID_LABEL_GRAPH)}'

    # Extract the nodes and edges of the graph from the json file coocnetworks.json
    # for the selected type (item)
    # -----------------------------------------------------------
    file_coocnetworks = in_dir / Path('coocnetworks.json')
    with open(file_coocnetworks, 'r') as read_file:
        cooc = json.load(read_file)

    df = pd.DataFrame(cooc['links']).query('type==@item')
    G = nx.from_pandas_edgelist(df, source='source', target='target')
    dg = pd.DataFrame(cooc['nodes']).query('type==@item')
    G.add_nodes_from(dg['name'])
    for index, row in dg.iterrows():
        src_attr_dict = {k: row.to_dict()[k] for k in ['item', 'size']}
        G.nodes[row['name']].update(src_attr_dict)

    # compute the best partition
    partition = community_louvain.best_partition(G)
    nx.set_node_attributes(G, partition, 'community_id')

    # draw the graph
    pos = nx.spring_layout(G)
    node_size = np.array(list(nx.get_node_attributes(G, 'size').values())) * 70
    cmap = cm.get_cmap('viridis', max(partition.values()) + 1)
    fig = plt.figure(figsize=(15, 15))
    nx.draw_networkx_nodes(G,
                           pos,
                           partition.keys(),
                           node_size=node_size,
                           cmap=cmap,
                           node_color=list(partition.values()))
    nx.draw_networkx_edges(
        G,
        pos,
        alpha=0.9,
        width=1.5,
        edge_color='k',
        style='solid',
    )

    labels = nx.draw_networkx_labels(G, pos=pos, font_size=8, font_color='w')

    plt.title(
        f'Graph partition using the {LABEL_MEANING[item]} and the Louvain algorithm'
    )
    plt.show()

    node = nx.get_node_attributes(G, 'item')
    pprint.pprint(node)

    df = pd.DataFrame({
        node_id: [num_partition, node[node_id]]
        for node_id, num_partition in partition.items()
    }).T

    for g in df.groupby([0]):
        print(f'N° partition:{g[0]}, items: {g[1][1].to_list()}')

    del df, dg, src_attr_dict, partition, labels, node

    return G
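# A minimal usage sketch: in_dir must point to a directory holding the
# coocnetworks.json file described in the docstring (the path below is
# hypothetical), and item must be one of the valid labels, e.g. "AU":
#
#   from pathlib import Path
#   G = plot_graph(Path('./corpus_results'), 'AU')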
Example #44
0
def louvain(adata,
            resolution=None,
            random_state=0,
            restrict_to=None,
            key_added=None,
            adjacency=None,
            flavor='vtraag',
            directed=True,
            copy=False):
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the implementation
    of [Traag17]_. The Louvain algorithm has been proposed for single-cell
    analysis by [Levine15]_.

    This requires running :func:`~scanpy.api.pp.neighbors` first.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        The annotated data matrix.
    resolution : `float` or `None`, optional (default: 1)
        For the default flavor ('vtraag'), you can provide a resolution (higher
        resolution means finding more and smaller clusters), which defaults to
        1.0.
    random_state : `int`, optional (default: 0)
        Change the initialization of the optimization.
    restrict_to : `tuple`, optional (default: None)
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain (obs key, list of categories).
    key_added : `str`, optional (default: 'louvain')
        Key under which to add the cluster labels.
    adjacency : sparse matrix or `None`, optional (default: `None`)
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    flavor : {'vtraag', 'igraph'}
        Choose between two packages for computing the clustering. 'vtraag' is
        much more powerful.
    copy : `bool` (default: `False`)
        Copy adata or modify it inplace.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    louvain : `pd.Series` (``adata.obs``, dtype `category`)
        Array of dim (number of samples) that stores the subgroup id ('0',
        '1', ...) for each cell.
    """
    logg.info('running Louvain clustering', r=True)
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        if not isinstance(restrict_categories[0], str):
            raise ValueError('You need to use strings to label categories, '
                             'e.g. \'1\' instead of 1.')
        for c in restrict_categories:
            if c not in adata.obs[restrict_key].cat.categories:
                raise ValueError(
                    '\'{}\' is not a valid category for \'{}\''.format(
                        c, restrict_key))
        restrict_indices = adata.obs[restrict_key].isin(
            restrict_categories).values
        adjacency = adjacency[restrict_indices, :]
        adjacency = adjacency[:, restrict_indices]
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warn(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed: logg.m('    using the undirected graph', v=4)
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if flavor == 'vtraag':
            import louvain
            if resolution is None: resolution = 1
            try:
                logg.info('    using the "louvain" package of Traag (2017)')
                louvain.set_rng_seed(random_state)
                part = louvain.find_partition(
                    g,
                    louvain.RBConfigurationVertexPartition,
                    resolution_parameter=resolution)
                # adata.uns['louvain_quality'] = part.quality()
            except AttributeError:
                logg.warn('Did not find package louvain>=0.6, '
                          'the clustering result will therefore not '
                          'be 100% reproducible, '
                          'but still meaningful. '
                          'If you want 100% reproducible results, '
                          'update via "pip install louvain --upgrade".')
                part = louvain.find_partition(g,
                                              method='RBConfiguration',
                                              resolution_parameter=resolution)
        elif flavor == 'igraph':
            part = g.community_multilevel()
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')
    unique_groups = np.unique(groups)
    n_clusters = len(unique_groups)
    if restrict_to is None:
        groups = groups.astype('U')
        key_added = 'louvain' if key_added is None else key_added
        adata.obs[key_added] = pd.Categorical(values=groups,
                                              categories=natsorted(
                                                  unique_groups.astype('U')))
    else:
        key_added = restrict_key + '_R' if key_added is None else key_added
        all_groups = adata.obs[restrict_key].astype('U')
        prefix = '-'.join(restrict_categories) + ','
        new_groups = [prefix + g for g in groups.astype('U')]
        all_groups.iloc[restrict_indices] = new_groups
        adata.obs[key_added] = pd.Categorical(values=all_groups,
                                              categories=natsorted(
                                                  all_groups.unique()))
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = {
        'resolution': resolution,
        'random_state': random_state
    }
    logg.info('    finished',
              time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('found {} clusters and added\n'
              '    \'{}\', the cluster labels (adata.obs, categorical)'.format(
                  n_clusters, key_added))
    return adata if copy else None
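# A minimal usage sketch, assuming the scanpy API referenced in the docstring;
# a neighborhood graph must be computed before clustering:
#
#   import scanpy.api as sc
#   sc.pp.neighbors(adata)                 # adata: an AnnData object
#   louvain(adata, resolution=1.0)
#   print(adata.obs['louvain'].value_counts())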
Example #45
0
def add_edges_with_weights(grp, df, lab, all_graphs):
    # definition line was missing from the snippet; the signature is inferred
    # from the call below, and c indexes the current node within grp
    c = 0
    for i in grp:
        for j in grp[c + 1:]:
            weights = df[df['nodes'] == i][j].values[0]
            all_graphs[lab].add_edge(i, j, weight=weights)
        c += 1
    return


# Creates edges between nodes and assignes weights to edges
for i in all_graphs.keys():
    add_edges_with_weights(all_grps[i], all_dfs[i], i, all_graphs)

#%%
# The python-louvain package finds the best partition (community assignment) for the nodes
for i in all_graphs.keys():
    partition = community_louvain.best_partition(all_graphs[i])
    all_dfs[i]['partition'] = all_dfs[i].nodes.apply(
        lambda node: partition[node])


#%%
def get_nodes_by_partition(df):
    """ Pulls indices of nodes within the same community(partition)

        Arguments:
            df: all_dfs['label'] -> 'label' is written as grp0 to grp43

        Returns: 
            Dict of nodes organized by community(partition)
    """
    temp_dict = {}
Example #46
0
File: ml.py Project: MaorRocky/mini
def train(d1, d2):
    file_to_machines_dic = {}
    clean_dict = {}
    unknown_set = set()
    file_sha1_to_size = {}
    fileAndDomain_to_machines_dic = {}
    data_name = 'Obf_oneInTenWeek1_d'
    suffix = '.tsv'
    G = nx.Graph()

    def add_edge(u, v, w):
        if G.has_edge(u, v):
            G[u][v]['weight'] += w
        else:
            G.add_edge(u, v, weight=w)

    for i in range(d1, d2):
        print('Running data number - {}'.format(i))
        data = pd.read_csv(Path().joinpath('data',
                                           data_name + str(i) + suffix),
                           sep='\t',
                           error_bad_lines=False,
                           index_col=False,
                           dtype='unicode')
        data = data.sort_values(by=data.columns[0])

        print('num of rows in data', len(data))

        # instead of using names we will use sha1
        # Number of distinct machines the file was downloaded to from this domain; this will be the weight of an edge
        # name = data.columns[0]
        start = time.time()  # just to know how much time it runs.
        # fileAndDomain_to_machines_dic key:val -> (key) file&domain : (val) num of machines
        sha1 = data.columns[3]
        domain = data.columns[17]
        threat = data.columns[20]
        size = data.columns[24]
        machine = data.columns[13]
        fileAndDomain_to_machines_dic = {}

        for index, row in data.iterrows():
            file_sha1 = row[sha1]
            file_domain = row[domain]
            machine_guid = row[machine]
            fileAndDomain_to_machines_dic[(
                file_sha1, file_domain)] = fileAndDomain_to_machines_dic.get(
                    (file_sha1, file_domain), []) + [machine_guid]

        for index, row in data.iterrows():
            file_sha1 = row[sha1]
            machine_guid = row[machine]
            file_threat = row[threat]
            if isinstance(file_threat, str):
                file_to_machines_dic[file_sha1] = file_to_machines_dic.get(
                    file_sha1, []) + [machine_guid]
            else:
                clean_dict[file_sha1] = clean_dict.get(file_sha1,
                                                       []) + [machine_guid]

        for index, row in data.iterrows():
            file_sha1 = row[sha1]
            file_size = row[size]
            file_sha1_to_size[file_sha1] = file_size

    for key, val in fileAndDomain_to_machines_dic.items():
        fileAndDomain_to_machines_dic[key] = len(list(set(val)))

    fileAndDomain_to_machines_dic = sort_dic(fileAndDomain_to_machines_dic)

    for (file_sha1,
         file_domain), weight in fileAndDomain_to_machines_dic.items():
        add_edge(file_sha1, file_domain, weight)

    for key, val in file_to_machines_dic.items():
        file_to_machines_dic[key] = len(list(set(val)))

    for key, val in clean_dict.items():
        clean_dict[key] = len(list(set(val)))

    malicious_dict = {k: v for k, v in file_to_machines_dic.items() if v > 4}
    clean_dict = {k: v for k, v in clean_dict.items() if v > 8}
    print('number of malicious files:', len(malicious_dict))
    print('number clean files', len(clean_dict))

    counter = 0
    for key, val in malicious_dict.items():
        if key in clean_dict.keys():
            del clean_dict[key]
            counter += 1
    print(counter)
    sha1_set = unknown_set.copy()
    print('unknown_set before cleaning %d' % len(unknown_set))
    for file_sha1 in unknown_set.copy():
        if file_sha1 in clean_dict or file_sha1 in malicious_dict:
            unknown_set.remove(file_sha1)

    print('unknown_set after cleaning %d' % len(unknown_set))

    print("Num of nodes in G {}".format(len(G)))
    print('Number of edges in G %s' % (G.number_of_edges()))
    lst = list(G.degree)
    avg_degree = 0
    max_degree = 0
    for (item, deg) in lst:
        if deg > max_degree:
            max_degree = deg
        avg_degree += deg
    print('avg degree:', round(avg_degree / len(G), 2))
    print('max deg:', max_degree)

    # print('Average degree G %s' %(np.mean(nx.degree_histogram(G))))
    # now we have a graph G which has a edges between files and the domain it was downloaded from, with weight
    # which is the number of unique machines which downloaded the file from this domain.

    # this is just a print out of the weight of each edge.
    edge_to_weights_dic = nx.get_edge_attributes(G, 'weight')
    edge_to_weights_dic = sort_dic(edge_to_weights_dic)
    # for key, value in attr.items():
    #     print(key, ' : ', value)
    weight_array = np.array(
        [edge_to_weights_dic[k] for k in edge_to_weights_dic])
    print('average weight: ', weight_array.mean())
    print('max weight :', np.amax(weight_array))
    # print("len is ", len(attr))

    degree_sequence = sorted([d for n, d in G.degree()],
                             reverse=True)  # degree sequence
    file_sha1_to_degree_dict = {}
    domain_to_degree_dict = {}

    graph(degree_sequence, "Degree Histogram")

    for n, d in G.degree():
        if n in sha1_set:
            file_sha1_to_degree_dict[n] = d
        else:
            domain_to_degree_dict[n] = d
    # file graph for degree

    file_degree_lst = sorted(list(file_sha1_to_degree_dict.values()))
    domain_degree_list = sorted(list(domain_to_degree_dict.values()))

    graph(file_degree_lst, "File Degree Histogram")
    graph(domain_degree_list, "Domain Degree Histogram")

    partition = community.best_partition(G, weight='weight')
    partition = sort_dic(partition)
    domain_per_cluster = {}
    files_per_cluster = {}
    for key, val in partition.items():
        if key in sha1_set:
            files_per_cluster[val] = files_per_cluster.get(val, []) + [key]
        else:
            domain_per_cluster[val] = domain_per_cluster.get(val, []) + [key]
    print('total communities :', list(partition.values())[-1])
    print('average community size:', len(G) / list(partition.values())[-1])
    max_community_size_dict = {}
    for community_index in partition.values():
        max_community_size_dict[community_index] = max_community_size_dict.get(
            community_index, 0) + 1
    max_community_size_dict = sort_dic_rev(max_community_size_dict)
    print('Max community size:', list(max_community_size_dict.values())[0])

    dirty_precent_per_cluster_lst = []
    for file_list in files_per_cluster.values():
        file_list_len = len(file_list)
        counter = 0
        for file in file_list:
            if file in malicious_dict.keys():
                counter += 1
        dirty_precent_per_cluster_lst.append(
            int(round((counter / file_list_len), 2) * 100))
    print(sorted(dirty_precent_per_cluster_lst, reverse=True))

    machines_per_cluster = {}
    file_to_list_of_domains_per_cluster_dic = {}

    for index, (files_list, domains_list) in enumerate(
            zip(files_per_cluster.values(), domain_per_cluster.values())):
        for file in files_list:
            for domain in domains_list:
                if G.has_edge(file, domain):
                    file_to_list_of_domains_per_cluster_dic[
                        file] = file_to_list_of_domains_per_cluster_dic.get(
                            file, []) + [domain]
                    machines_per_cluster[index] = machines_per_cluster.get(
                        index, 0) + G[file][domain]['weight']

    machines_per_cluster = sort_dic(machines_per_cluster)

    # print(*machines_per_cluster.items(), sep='\n')

    domain_to_dirty_precent = {}
    cluster_to_file_precent_in_cluster = {}
    for index, (files_list, domains_list) in enumerate(
            zip(files_per_cluster.values(), domain_per_cluster.values())):
        cluster_to_file_precent_in_cluster[index] = len(files_list) / (
            len(files_list) + len(domains_list))
        for domain in domains_list:
            domain_total_files_counter = 0
            domain_dirty_files_counter = 0
            for file in files_list:
                if G.has_edge(domain, file):
                    domain_total_files_counter += 1
                    if file in malicious_dict.keys():
                        domain_dirty_files_counter += 1
            # print('%s / %s' % (domain_dirty_files_counter, domain_total_files_counter))
            domain_to_dirty_precent[domain] = int(
                round(
                    (domain_dirty_files_counter / domain_total_files_counter),
                    2) * 100)
    domain_to_dirty_precent = sort_dic(domain_to_dirty_precent)
    # print(*domain_to_dirty_precent.items(), sep='\n')

    dirty_precent_domains = {}
    for domain, percent in domain_to_dirty_precent.items():
        dirty_precent_domains[percent] = dirty_precent_domains.get(percent,
                                                                   0) + 1

    percent, counter = zip(
        *dirty_precent_domains.items())  # creating 2 arrays of keys , values
    fig, ax = plt.subplots(figsize=(8, 8))
    plt.bar(percent, counter, color='blue')
    plt.yscale('log')
    plt.title("Amount of domains with the number of dirty files percentage")
    plt.ylabel("Amount of domains")
    plt.xlabel("dirty file percentage")
    # ticks = np.arange(0, 105, 5)
    # ax.set_xticks(ticks)
    # ax.set_xticklabels(ticks)
    plt.savefig('graph_dirty_percent_domains.png')
    plt.show()

    # the amount of clusters with different amount of machines
    values_to_machines = {}
    for key, val in machines_per_cluster.items():
        values_to_machines[val] = values_to_machines.get(val, 0) + 1
    print(*values_to_machines.items(), sep='\n')
    fig, ax = plt.subplots(figsize=(9, 9))
    # values_to_machines = sort_dic(values_to_machines)
    t = np.arange(0., len(values_to_machines), 1)
    y = [val for val in values_to_machines.values()]
    plt.plot(t, y, 'r')
    plt.xlabel('Values')
    plt.yscale('symlog')
    plt.ylabel('Amount of clusters for value X')
    ticks = np.arange(1, 223, 20)
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticks)
    plt.savefig('values_to_machines')
    plt.show()
    dirty_per_percent_dict = {}
    for percent in dirty_precent_per_cluster_lst:
        dirty_per_percent_dict[percent] = dirty_per_percent_dict.get(
            percent, 0) + 1
    dirty_per_percent_dict = {
        k: dirty_per_percent_dict[k]
        for k in sorted(dirty_per_percent_dict)
    }

    percent, cnt = zip(*dirty_per_percent_dict.items())
    fig, ax = plt.subplots(figsize=(8, 8))
    plt.bar(percent, cnt, color='green')
    plt.yscale('symlog')
    plt.title("Clusters dirty percentage Histogram")
    plt.ylabel("Amount of clusters with 'x' dirty files percentage")
    plt.xlabel("percetage")
    plt.savefig('graph_dirty_percent_clusters.png')
    plt.show()

    # feature_ extraction
    cluster_number_to_malicious_percent_dic = {
        index: percent
        for index, percent in enumerate(dirty_precent_per_cluster_lst)
    }
    cluster_per_file = {}
    for cluster_index, files_list in files_per_cluster.items():
        for file_sha1 in files_list:
            cluster_per_file[file_sha1] = cluster_index

    final_dic = {}
    for file_sha1, num_of_guid in clean_dict.items():
        final_dic[file_sha1] = [
            file_sha1,
            int(file_sha1_to_size[file_sha1]), num_of_guid,
            cluster_to_file_precent_in_cluster[cluster_per_file[file_sha1]],
            len(file_to_list_of_domains_per_cluster_dic[file_sha1]),
            cluster_per_file[file_sha1],
            cluster_number_to_malicious_percent_dic[
                cluster_per_file[file_sha1]],
            max_community_size_dict[cluster_per_file[file_sha1]], 0
        ]

    for file_sha1, num_of_guid in malicious_dict.items():
        final_dic[file_sha1] = [
            file_sha1,
            int(file_sha1_to_size[file_sha1]), num_of_guid,
            cluster_to_file_precent_in_cluster[cluster_per_file[file_sha1]],
            len(file_to_list_of_domains_per_cluster_dic[file_sha1]),
            cluster_per_file[file_sha1],
            cluster_number_to_malicious_percent_dic[
                cluster_per_file[file_sha1]],
            max_community_size_dict[cluster_per_file[file_sha1]], 1
        ]
    train_X = []
    train_y = []

    for file_sha1, features in final_dic.items():
        train_X.append(features[:-1])
        train_y.append(final_dic[file_sha1][-1])

    return train_X, train_y, final_dic
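# A minimal usage sketch (the day range below is hypothetical); train() reads
# the files data/Obf_oneInTenWeek1_d<i>.tsv for i in range(d1, d2):
#
#   train_X, train_y, final_dic = train(1, 3)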
Example #47
0

# global variables
inputf = sys.argv[1]
similarity = sys.argv[2]
weighting = sys.argv[3]
tokenization = sys.argv[4]
alpha = float(sys.argv[5])
outputf = sys.argv[6]

# read documents
documents = []
f = open(inputf, "r")
l = f.readline()
while l:
    l = l.rstrip('\r\n')
    documents.append(l)
    l = f.readline()

# generate graph
G = getsimmatrix()

print 'calling community detection algorithm'
import community
community.alpha = alpha
clustering = community.best_partition(G)
f = open(outputf, 'w')
for i in range(len(documents)):
    f.write(str(clustering[i]) + '\n')
f.close()
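# A minimal invocation sketch (script and file names are hypothetical); the six
# positional arguments are: input file, similarity, weighting, tokenization,
# alpha and output file:
#
#   python cluster_documents.py docs.txt cosine tfidf word 0.5 clusters.txt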
Example #48
0
    def generate_community_corpus(self, method=None):

        if 'number_of_communities' not in self.params.keys():
            raise ValueError("the number of topics parameter is missing!")

        self.number_of_communities = self.params['number_of_communities']

        if method == "lda":
            # Run GibbsLDA++
            if not os.path.exists(GIBBSLDA_PATH):
                raise ValueError("Invalid path of GibbsLDA++!")

            temp_lda_folder = os.path.join(self.temp_folder, "lda_temp")
            if not os.path.exists(temp_lda_folder):
                os.makedirs(temp_lda_folder)

            temp_dfile_path = os.path.join(temp_lda_folder, "gibblda_temp.dfile")
            # Save the walks into the dfile
            self.save_corpus(corpus_file=temp_dfile_path, with_title=True, corpus=self.corpus)

            initial_time = time.time()
            cmd = "{} -est ".format(GIBBSLDA_PATH)
            cmd += "-alpha {} ".format(self.params['lda_alpha'])
            cmd += "-beta {} ".format(self.params['lda_beta'])
            cmd += "-ntopics {} ".format(self.params['number_of_communities'])
            cmd += "-niters {} ".format(self.params['lda_number_of_iters'])
            cmd += "-savestep {} ".format(self.params['lda_number_of_iters'] + 1)
            cmd += "-dfile {} ".format(temp_dfile_path)
            os.system(cmd)

            print("-> The LDA algorithm run in {:.2f} secs".format(time.time() - initial_time))

            # Read wordmap file
            id2node = {}
            temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
            with open(temp_wordmap_path, 'r') as f:
                f.readline()  # skip the first line
                for line in f.readlines():
                    tokens = line.strip().split()
                    id2node[int(tokens[1])] = tokens[0]

            # Read phi file
            phi = np.zeros(shape=(self.number_of_communities, self.number_of_nodes), dtype=np.float)
            temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
            with open(temp_phi_path, 'r') as f:
                for comm, line in enumerate(f.readlines()):
                    for id, value in enumerate(line.strip().split()):
                        phi[comm, int(id2node[id])] = value

            # Read the tassign file, generate topic corpus
            temp_tassing_path = os.path.join(temp_lda_folder, "model-final.tassign")
            self.topic_corpus = []
            with smart_open(temp_tassing_path, 'r') as f:
                for line in f:
                    tokens = line.strip().split()
                    self.topic_corpus.append([token.split(':')[1] for token in tokens])

            return phi

        elif method == "hmm":
            y = []
            for walk in self.corpus:
                seq = []

                for w in walk:
                    #seq.append(self.number_of_nodes+np.random.choice(self.number_of_communities))
                    seq.append(int(w))

                y.append(seq)

            E = self.number_of_nodes
            K = self.number_of_communities
            L = self.params['walk_length']
            hmm_number_of_iters = self.params['hmm_number_of_iters']
            hmm_subset_size = self.params['hmm_subset_size']
            N = len(y)

            plates_multiplier = N / hmm_subset_size

            p0 = self.params['hmm_p0']  # a vector of size K
            t0 = self.params['hmm_t0']  # a vector of size K
            e0 = self.params['hmm_e0']

            p_param = p0*np.ones(K, dtype=np.float)
            p = bayes.Dirichlet(p_param, name='p')

            t_param = t0*np.ones(K, dtype=np.float)
            T = bayes.Dirichlet(t_param, plates=(K, ), name='T')

            e_param = e0*np.ones(E, dtype=np.float)
            E = bayes.Dirichlet(e_param, plates=(K, ), name='E')

            z = bayes.CategoricalMarkovChain(p, T, states=L, plates=(hmm_subset_size,),
                                             plates_multiplier=(plates_multiplier,), name='Z')
            x = bayes.Mixture(z, bayes.Categorical, E, name='X')

            p.initialize_from_random()
            T.initialize_from_random()
            E.initialize_from_random()


            Q = VB(x, z, E, T, p)
            """
            x.observe(y)

            Q.update(repeat=1000)
            """
            Q.ignore_bound_checks = True

            delay = 1
            forgetting_rate = 0.5

            for iter in range(hmm_number_of_iters):
                # Observe a random mini-batch
                subset = np.random.choice(a=N, size=hmm_subset_size)

                # print(subsets)
                # print()
                # print(subsets[subset])
                Q['X'].observe([y[inx] for inx in subset])
                # Learn intermediate variables
                Q.update('Z')
                #  Set step length
                step = (iter + delay) ** (-forgetting_rate)
                # Stochastic gradient for the global variables
                Q.gradient_step('p', 'T', 'E', scale=step)

            likelihood = Q['E'].random()

            qp = p.random()
            qT = T.random()
            qE = E.random()

            self.topic_corpus = []

            model = hmm.MultinomialHMM(n_components=self.number_of_communities, tol=0.001, n_iter=5000)
            model.startprob_ = qp
            model.emissionprob_ = qE
            model.transmat_ = qT

            initial_time = time.time()
            seq_for_hmmlearn = np.concatenate([np.asarray(seq).reshape(-1, 1).tolist() for seq in y])
            seq_lens = [self.params['walk_length'] for _ in range(N)]
            comm_conc_seq = model.predict(seq_for_hmmlearn, seq_lens)
            print("The hidden states are predicted in {} secs.".format(time.time() - initial_time))

            self.topic_corpus = []
            for i in range(N):
                self.topic_corpus.append([str(w) for w in comm_conc_seq[i*L:(i+1)*L]])

            return likelihood

        elif method == "bigclam":
            # Run AGM
            if not os.path.exists(BIGCLAM_PATH):
                raise ValueError("Invalid path of BigClam!")

            # If the temp folder for BigClam does not exits
            temp_bigclam_folder = os.path.join(self.temp_folder, "bigclam_temp")
            if not os.path.exists(temp_bigclam_folder):
                os.makedirs(temp_bigclam_folder)

            g = nx.Graph()
            g.add_edge(2, 3)
            print("graph {}".format([g.copy()]))
            # Get all connected components
            cc_list = np.asarray(list(nx.connected_component_subgraphs(self.graph)))
            num_of_cc = cc_list.shape[0]
            print("graph {}".format([self.graph.copy()]))
            if num_of_cc == 1:
                cc_list = [self.graph.copy()]
            print(cc_list)
            print("Number of connected components: {}".format(num_of_cc))
            cc_sizes = [cc.number_of_nodes() for cc in cc_list]
            # Sort the connected components
            cc_sizes_inx = np.argsort(cc_sizes)[::-1]
            cc_sizes = [cc_sizes[inx] for inx in cc_sizes_inx]
            cc_list = [cc_list[inx] for inx in cc_sizes_inx]
            # Find how many communities will be assigned for each connected component
            cum_sum_cc_sizes = np.cumsum(cc_sizes)
            # Find the community assignments of the set of the largest 'cc_inx_limit' connected components
            # in which the ratio of sizes of the smallest connected component and the size of the set is greater than
            # 1.5 times the number of communities which is desired to be assigned
            cc_inx_limit = 0
            for limit in range(num_of_cc):
                if cc_sizes[cc_inx_limit] / float(cum_sum_cc_sizes[cc_inx_limit]) >= (1.5 / self.number_of_communities):
                    cc_inx_limit += 1

            comm2node = []
            temp_bigclam_output = [[] for _ in range(cc_inx_limit)]
            temp_bigclam_edgelist = [[] for _ in range(cc_inx_limit)]
            temp_bigclam_labels = [[] for _ in range(cc_inx_limit)]
            assignment_sizes = np.zeros(shape=cc_inx_limit, dtype=np.int)
            correction_sizes = np.zeros(shape=cc_inx_limit, dtype=np.int)

            for cc_index in range(num_of_cc):
                current_ccg = cc_list[cc_index]

                if cc_index >= cc_inx_limit:
                    comm2node.append([v for v in current_ccg.nodes()])
                else:
                    assignment_sizes[cc_index] = int((float(cc_sizes[cc_index]) / cum_sum_cc_sizes[cc_inx_limit-1]) * self.params['number_of_communities'])
                    temp_bigclam_output[cc_index] = os.path.join(temp_bigclam_folder, "output{}".format(cc_index))
                    temp_bigclam_edgelist[cc_index] = os.path.join(temp_bigclam_folder, "temp{}.edgelist".format(cc_index))
                    temp_bigclam_labels[cc_index] = os.path.join(temp_bigclam_folder, "temp{}.labels".format(cc_index))

                    cc_graph_nodes = sorted([int(node) for node in current_ccg.nodes()])
                    with open(temp_bigclam_edgelist[cc_index], 'w') as f:
                        for node in cc_graph_nodes:
                            for nb in sorted([int(val) for val in nx.neighbors(current_ccg, str(node))]):
                                if int(node) < int(nb):
                                    f.write("{}\t{}\n".format(str(node), str(nb)))

                    with open(temp_bigclam_labels[cc_index], 'w') as f:
                        for node in cc_graph_nodes:
                            f.write("{}\t{}\n".format(str(node), str(node)))

                    cmd = "{} ".format(BIGCLAM_PATH)
                    cmd += "-o:{} ".format(temp_bigclam_output[cc_index])
                    cmd += "-i:{} ".format(temp_bigclam_edgelist[cc_index])
                    cmd += "-l:{} ".format(temp_bigclam_labels[cc_index])
                    cmd += "-nt:{} ".format(8)
                    cmd += "-c:{} ".format(assignment_sizes[cc_index])
                    os.system(cmd)

                    # Read the output file
                    with open(temp_bigclam_output[cc_index], 'r') as f:
                        for line in f.readlines():
                                comm2node.append(line.strip().split())
                                correction_sizes[cc_index] += 1

            total_num_of_assigned_communities = len(comm2node)
            phi = np.zeros(shape=(total_num_of_assigned_communities, self.number_of_nodes), dtype=np.float)

            self.number_of_communities = total_num_of_assigned_communities

            # Generate the phi matrix
            for k in range(total_num_of_assigned_communities):
                for node in comm2node[k]:
                    phi[k, int(node)] = 1.0

            # Be sure that every node is assigned to at least one community
            for node in range(self.number_of_nodes):
                # if a node is not assigned to any community
                if np.sum(phi[:, node]) == 0.0:
                    # Check the assignments of neighbors of the node
                    nb_comm_assign_counts = np.zeros(total_num_of_assigned_communities, dtype=np.float)
                    for nb in nx.neighbors(self.graph, str(node)):
                        nb_comm_assign_counts += phi[:, int(nb)]
                    # If the neighbors of the node are assigned to a community, assign it to the most frequent community
                    if nb_comm_assign_counts.sum() != 0.0:
                        assigned_comm_id = nb_comm_assign_counts.argmax()
                    # Otherwise assign it to a random community
                    else:
                        assigned_comm_id = np.random.choice(a=total_num_of_assigned_communities)
                    phi[assigned_comm_id, node] = 1.0

            # Normalize
            phi = np.divide(phi.T, np.sum(phi, 1)).T

            # Generate the topic corpus
            self.topic_corpus = []
            for walk in self.corpus:
                community_walk = []
                for w in walk:
                    # If the vertex has only one community assignment
                    if np.sum(phi[:, int(w)] > 0.0) == 1:
                        community_walk.append(str(np.where(phi[:, int(w)] > 0)[0][0]))
                    # otherwise, ...
                    else:
                        # if it is possible, assign it to the community which the previous node is assigned to
                        if len(community_walk) > 0 and phi[int(community_walk[-1]), int(w)] > 0.0:
                            community_walk.append(str(community_walk[-1]))
                        # if not, randomly choose a node
                        else:
                            chosen_comm = np.random.choice(a=phi.shape[0], p=phi[:, int(w)]/np.sum(phi[:, int(w)]))
                            community_walk.append(str(chosen_comm))

                self.topic_corpus.append(community_walk)

            print("---< Summary >---")
            print("+ The graph consists of {} connected component(s)".format(num_of_cc))
            for i in range(cc_inx_limit):
                print("+ The component of size {} is assigned to {}/{} communities".format(cc_sizes[i], correction_sizes[i], assignment_sizes[i]))
            print("+ Each of the remaining {} components is assigned to a unique label".format(num_of_cc-cc_inx_limit))
            print("+ The 'phi' matrix contains {} communities".format(phi.shape[0]))
            print("----o----")

            return phi

        elif method == "louvain":

            c = louvain.best_partition(self.graph)

            self.number_of_communities = len(set(c.values()))

            print("The number of detected communities: {}".format(self.number_of_communities))
            phi = np.zeros(shape=(self.number_of_communities, self.number_of_nodes), dtype=np.float)

            for node in self.graph.nodes():
                phi[int(c[node]), int(node)] = 1.0

            self.topic_corpus = []
            for walk in self.corpus:
                seq = [str(c[str(w)]) for w in walk]
                self.topic_corpus.append(seq)

            # Normalize
            phi = (phi.T / np.sum(phi, 1)).T

            return phi

        else:
            raise ValueError("Invalid community/topic detection method")
Example #49
0
def create_communities(GRAPHS, num_graphs, graph_header, input_mode, output_filename_prefix):
    # open community name proposal file
    proposal_file = open(proposal_filename, encoding='utf-8')
    proposal_reader = csv.reader(proposal_file, delimiter=',')
    next(proposal_reader)
    proposal_list = list(proposal_reader)
    
    # create community map output file
    map_filename = "./output/" + output_filename_prefix + "_community_map.csv"
    map_file = open(map_filename, mode='w', newline='\n', encoding='utf-8')
    map_writer = csv.writer(map_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    map_writer.writerow(["Modularity Class", graph_header, "Color Name", "Color Value", "Community Name", "Possible Community Names", "Distinct Words"])
    
    if input_mode == "twitter":
        import tweet_input as proj_input
    elif input_mode == "docu":
        import docu_input as proj_input
    else:
        print("Invalid input mode. Exiting...")
        exit()
        
    color_list = proj_input.load_color_list()
    
    cur_highest = -1 # first offset should be zero
    # for each graph
    for i in range(num_graphs):
        G = GRAPHS[i]
        community_info = []
        
        
        # compute for the communities
        community_dictionary = community.best_partition(G, partition=None, weight='weight', resolution=1.0, randomize=True, random_state=None)
        print("Done with best partition algorithm in graph \"" + G.graph['name'] + "\".")
        
        
        # get community information from the dictionary returned by community.best_partition()
        num_communities = 0
        # get the community number and degree for each node
        for node in G.nodes():
            # community number, node label, and degree (for top 5)
            community_info.append([community_dictionary[node], G.node[node]['label'], G.degree(node)])
            
            # take note of the last community number
            if community_dictionary[node] > num_communities:
                num_communities = community_dictionary[node]
                
        num_communities += 1 # total number of communities
        print("There are " + str(num_communities) + " communities in graph \"" + G.graph['name'] + "\".")
        
        
        # determine the community name (either by proposal or top 5)
        # community_map should contain [0]community name, [1]possible community names (top 5), [2]number of distinct words
        community_map = name_communities(community_info, proposal_list)
        community_map = proj_input.color_map(G.graph['name'], community_map, color_list)
        
        
        # store community number, community name, and color to the graph
        # adjust community number based on previous graph
        class_offset = cur_highest + 1
        
        # for each node in the graph
        for node in G.nodes():
            # adjust and add community number to the node as an attribute
            temp = community_dictionary[node] + class_offset
            G.node[node]['modularity_class'] = temp
            
            # add community name
            G.node[node]['community_name'] = community_map[community_dictionary[node]][0]
            
            # color the node
            G.node[node]['color_name'] = community_map[community_dictionary[node]][3]
            G.node[node]['color_value'] = community_map[community_dictionary[node]][4]
            
            # if it is the highest community number, take note (to be used on the next graph)
            if temp > cur_highest:
                cur_highest = temp
                
        # save community map
        map_size = len(community_map)
        for j in range(map_size):
            # Modularity Class, Graph Name, Color Name, Color Value, Community Name, Possible Community Names, Distinct Words
            map_writer.writerow([class_offset + j, G.graph['name'], community_map[j][3], community_map[j][4], community_map[j][0], community_map[j][1], community_map[j][2]])
            
        # store the graph back to the list
        GRAPHS[i] = G
        
    return GRAPHS
    
Example #50
0
    def modularity_maximization(self, partition=None):
        """Perform louvain's method for modularity maximization contained in 'community' library
        
        Parameters
        ----------
        partition: dict, optional
            the algorithm will start using this partition of the nodes. It's a dictionary where keys are their nodes and values the communities
            (doc taken from 'community' documentation http://perso.crans.org/aynaud/communities/api.html#community.best_partition)
            
        Returns
        -------
        comm_list: list
            community list where indexes correspond to 
            0:users_num , in sorted order by user id
            users_num+1: tags_num, in sorted order by tag id
            tags_num+1: links_num, in sorted order by link id
        matching: list of dictionaries
            *_id: community label dict for each type, Users, Tags, Links
        node_tags: list of str
            order of partites, as found in hyperedges
        q: float
            modularity
        Raises
        ------
        NetworkXError: If the graph is not Eulerian.
            (doc taken from 'community' documentation http://perso.crans.org/aynaud/communities/api.html#community.best_partition)
        """
        #temporary class change for function to work
        self.__class__ = nx.Graph
        d = community.best_partition(self, partition=partition)
        q = community.modularity(d, self)
        self.__class__ = Tripal_nx

        matching = {'U': {}, 'T': {}, 'L': {}}

        for h_e in d.keys():
            u, t, l = map(int, re.split('U|T|L', h_e)[0:3])

            if u in matching['U'].keys():
                matching['U'][u].add(d[h_e])
            else:
                matching['U'][u] = set([d[h_e]])

            if t in matching['T'].keys():
                matching['T'][t].add(d[h_e])
            else:
                matching['T'][t] = set([d[h_e]])

            if l in matching['L'].keys():
                matching['L'][l].add(d[h_e])
            else:
                matching['L'][l] = set([d[h_e]])

        for t in matching:
            for k in matching[t]:
                matching[t][k] = list(matching[t][k])

        comm_list = []
        for t in matching:
            for k in matching[t]:
                matching[t][k] = list(matching[t][k])
            comm_list.extend(
                [matching[t][n] for n in sorted(matching[t].keys())])

        return comm_list, matching, self.node_tags, q
Example #51
0
        if edgeIn[0] == '*Vertices' or edgeIn[0] == '*Edges':
            continue
        else:
            edge = (int(edgeIn[0]), int(edgeIn[1]))
            edges.append(edge)

graph = nx.Graph()
graph.add_edges_from(edges)
print(f'INFO: Completed in {time.perf_counter() - tStart:.4f} secs')
print('INFO: Total edges', len(edges))

partition = {}
if computePartition:
    print('INFO: Computing best community partition...', partitionFile)
    tStart = time.perf_counter()
    partition = community_louvain.best_partition(graph)
    print(f'INFO: Completed in {time.perf_counter() - tStart:.4f} secs')

    if outClusterInfoFile is not None:
        with open(outClusterInfoFile, 'w') as fp:
            for k in partition.keys():
                print(k - vertexIdOffset, partition[k], file=fp)
else:
    print('INFO: Reading cluster partition from', mtxFile)

    # read partition files
    for i in range(1, nodes + 1):
        # start from index 1
        if nodes == 1:
            parFile = partitionFile
        else:
Example #52
0
def get_node_list(corrcoef, threshold=4):
    r"""Calculate community structures from interaction network.

    The interaction network is built using the correlation coefficient matrix, in which the edges are the Pearson correlations
    of the two connecting nodes. The community structures of this network are calculated using the Louvain algorithm [1]_,
    which finds high-modularity network partitions. The modularity is defined as [2]_:

    .. math::
            Q=\frac{1}{2 m} \sum_{i, j}\left[A_{i j}-\frac{k_{i} k_{j}}{2 m}\right] \delta\left(c_{i}, c_{j}\right)

    where :math:`A_{i j}` is the weight of the edge between node i and node j; :math:`k_{i}` is the sum of weights
    of the nodes attached to the node i, i.e. the degree of the node; :math:`c_{i}` is the community to which node i
    assigned; :math:`\delta\left(c_{i}, c_{j}\right)` is 1 if i=j and 0 otherwise; and
    :math:`m=\frac{1}{2} \sum_{i j} A_{i j}` is the number of edges. In the modularity optimization, the Louvain
    algorithm orders the nodes in the network, and then, one by one, removes and inserts each node in a different
    community :math:`c_{i}` until no significant increase in modularity is obtained. After modularity optimization, all the
    nodes that belong to the same community are merged into a single node, whose edge weights are the sums of the weights
    of the comprising nodes. This optimization-aggregation loop is iterated until all nodes are collapsed into one.

    By default, this method returns communities containing at least 4 nodes. This setting can be changed by using
    the parameter ``threshold``.

    Parameters
    -----------
    corrcoef : ndarray(n, n)
        The Pearson correlation matrix.

    threshold : int, default=4
        Minimum community size. Only communities with at least this many nodes will be returned.

    Returns
    --------
    node_list : list of lists
        A list of community nodes.

    modularity : float or None
        The modularity of the network partition. It measures the quality of the network partition. The value is between
        -1 and 1. The larger the modularity, the better the partition.

    References
    ----------
    .. [1] Blondel, V. D.; Guillaume, J.-L.; Lambiotte, R.; Lefebvre, E., Fast unfolding of communities in large
           networks. Journal of Statistical Mechanics: Theory and Experiment 2008, 2008 (10), P10008

    .. [2] Newman, M. E. J., Analysis of weighted networks. Physical Review E 2004, 70 (5), 056131.

    """
    # TODO: check negative values in corrcoef_matrix. Come up with better solutions.
    # Network edges can't take negative values; residues with negative
    # correlations are forced to separate into different binding sites.
    corrcoef[corrcoef < 0.0] = 0.0
    graph = nx.Graph(corrcoef)
    partition = community.best_partition(graph, weight='weight')
    values = [partition.get(node) for node in graph.nodes()]
    node_list = []
    for value in range(max(values) + 1):  # include the last community label
        nodes = [k for k, v in partition.items() if v == value]
        if len(nodes) >= threshold:
            node_list.append(nodes)
    if len(node_list) > 0:
        modularity = community.modularity(partition, graph)
    else:
        modularity = None
    return node_list, modularity
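# A minimal usage sketch on synthetic data, assuming numpy is imported as np
# alongside the networkx/community imports used by the function above:
import numpy as np
np.random.seed(0)
corrcoef_demo = np.corrcoef(np.random.randn(20, 100))  # 20 residues, 100 frames
node_list_demo, modularity_demo = get_node_list(corrcoef_demo, threshold=4)
print(len(node_list_demo), 'communities, modularity =', modularity_demo)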
Example #53
0
G_task_noreg = nx.read_adjlist(fNet_task_noreg, nodetype=int)
# rest (absence of task)
fNet_rest = 'DataTaskNetwork/fMRI_covertverb_r_bp_reg_Rt2_K200_deg20_rest.adjlist'
G_rest = nx.read_adjlist(fNet_rest, nodetype=int)
# consolidating all into a list
G_list = [G_task_reg, G_task_noreg, G_rest]
listLabel = [
    'During task\n(task regressed out)',
    'During task\n(task NOT regressed out)', 'Rest\n(absence of task)'
]

####### Community detection
# Community detection with the Louvain method
partition_list = []
for iG in G_list:
    partition = community.best_partition(iG)
    partition_list.append(partition)
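# A minimal sketch: the quality of each partition can also be quantified with
# the modularity score from the same 'community' package.
for iG, iPartition in zip(G_list, partition_list):
    print('Modularity: %.3f' % community.modularity(iPartition, iG))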

###### visualizing the modular organization
# dictionary of xy-coordinates
pos = {}
for iROI in range(len(nodes)):
    pos[nodes[iROI]] = xyz[iROI, :2]

# loop over networks for visualization
plt.figure(figsize=[10, 4])
for i, iG in enumerate(G_list):

    plt.subplot(1, 3, i + 1)
    nComm = max([comm for comm in partition_list[i].values()]) + 1
    node_color_list = get_cmap(nComm + 1, 'rainbow')
Example #54
0
def clusterGraphData(G):
    labels = community_louvain.best_partition(G, resolution=2.0, random_state=8)
    clusterLabels = np.array(list(labels.items()))[0:60000, 1]
    return clusterLabels
Example #55
0
def python_louvain(df, resolution, randomize=None, random_state=None):
    G = nx.from_pandas_edgelist(df=df, source='from', target='to', edge_attr='weight')
    partition = community.best_partition(graph=G, resolution=resolution, weight='weight',
                                         randomize=randomize, random_state=random_state)
    louvain_dfr = pd.DataFrame.from_dict(data=partition, orient='index')
    return louvain_dfr
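# A minimal usage sketch on hypothetical data, assuming pandas, networkx and
# community are imported in this module; the edge list needs 'from', 'to' and
# 'weight' columns, matching the names used above:
import pandas as pd
edges_demo = pd.DataFrame({'from': ['a', 'a', 'b'],
                           'to': ['b', 'c', 'c'],
                           'weight': [1.0, 2.0, 1.0]})
print(python_louvain(edges_demo, resolution=1.0))  # one row per node, community id in column 0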
Example #56
0
def louvain_algorithm(sub_graph):
    partition = community.best_partition(sub_graph)
    c_to_node = defaultdict(set)
    for k, v in partition.items():
        c_to_node[v].add(k)
    return list(c_to_node.values())
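# A minimal usage sketch on a small built-in NetworkX graph, assuming the
# community and collections.defaultdict imports used by the function above:
import networkx as nx
communities_demo = louvain_algorithm(nx.karate_club_graph())
print('%d communities found' % len(communities_demo))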
Example #57
0
    def ChangeCommunityColorAndInstantiateHierarchy(self, level=-1):
        self.g = self.Graphwidget.Graph_data().DrawHighlightedGraph(
            self.Graphwidget.EdgeSliderValue)
        self.ColorNodesBasedOnCorrelation = False
        self.partition = cm.best_partition(self.g)
        self.induced_graph = cm.induced_graph(self.partition, self.g)

        if not (level == -1):
            dendo = cm.generate_dendrogram(self.g)
            g = cm.partition_at_level(dendo, level)
            self.induced_graph1 = cm.induced_graph(g, self.g)
            self.partition = g
            self.induced_graph = self.induced_graph1

        # The induced graph is the data structure holding the adjacency matrix of the
        # communities, built before calculating the correlation strength.
        # Only the lower half of the matrix is needed; the other values can be
        # discarded as they are computationally intensive to handle.

        # self.Find_InterModular_Edge_correlativity()
        self.Matrix = nx.to_numpy_matrix(self.induced_graph)

        # Triggering a new window with the same color
        # If the Gray out option is clicked then gray out the nodes without the colors
        self.ColorForCommunities(len(set(self.partition.values())))
        self.ColorForVisit(self.partition)

        nodes1 = [
            item for item in self.Graphwidget.scene().items()
            if isinstance(item, Node)
        ]
        count = 0
        for community in set(self.partition.values()):
            # Ensure that the right color is assigned to the right community
            list_nodes = [
                nodes for nodes in self.partition.keys()
                if self.partition[nodes] == community
            ]

            for node in nodes1:
                if node.counter - 1 in list_nodes:
                    node.PutColor(self.clut[count])
            count = count + 1

        for node in nodes1:
            node.allnodesupdate()
            break

        clut = self.clut
        Max = self.Graphwidget.Max
        Graph = self.Graphwidget
        Matrix = self.Matrix
        ma = np.ma.masked_equal(Matrix, 0.0)
        Min1 = ma.min()
        Max1 = Matrix.max()
        Pos = self.Find_Initial_Positions()
        """
        Generates a new window so that you can access the views related to community 
        analysis
        """
        def newwindow():
            for i in reversed(range(self.Graphwidget.hbox.count())):
                self.Graphwidget.hbox.itemAt(i).widget().close()

            community = CommunityWidget(
                self.Graphwidget, self.induced_graph,
                self.Graphwidget.correlationTableObject, clut, Max, Matrix, ma,
                Min1, Max1, Pos)
            Dendogram = dendogram(self.Graphwidget, self.g, clut)

            self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)

            self.Graphwidget.hbox.addWidget(community)
            self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)

            self.Graphwidget.hbox.addWidget(Dendogram)
            self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)

            self.communityObject = community
            self.dendogramObject = Dendogram

            self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)
            self.Graphwidget.wid.setContentsMargins(0, 0, 0, 0)

            self.Graphwidget.wid.setLayout(self.Graphwidget.hbox)

        newwindow()
        self.Graphwidget.CommunityColorAndDict.emit(self.ColorToBeSentToVisit,
                                                    self.partition)
Example #58
0
    def fit(self, ds: loompy.LoomConnection) -> None:
        logging.info(f"Running cytograph on {ds.shape[1]} cells")
        if self.config.params.factorization not in ["PCA", "HPF", "both"]:
            raise ValueError(
                "params.factorization must be either 'PCA' or 'HPF' or 'both'")
        if self.config.params.features not in ["enrichment", "variance"]:
            raise ValueError(
                "params.features must be either 'enrichment' or 'variance'")
        if self.config.params.nn_space not in ["PCA", "HPF", "auto"]:
            raise ValueError(
                "params.nn_space must be either 'PCA' or 'HPF' or 'auto'")
        if not ((self.config.params.nn_space in ["PCA", "auto"]
                 and self.config.params.factorization in ["PCA", "both"]) or
                (self.config.params.nn_space in ["HPF", "auto"]
                 and self.config.params.factorization in ["HPF", "both"])):
            raise ValueError(
                f"config.params.nn_space = '{self.config.params.nn_space}' is incompatible with config.params.factorization = '{self.config.params.factorization}'"
            )

        species = Species.detect(ds)
        logging.info(f"Species is '{species.name}'")

        logging.info("Recomputing the list of valid genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = (nnz > 10) & (nnz < ds.shape[1] * 0.6)
        ds.ra.Valid = valid_genes.astype('int')

        # Perform Poisson pooling if requested
        main_layer = ""
        if "poisson_pooling" in self.config.steps:
            logging.info(
                f"Poisson pooling with k_pooling == {self.config.params.k_pooling}"
            )
            main_layer = "pooled"  # if not in config.steps, use the main layer
            pp = PoissonPooling(self.config.params.k_pooling,
                                self.config.params.n_genes,
                                compute_velocity=False,
                                n_threads=self.config.execution.n_cpus,
                                factorization=self.config.params.factorization,
                                batch_keys=self.config.params.batch_keys)
            pp.fit_transform(ds)

        # Select features
        if self.config.params.features == "enrichment":
            logging.info(
                f"Feature selection by enrichment on preliminary clusters")
            with warnings.catch_warnings():
                warnings.simplefilter(
                    "ignore", category=NumbaPerformanceWarning
                )  # Suppress warnings about numba not being able to parallelize code
                warnings.simplefilter(
                    "ignore", category=NumbaPendingDeprecationWarning
                )  # Suppress warnings about future deprecations
                warnings.simplefilter(
                    "ignore", category=SparseEfficiencyWarning
                )  # Suppress warnings about setting the diagonal to 1
                logging.info(f"  Gene selection for PCA")
                genes = FeatureSelectionByVariance(
                    self.config.params.n_genes,
                    mask=Species.mask(ds, self.config.params.mask)).fit(ds)
                logging.info(f"  Factorization by PCA")
                normalizer = Normalizer(False)
                normalizer.fit(ds)
                logging.info("  PCA projection to %d components",
                             self.config.params.n_factors)
                pca = PCA(genes,
                          max_n_components=self.config.params.n_factors,
                          layer=main_layer,
                          test_significance=False,
                          batch_keys=self.config.params.batch_keys)
                transformed = pca.fit_transform(ds, normalizer)
                logging.info(
                    f"  Computing KNN (k={self.config.params.k}) in PCA space")
                nn = NNDescent(data=transformed, metric="euclidean")
                indices, distances = nn.query(transformed,
                                              k=self.config.params.k)
                indices = indices[:, 1:]
                distances = distances[:, 1:]
                knn = sparse.csr_matrix(
                    (np.ravel(distances), np.ravel(indices),
                     np.arange(0, distances.shape[0] * distances.shape[1] + 1,
                               distances.shape[1])),
                    (transformed.shape[0], transformed.shape[0]))

            g = nx.from_scipy_sparse_matrix(knn)
            partitions = community.best_partition(g,
                                                  resolution=1,
                                                  randomize=False)
            ds.ca.Clusters = np.array(
                [partitions[key] for key in range(knn.shape[0])])
            n_labels = ds.ca.Clusters.max() + 1
            genes = FeatureSelectionByEnrichment(
                int(self.config.params.n_genes // n_labels),
                Species.mask(ds, self.config.params.mask),
                findq=False).select(ds)
        elif self.config.params.features == "variance":
            logging.info(f"Feature selection by variance")
            genes = FeatureSelectionByVariance(
                self.config.params.n_genes, main_layer,
                Species.mask(ds, self.config.params.mask)).select(ds)
        logging.info(f"Selected {genes.sum()} genes")

        if self.config.params.factorization in ['PCA', 'both']:
            logging.info(f"Factorization by PCA")
            normalizer = Normalizer(False)
            normalizer.fit(ds)
            n_components = min(self.config.params.n_factors, ds.shape[1])
            logging.info("  PCA projection to %d components", n_components)
            pca = PCA(genes,
                      max_n_components=n_components,
                      layer=main_layer,
                      test_significance=False,
                      batch_keys=self.config.params.batch_keys)
            ds.ca.PCA = pca.fit_transform(ds, normalizer)

        if self.config.params.factorization in ['HPF', 'both']:
            logging.info(f"Factorization by HPF")
            # Load the data for the selected genes
            data = ds[main_layer].sparse(rows=genes).T
            logging.debug(f"  Data shape is {data.shape}")

            # HPF factorization
            hpf = HPF(k=self.config.params.n_factors,
                      validation_fraction=0.05,
                      min_iter=10,
                      max_iter=200,
                      compute_X_ppv=False,
                      n_threads=self.config.execution.n_cpus)
            hpf.fit(data)
            beta_all = np.zeros((ds.shape[0], hpf.beta.shape[1]))
            beta_all[genes] = hpf.beta
            # Save the unnormalized factors
            ds.ra.HPF_beta = beta_all
            ds.ca.HPF_theta = hpf.theta
            # Here we normalize so the sums over components are one, because JSD requires it
            # and because otherwise the components will be exactly proportional to cell size
            theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T
            beta = (hpf.beta.T / hpf.beta.sum(axis=1)).T
            beta_all[genes] = beta
            # Save the normalized factors
            ds.ra.HPF = beta_all
            ds.ca.HPF = theta

        if "nn" in self.config.steps or "clustering" in self.config.steps:
            if self.config.params.nn_space in ["PCA", "auto"] and "PCA" in ds.ca:
                transformed = ds.ca.PCA
                metric = "euclidean"
            elif self.config.params.nn_space in ["HPF", "auto"] and "HPF" in ds.ca:
                transformed = ds.ca.HPF
                metric = "js"
            else:
                raise ValueError(f"nn_space is '{self.config.params.nn_space}' but no matching factorization (PCA or HPF) was found in the file")
            logging.info(
                f"Computing balanced KNN (k = {self.config.params.k}) in {self.config.params.nn_space} space using the '{metric}' metric"
            )
            bnn = BalancedKNN(k=self.config.params.k,
                              metric=metric,
                              maxl=2 * self.config.params.k,
                              sight_k=2 * self.config.params.k,
                              n_jobs=-1)
            bnn.fit(transformed)
            knn = bnn.kneighbors_graph(mode='distance')
            knn.eliminate_zeros()
            mknn = knn.minimum(knn.transpose())
            # Convert distances to similarities
            max_d = knn.data.max()
            knn.data = (max_d - knn.data) / max_d
            mknn.data = (max_d - mknn.data) / max_d
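            # (similarity = (max_d - d) / max_d: a distance of 0 maps to 1 and the largest
            #  observed distance maps to 0, so KNN and MKNN now hold similarities, not distances)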
            ds.col_graphs.KNN = knn
            ds.col_graphs.MKNN = mknn
            mknn = mknn.tocoo()
            mknn.setdiag(0)
            # Compute the effective resolution
            d = 1 - knn.data
            radius = np.percentile(d, 90)
            logging.info(f"  90th percentile radius: {radius:.02}")
            ds.attrs.radius = radius
            inside = mknn.data > 1 - radius
            rnn = sparse.coo_matrix(
                (mknn.data[inside], (mknn.row[inside], mknn.col[inside])),
                shape=mknn.shape)
            ds.col_graphs.RNN = rnn

        if "embeddings" in self.config.steps or "clustering" in self.config.steps:
            logging.info(f"Computing 2D and 3D embeddings from latent space")
            metric_f = (
                jensen_shannon_distance if metric == "js" else metric
            )  # Replace js with the actual function, since OpenTSNE doesn't understand js
            logging.info(f"  Art of tSNE with {metric} distance metric")
            ds.ca.TSNE = np.array(
                art_of_tsne(transformed, metric=metric_f)
            )  # art_of_tsne returns a TSNEEmbedding, which can be cast to an ndarray (it's actually just a subclass)
            logging.info(f"  UMAP with {metric} distance metric")
            ds.ca.UMAP = UMAP(n_components=2,
                              metric=metric_f,
                              n_neighbors=self.config.params.k // 2,
                              learning_rate=0.3,
                              min_dist=0.25).fit_transform(transformed)
            ds.ca.UMAP3D = UMAP(n_components=3,
                                metric=metric_f,
                                n_neighbors=self.config.params.k // 2,
                                learning_rate=0.3,
                                min_dist=0.25).fit_transform(transformed)

        if "clustering" in self.config.steps:
            logging.info("Clustering by polished Louvain")
            pl = PolishedLouvain(outliers=False, graph="RNN", embedding="TSNE")
            labels = pl.fit_predict(ds)
            ds.ca.ClustersModularity = labels - min(labels)  # shift so the smallest label (outliers are -1) becomes 0
            ds.ca.OutliersModularity = (labels == -1).astype('int')

            logging.info("Clustering by polished Surprise")
            ps = PolishedSurprise(graph="RNN", embedding="TSNE")
            labels = ps.fit_predict(ds)
            ds.ca.ClustersSurprise = labels - min(labels)  # shift so the smallest label becomes 0
            ds.ca.OutliersSurprise = (labels == -1).astype('int')

            if self.config.params.clusterer == "louvain":
                ds.ca.Clusters = ds.ca.ClustersModularity
                ds.ca.Outliers = ds.ca.OutliersModularity
            else:
                ds.ca.Clusters = ds.ca.ClustersSurprise
                ds.ca.Outliers = ds.ca.OutliersSurprise

            logging.info(f"Found {ds.ca.Clusters.max() + 1} clusters")

        if species.name in ["H**o sapiens", "Mus musculus"]:
            logging.info(f"Inferring cell cycle")
            CellCycleAnnotator(species).annotate(ds)
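The enrichment branch above follows a common pattern for graph-based clustering: reduce the data, build a KNN graph as a scipy sparse matrix, convert it to a NetworkX graph, and run Louvain to obtain preliminary clusters. A minimal, self-contained sketch of that pattern follows; the random data, the value of k, and the use of sklearn's kneighbors_graph are illustrative assumptions, not part of the pipeline above.

import numpy as np
import networkx as nx
import community  # python-louvain
from sklearn.neighbors import kneighbors_graph

X = np.random.rand(500, 20)                                  # stand-in for the PCA-transformed cells
knn = kneighbors_graph(X, n_neighbors=15, mode="distance")   # CSR matrix of neighbor distances
g = nx.from_scipy_sparse_matrix(knn)                         # edge weights are the stored distances
partitions = community.best_partition(g, resolution=1, randomize=False)
clusters = np.array([partitions[i] for i in range(knn.shape[0])])
print("preliminary clusters:", clusters.max() + 1)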
Example #59
0
    def communityLayoutCalculation(self, Layout, g):
        self.g = g

        if not self.Graphwidget.ColorNodesBasedOnCorrelation:
            # Louvain community detection on the current graph
            partition = cm.best_partition(self.g)
            size = float(len(set(partition.values())))
            induced_graph = cm.induced_graph(partition, self.g)
            # If a specific hierarchy level is requested, recolor using the
            # partition at that level of the Louvain dendrogram instead
            if self.Graphwidget.level != -1:
                dendo = cm.generate_dendrogram(self.g)
                partition = cm.partition_at_level(dendo, self.Graphwidget.level)
            self.ColorForCommunities(len(set(partition.values())))
        if (Layout == "circular") or (Layout == "shell") or (Layout == "random") \
        or (Layout == "fruchterman_reingold_layout") or (Layout == "spring") or (Layout == "spectral"):
            if (Layout == "spring"):
                if self.Graphwidget.First:
                    self.Graphwidget.First = False
                    neewPos = nx.spring_layout(self.g,
                                               weight='weight',
                                               k=0.55,
                                               iterations=20,
                                               scale=500)
                    pos = neewPos
                else:
                    neewPos = nx.spring_layout(self.g,
                                               pos=self.pos,
                                               weight='weight',
                                               scale=500)
                    pos = neewPos
                count = 0
                Factor = 1
            elif (Layout == "random") or (Layout == "shell") or (Layout
                                                                 == "neato"):
                neewPos = eval('nx.' + Layout + '_layout(self.g)')
                pos = neewPos
                Factor = 2000
            else:
                neewPos = eval('nx.' + Layout + '_layout(self.g,scale=1000)')
                pos = neewPos
                Factor = 1
            if not (self.Graphwidget.ColorNodesBasedOnCorrelation):
                self.Graphwidget.ColorNodesBasedOnCorrelation = False
                if not (self.Graphwidget.level == -1):
                    self.ChangeCommunityColorAndInstantiateHierarchy(
                        self.Graphwidget.level - 1)
                else:
                    self.ChangeCommunityColorAndInstantiateHierarchy()
        else:
            if Layout != "circo":
                pos = nx.nx_pydot.graphviz_layout(
                    self.g, prog=Layout, args='-Gsep=.25,-GK=20-Eweight=2')
                Factor = 0.7 + self.counter / 100
                if Layout == 'sfdp':
                    Factor = 1
            else:
                pos = nx.nx_pydot.graphviz_layout(self.g, prog=Layout)
                Factor = 0.7

            if not (self.Graphwidget.ColorNodesBasedOnCorrelation):
                self.Graphwidget.ColorNodesBasedOnCorrelation = False
                if not (self.Graphwidget.level == -1):
                    self.ChangeCommunityColorAndInstantiateHierarchy(
                        self.Graphwidget.level - 1)
                else:
                    self.ChangeCommunityColorAndInstantiateHierarchy()
        self.pos = pos
        return pos, Factor
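The method above uses python-louvain's dendrogram API (generate_dendrogram / partition_at_level) to recolor communities at a chosen hierarchy level. A small standalone sketch of that API, using the karate-club graph purely as an illustrative stand-in for self.g:

import networkx as nx
import community as cm  # python-louvain

g = nx.karate_club_graph()
dendo = cm.generate_dendrogram(g)          # list of partitions, finest level first
for level in range(len(dendo)):
    partition = cm.partition_at_level(dendo, level)
    print("level", level, "->", len(set(partition.values())), "communities")

# The coarsest level corresponds to what best_partition() would return for this run
top = cm.partition_at_level(dendo, len(dendo) - 1)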
nx.draw_networkx_nodes(
    g,
    pos,
    nodelist=[node for node in g.nodes() if found[int(node)] == 1],
    node_color='r',
    node_size=100,
    alpha=0.8)

nx.draw_networkx_nodes(
    g,
    pos,
    nodelist=[node for node in g.nodes() if found[int(node)] == 0],
    node_color='b',
    node_size=100,
    alpha=0.8)

nx.draw_networkx_edges(g, pos, width=1.0, alpha=0.5)
plt.show()


gt_node2comm = nx.get_node_attributes(g, 'community')
correct_labels = [gt_node2comm[str(node)] for node in range(N)]
pred_labels = [found[node] for node in range(N)]

nmi = normalized_mutual_info_score(correct_labels, pred_labels)

print("NMI: {}".format(nmi))

found2 = community.best_partition(graph=g)
pred_labels = [found2[str(node)] for node in range(N)]
nmi = normalized_mutual_info_score(correct_labels, pred_labels)

print("Louvain NMI: {}".format(nmi))