def markov_clustering(distance_mat, inflation):
    """
    Run the Markov Clustering (MCL) algorithm on a distance matrix.

    Inputs:
        DISTANCE_MAT: A (neurons x neurons) numpy matrix calculated by some
            distance metric. It is converted to a networkx graph and then
            to a scipy sparse adjacency matrix before clustering.
        INFLATION: A number; the Hadamard power taken during the inflation
            step. In general, values from 1.1 to 10.0 can be tried, with
            higher values generally resulting in more clusters. Inflation
            boosts the probabilities of intra-cluster walks and demotes
            inter-cluster walks.
    Outputs:
        CLUSTERS: The clusters that mc.get_clusters extracts from the MCL
            result matrix.
        Q: The modularity score mc.modularity reports for this clustering
            (evaluated on the MCL result matrix). Modularity is the
            fraction of edges that fall within the clusters minus the
            expected fraction if edges were randomly distributed.
    """
    graph = nx.from_numpy_matrix(distance_mat)
    adjacency = nx.to_scipy_sparse_matrix(graph)
    mcl_result = mc.run_mcl(adjacency, inflation=inflation)
    clusters = mc.get_clusters(mcl_result)
    return clusters, mc.modularity(matrix=mcl_result, clusters=clusters)
Exemple #2
0
def test_modularity():
    """Check mc.modularity against the expected value of test case 4."""
    case = test_matrices[4]
    source = np.matrix(case[0])
    expected = case[1]

    # Cluster with default MCL parameters, then score against the source.
    mcl_result = mc.run_mcl(source)
    clusters = mc.get_clusters(mcl_result)

    assert np.isclose(mc.modularity(source, clusters), expected)
Exemple #3
0
def mcl(graph, viz=False):
    """
    Cluster ``graph`` with MCL, keeping the inflation with the best modularity.

    Every inflation value in ``np.arange(1.2, 3, 0.1)`` is tried; the
    clustering whose modularity (computed on the MCL result matrix) is the
    highest is kept.

    Parameters
    ----------
    graph : networkx graph
        Graph to cluster.
    viz : bool, optional
        When falsy (default) the clusters are returned. When truthy the
        clusters are drawn with ``mc.draw_graph`` and nothing is returned.

    Returns
    -------
    list of list or None
        With ``viz`` falsy: one list per cluster containing the node labels
        of ``graph``. With ``viz`` truthy: ``None``.
    """
    mat = nx.to_numpy_matrix(graph)

    # Track the best score seen so far; without updating it the loop would
    # simply keep the clustering of the last inflation value tried.
    best_q = float("-inf")
    clusters = []

    for val in np.arange(1.2, 3, 0.1):
        res = mc.run_mcl(mat, inflation=val)
        clust = mc.get_clusters(res)
        q = mc.modularity(matrix=np.asmatrix(res), clusters=clust)
        if q > best_q:
            best_q = q
            clusters = clust

    if not viz:
        # MCL works on matrix indices; map them back to the node labels.
        labels = dict(enumerate(graph.nodes()))
        return [[labels.get(item) for item in members] for members in clusters]

    plt.figure(num=None, figsize=(20, 20), dpi=50)
    mc.draw_graph(mat, clusters, node_size=200, with_labels=False, edge_color="silver")
Exemple #4
0
def clusters_mcl(oid, evs, inf=inf):
    """
    Cluster genes with MCL on a network built from an events table.

    Parameters
    ----------
    oid : str
        Identifier used as a prefix in log messages.
    evs : pandas.DataFrame
        Must provide the columns "in_gene", "out_gene" and "branch_support".
        Only rows whose branch_support is greater than the module-level
        ``min_support`` are used as edges.
    inf : float
        MCL inflation (defaults to the module-level ``inf``).

    Returns
    -------
    pandas.DataFrame
        Columns "node" (gene name) and "cluster" (cluster index as a string).
    """
    # MCL clustering: create network
    logging.info("%s Create network", oid)
    evs_e = evs[["in_gene", "out_gene", "branch_support"]]
    evs_e = evs_e[evs_e["branch_support"] > min_support]
    evs_n = nx.convert_matrix.from_pandas_edgelist(
        evs_e, source="in_gene", target="out_gene", edge_attr="branch_support")
    # Graph.node was a deprecated alias that newer networkx releases dropped;
    # Graph.nodes() is available everywhere.
    evs_n_nodelist = list(evs_n.nodes())
    evs_m = nx.to_scipy_sparse_matrix(evs_n, nodelist=evs_n_nodelist)

    # MCL clustering: run clustering
    logging.info("%s MCL clustering, inflation = %f", oid, inf)
    mcl_m = markov_clustering.run_mcl(evs_m, inflation=inf)
    mcl_c = markov_clustering.get_clusters(mcl_m)
    logging.info("%s MCL clustering, num clusters = %i", oid, len(mcl_c))

    # MCL clustering: save output (one row per clustered node)
    cluster_ids = [cid for cid, members in enumerate(mcl_c) for _ in members]
    node_idx = [idx for members in mcl_c for idx in members]
    clu = pd.DataFrame({
        "node": [evs_n_nodelist[idx] for idx in node_idx],
        "cluster": cluster_ids,
    }, columns=["node", "cluster"])
    clu["cluster"] = clu["cluster"].astype(str)
    logging.info("%s MCL clustering, num clustered genes = %i", oid, len(clu))

    return clu
Exemple #5
0
    def cluster(self, collection, vectorizer) -> List[Cluster]:
        """Group the articles of ``collection`` with Markov clustering.

        Each article is vectorized from "<title> <text>", a graph is built
        with ``self.temporal_graph`` from the vectors and article times, and
        MCL (default parameters) is run on it.

        Args:
            collection: object whose ``articles()`` yields items with
                ``title``, ``text`` and ``time`` attributes.
            vectorizer: object with ``transform`` / ``fit_transform``;
                ``fit_transform`` is used when ``transform`` fails.

        Returns:
            One ``Cluster`` per MCL cluster, largest cluster first.
        """
        articles = list(collection.articles())
        texts = ['{} {}'.format(a.title, a.text) for a in articles]
        try:
            X = vectorizer.transform(texts)
        except Exception:
            # presumably the vectorizer is not fitted yet; fit it on these
            # texts. Narrowed from a bare except so Ctrl-C still propagates.
            X = vectorizer.fit_transform(texts)

        times = [a.time for a in articles]

        print('temporal graph...')
        S = self.temporal_graph(X, times)
        print('run markov clustering...')
        result = mc.run_mcl(S)
        print('done')

        idx_clusters = mc.get_clusters(result)
        idx_clusters.sort(key=len, reverse=True)

        print(f'times: {len(set(times))} articles: {len(articles)} '
              f'clusters: {len(idx_clusters)}')

        clusters = []
        for c in idx_clusters:
            c_vectors = [X[i] for i in c]
            c_articles = [articles[i] for i in c]
            # Centroid = mean of the member vectors, kept sparse.
            Xc = sparse.vstack(c_vectors)
            centroid = sparse.csr_matrix(Xc.mean(axis=0))
            clusters.append(Cluster(c_articles, c_vectors, centroid=centroid))

        return clusters
Exemple #6
0
def score(G):
    """
    Compare three clusterings of ``G`` against the node colours.

    A copy of ``G`` with inverted edge weights is clustered with spectral
    clustering (2 clusters), Chinese Whispers (100 iterations) and MCL
    (inflation 2). The reference labelling is 1 for 'blue' nodes, 2 for
    'red' nodes and 0 otherwise. For each method the values of ``scores``
    followed by ``nmi`` are printed and returned, in the order
    (Chinese Whispers, spectral, MCL).
    """
    graph = G.copy()
    for edge in graph.edges:
        graph.edges[edge]['weight'] = 1. / graph.edges[edge]['weight']

    affinity = nx.adjacency_matrix(graph).todense()

    spectral = SpectralClustering(n_clusters=2, affinity='precomputed')
    fitted = spectral.fit(affinity)
    y_sc = fitted.labels_ + 1
    # Chinese Whispers writes its result into the 'label' node attribute.
    chinese_whispers(graph, iterations=100)

    mcl_result = mc.run_mcl(affinity, inflation=2)
    y_mc = mc_pred(graph, mc.get_clusters(mcl_result))

    y = np.zeros_like(fitted.labels_)
    cw_raw = []
    for position, node in enumerate(graph.nodes):
        attrs = graph.nodes[node]
        if attrs['color'] == 'blue':
            y[position] = 1
        elif attrs['color'] == 'red':
            y[position] = 2
        cw_raw.append(attrs['label'])
    # Re-encode the Chinese Whispers labels as consecutive integers.
    encoding = {value: code for code, value in enumerate(set(cw_raw))}
    y_cw = np.array([encoding[value] for value in cw_raw])

    def evaluate(pred):
        return (*scores(y, pred), nmi(y, pred))

    print(*evaluate(y_cw))
    print(*evaluate(y_sc))
    print(*evaluate(y_mc))

    return (evaluate(y_cw), evaluate(y_sc), evaluate(y_mc))
def clusters_mcl(phy_fn, phy_id, evou_d):
    """
    Cluster genes with MCL on a network built from an events table.

    Parameters
    ----------
    phy_fn :
        Not used by this function.
    phy_id : str
        Identifier used as a prefix in log messages.
    evou_d : pandas.DataFrame
        Must provide the columns "in_gene", "out_gene" and "branch_support";
        every row becomes an edge.

    The MCL inflation is read from the module-level ``inf``.

    Returns
    -------
    pandas.DataFrame
        Columns "node" (gene name) and "cluster" (cluster index as a string).
    """
    # MCL clustering: create network
    logging.info("%s Create network", phy_id)
    evou_e = evou_d[["in_gene", "out_gene", "branch_support"]]
    evou_n = networkx.convert_matrix.from_pandas_edgelist(
        evou_e, source="in_gene", target="out_gene", edge_attr="branch_support")
    # Graph.node was a deprecated alias that newer networkx releases dropped;
    # Graph.nodes() is available everywhere.
    evou_n_nodelist = list(evou_n.nodes())
    evou_m = networkx.to_scipy_sparse_matrix(evou_n, nodelist=evou_n_nodelist)

    # MCL clustering: run clustering
    logging.info("%s MCL clustering, inflation = %f", phy_id, inf)
    mcl_m = markov_clustering.run_mcl(evou_m, inflation=inf)
    mcl_c = markov_clustering.get_clusters(mcl_m)
    logging.info("%s MCL clustering, num clusters = %i", phy_id, len(mcl_c))

    # MCL clustering: save output (one row per clustered node)
    cluster_ids = [cid for cid, members in enumerate(mcl_c) for _ in members]
    node_idx = [idx for members in mcl_c for idx in members]
    mcl_c_out = pd.DataFrame({
        "node": [evou_n_nodelist[idx] for idx in node_idx],
        "cluster": cluster_ids,
    }, columns=["node", "cluster"])
    mcl_c_out["cluster"] = mcl_c_out["cluster"].astype(str)
    logging.info("%s MCL clustering, num clustered genes = %i", phy_id, len(mcl_c_out))

    return mcl_c_out
Exemple #8
0
 def fit(self, inflation=2.0):
     """Run MCL on ``self.matrix`` once and return the clusters.

     The result is cached in ``self.result`` / ``self.clusters``; when a
     result is already present, ``inflation`` is ignored and the cached
     clusters are returned.
     """
     if self.result is not None:
         return self.clusters

     self.result = mc.run_mcl(self.matrix, inflation=inflation)
     self.clusters = mc.get_clusters(self.result)
     print(self.clusters[0][0])
     print("number of clusters:", len(self.clusters))
     return self.clusters
Exemple #9
0
def mcl_parameter_QC(network, range_from=15, range_to=26):
    """
    Print the MCL modularity for a range of inflation values.

    Inflation runs over ``i / 10`` for ``i`` in ``range(range_from,
    range_to)``, i.e. 1.5 to 2.5 with the defaults. Modularity is computed
    on the MCL result matrix of each run. Nothing is returned.
    """
    adjacency = nx.to_scipy_sparse_matrix(network)
    inflations = [tenth / 10 for tenth in range(range_from, range_to)]
    for inflation in inflations:
        mcl_result = mc.run_mcl(adjacency, inflation=inflation)
        Q = mc.modularity(matrix=mcl_result,
                          clusters=mc.get_clusters(mcl_result))
        print("inflation:", inflation, "modularity:", Q)
Exemple #10
0
def MCL(cdr3, edgelist=None, mcl_hyper=(1.2, 2), outfile=None):
    """
    Perform clustering on a network of CDR3 amino acid sequences with
    a known hamming distance, using the Markov clustering (MCL) algorithm.
    For more info about the inflation and expansion parameters,
    visit: https://micans.org/mcl/


    Parameters
    ----------
    cdr3 :
        Input handed to ``create_edgelist`` when no edgelist is supplied
        (presumably the CDR3 sequences to cluster).
    edgelist : set, optional
        Tab-separated edgelist. The default is None, in which case it is
        built from ``cdr3``.
    mcl_hyper : sequence, optional
        MCL hyperparameters: inflation and expansion.
        The default is (1.2, 2).
    outfile : str, optional
        Name of outfile. The default is None (nothing is written).

    Returns
    -------
    clusters : pd.DataFrame
        pd.DataFrame containing two columns: 'CDR3' and 'cluster'.
        The first column contains CDR3 sequences, the second column
        contains the corresponding cluster ids. Empty when networkx
        raises a NetworkXError.
    """
    if edgelist is None:
        edgelist = create_edgelist(cdr3)

    try:
        G = nx.parse_adjlist(edgelist, nodetype=str)
        m = nx.to_scipy_sparse_array(G)

        # Run MCL
        result = mcl.run_mcl(m, inflation=mcl_hyper[0], expansion=mcl_hyper[1])
        mcl_output = mcl.get_clusters(result)
        identifiers = list(G.nodes())

        # Map matrix indices back to sequences, one row per clustered node
        rows = [(identifiers[idx], cluster_id)
                for cluster_id, members in enumerate(mcl_output)
                for idx in members]
        clusters = pd.DataFrame(data={
            "CDR3": [seq for seq, _ in rows],
            "cluster": [cluster_id for _, cluster_id in rows],
        })

        # Write to file
        if outfile is not None:
            clusters.to_csv(outfile, sep="\t", index=False)
    except nx.NetworkXError:
        clusters = pd.DataFrame({"CDR3": [], "cluster": []})

    return clusters
Exemple #11
0
def cluster_graph(graph, infl):
    """
    A naive markov clustering based only on inflation value.

    The sparse adjacency matrix of ``graph`` is L1-normalised along axis 1
    and clustered with MCL using inflation ``infl``. Returns the normalised
    matrix, the clusters and ``infl`` itself.
    """
    adjacency = nx.to_scipy_sparse_matrix(graph)
    m = normalize(adjacency, norm='l1', axis=1)
    mcl_result = mc.run_mcl(m, inflation=infl)
    return m, mc.get_clusters(mcl_result), infl
def optimise_inflation(matrix, start=1.1, end=2.5, step=0.1):
    """
    Find the MCL inflation with the highest modularity.

    Inflation values are ``np.arange(start, end, step)``. Returns the best
    inflation and its modularity, the latter as a one-element numpy array
    (a row of the internal (n, 1) score table).
    """
    inflations = np.arange(start, end, step).tolist()
    modularities = np.zeros(shape=(len(inflations), 1))
    for row, inflation in enumerate(inflations):
        mcl_result = markov_clustering.run_mcl(matrix, inflation=inflation)
        found = markov_clustering.get_clusters(mcl_result)
        modularities[row] = markov_clustering.modularity(matrix=mcl_result, clusters=found)
    best = np.argmax(modularities)
    return inflations[best], modularities[best]
Exemple #13
0
def clusters_mclw(evs, node_list, inf=inflation, verbose=True):
    """
    Weighted MCL clustering of the nodes in ``node_list``.

    When ``evs`` has rows, its "in_gene"/"out_gene" pairs become edges
    weighted by "branch_support", every entry of ``node_list`` is added as a
    node, and MCL is run with inflation ``inf`` and pruning threshold 0.
    When ``evs`` is empty, each node of ``node_list`` gets its own cluster.

    Returns a DataFrame with the columns "node" and "cluster".
    """
    import markov_clustering
    import networkx as nx

    if len(evs) == 0:
        if verbose:
            logging.info("There are no speciation events in this tree.")
        nodes_out = node_list
        cluster_ids = list(range(len(node_list)))
    else:
        # MCL clustering: create network
        if verbose:
            logging.info("Create network")
        weighted = evs[["in_gene", "out_gene", "branch_support"]]
        weighted.columns = ["in_gene", "out_gene", "weight"]
        graph = nx.convert_matrix.from_pandas_edgelist(
            weighted, source="in_gene", target="out_gene", edge_attr="weight")
        graph.add_nodes_from(node_list)
        ordered_nodes = list(graph.nodes())
        adjacency = nx.to_scipy_sparse_matrix(graph, nodelist=ordered_nodes)

        # MCL clustering: run clustering
        if verbose:
            logging.info("MCL weighted clustering, inflation = %.3f" % (inf))
        # NOTE: open question from the author - why does the pruning
        # threshold HAVE to be zero?
        converged = markov_clustering.run_mcl(adjacency,
                                              inflation=inf,
                                              pruning_threshold=0)
        found = markov_clustering.get_clusters(converged)
        if verbose:
            logging.info("MCL weighted clustering, num clusters = %i" %
                         (len(found)))

        # MCL clustering: flatten to one (node, cluster) pair per member
        cluster_ids = []
        nodes_out = []
        for cluster_id, members in enumerate(found):
            for idx in members:
                cluster_ids.append(cluster_id)
                nodes_out.append(ordered_nodes[idx])

    # output
    clu = pd.DataFrame({"node": nodes_out, "cluster": cluster_ids},
                       columns=["node", "cluster"])
    if verbose:
        logging.info("MCL clustering, num clustered genes = %i" % (len(clu)))

    return clu
Exemple #14
0
 def markov_clustering(JSONin):
     """Cluster the graph described by ``JSONin`` with default MCL settings.

     The graph comes from ``LengthCluster.make_graph_from_JSONin`` and the
     raw MCL clusters are passed through ``LengthCluster.cluster_transform``
     before being returned.
     """
     graph = LengthCluster.make_graph_from_JSONin(JSONin)
     adjacency = nx.to_scipy_sparse_matrix(graph)
     print('clustreing...')
     mcl_result = mc.run_mcl(adjacency)
     raw_clusters = mc.get_clusters(mcl_result)
     return LengthCluster.cluster_transform(cluster_list=raw_clusters,
                                            nx_graph=graph)
Exemple #15
0
def cluster_traces(traces: pd.Series, expansion: int,
                   inflation: float) -> list:
    """Cluster traces with MCL on their similarity matrix.

    A corpus is built from ``traces`` and used, together with a
    trace-vectorizing function bound to it, to create the similarity matrix
    that MCL runs on (``loop_value=0``).

    Args:
        traces: the traces to cluster.
        expansion: MCL expansion parameter.
        inflation: MCL inflation parameter.

    Returns:
        The clusters reported by ``mcl.get_clusters``. The previous
        ``-> None`` annotation did not match this return value.
    """
    corpus = create_corpus(traces)
    trace2vec_fn = partial(vectorize_trace, corpus)

    mat = create_similarity_matrix(corpus, traces, trace2vec_fn)

    result = mcl.run_mcl(mat,
                         expansion=expansion,
                         inflation=inflation,
                         loop_value=0)
    return mcl.get_clusters(result)
def community_detection(edge_index, num_nodes, edge_attr=None, method='mcl'):
    """Assign every node to a community found on the edge graph.

    Args:
        edge_index (Tensor): Edge index
        num_nodes (int): Number of nodes
        edge_attr (Tensor, optional): Edge attributes, used as edge weights
            when given. Defaults to None.
        method (str, optional): 'mcl' or 'louvain'. Defaults to 'mcl'.

    Raises:
        ValueError: Requires a valid clustering method ('mcl' or 'louvain')

    Returns:
        cluster Tensor, on the same device as ``edge_index``
    """
    # Build the networkx graph; nodes are added first so that nodes
    # without edges are still present.
    graph = nx.Graph()
    graph.add_nodes_from(range(num_nodes))

    edge_pairs = edge_index.transpose(0, 1).tolist()
    for edge_pos, (src, dst) in enumerate(edge_pairs):
        if edge_attr is not None:
            graph.add_edge(src, dst, weight=edge_attr[edge_pos])
        else:
            graph.add_edge(src, dst)

    device = edge_index.device

    # Louvain detection
    if method == 'louvain':
        partition = community.best_partition(graph)
        return torch.tensor(list(partition.values())).to(device)

    # MCL detection with default parameters
    if method == 'mcl':
        adjacency = nx.to_scipy_sparse_matrix(graph)
        found = mc.get_clusters(mc.run_mcl(adjacency))

        assignment = np.zeros(num_nodes).astype('int')
        for cluster_id, members in enumerate(found):
            assignment[list(members)] = cluster_id

        return torch.tensor(assignment).to(device)

    raise ValueError('Clustering method %s not supported' % method)
Exemple #17
0
def get_clusters(df,inflation=1.7,p=95,ntopics=5):
    """
    Vectorize the articles of ``df``, cluster them with MCL and label them.

    Inputs:
    ------
    df: the dataframe containing the data for the articles
    inflation: parameter that enter in the clustering algorithm. Increase value for more clusters.
               default typically yield ~10 clusters.
    p: percentile for the threshold, forwarded to get_adjacency_matrix.
    ntopics: the number of keywords for each cluster

    Outputs:
    ------
    df_adjacency_matrix: the adjacency matrix as a dataframe indexed (rows and columns) like df
    df_vectorized: dataframe with each abstract cleaned and vectorized.
    df_label: a copy of the original dataframe with a new 'Label' column containing the cluster ids
    most_frequentwords_dict: a dict relating each labels to its highest idf words with their weight

    """
    df_vectorized = prepare_vector(df)
    df_label = df.copy()

    distances = get_distance_matrix(df_vectorized)
    adjacency = get_adjacency_matrix(distances, p=p)
    df_adjacency_matrix = pd.DataFrame(adjacency,
                                       columns=df_label.index,
                                       index=df_label.index)

    graph = nx.from_numpy_matrix(adjacency)
    sparse_adjacency = nx.to_scipy_sparse_matrix(graph)
    mcl_result = mc.run_mcl(sparse_adjacency, inflation=inflation)
    found = mc.get_clusters(mcl_result)

    # Translate matrix positions into the index values of df_vectorized.
    row_ids = df_vectorized.index
    members_by_cluster = [row_ids[np.array(members)] for members in found]

    for cluster_id, members in enumerate(members_by_cluster):
        df_label.loc[members, 'Label'] = cluster_id

    most_frequentwords_dict = get_highest_idf_words_title(df_label, ntopics=ntopics)

    return df_adjacency_matrix, df_vectorized, df_label, most_frequentwords_dict
Exemple #18
0
def export_node_list_withimage(encode_data,
                               name_data,
                               matrix_form=False,
                               thresh=5,
                               inflation=1.5,
                               header=True,
                               delim=',',
                               label=None,
                               filename='node_list.csv'):
    """Cluster the data with MCL, draw the clusters and export a node list.

    Args:
        encode_data: an adjacency matrix when ``matrix_form`` is true,
            otherwise a sequence of node positions from which a random
            geometric graph (radius ``thresh``) is built.
        name_data: per-node strings; the part after the first 'RTW' marker
            (minus the last four characters) names the image.
        matrix_form: whether ``encode_data`` is already a matrix.
        thresh: radius of the random geometric graph.
        inflation: MCL inflation.
        header: write a header row first.
        delim: field delimiter, used for the rows and the header.
        label: per-node labels; defaults to the node indices as strings.
        filename: output path. It used to be ignored in favour of a
            hard-coded 'node_list.csv'; the default keeps that location.

    Cluster ids in the file start at 1. Nothing is returned.
    """
    print('building graph')
    if matrix_form:
        numnodes = encode_data.shape[0]
        matrix = encode_data
    else:
        numnodes = len(encode_data)
        positions = {i: encode_data[i] for i in range(numnodes)}
        # use networkx to generate the graph
        network = nx.random_geometric_graph(numnodes, thresh, pos=positions)

        # then get the adjacency matrix (in sparse form)
        matrix = nx.to_scipy_sparse_matrix(network)

    print('runing mcl')

    result = mc.run_mcl(matrix, inflation=inflation)
    clusters = mc.get_clusters(result)
    mc.draw_graph(matrix,
                  clusters,
                  node_size=50,
                  with_labels=False,
                  edge_color="silver")
    if label is None:
        label = [str(i) for i in range(numnodes)]

    with open(filename, 'w') as out:
        if header:
            out.write(delim.join(["Id", "Label", "Cluster-ID", "image"]) + "\n")
        for cluster_id, members in enumerate(clusters, 1):
            for node in members:
                # NOTE(review): when '!' is missing, find() returns -1 and
                # the short label loses its last character - confirm intent.
                pos = label[node].find('!')
                pos2 = name_data[node].find('RTW')
                fields = [
                    "\"" + label[node] + "\"",
                    "\"" + label[node][:pos] + "\"",
                    str(cluster_id),
                    "\"" + name_data[node][pos2 + 4:-4] + ".png" + "\"",
                ]
                out.write(delim.join(fields) + "\n")
Exemple #19
0
def graph_clustering(graph, cluster_rate=1.5, draw=False):
    """
    Cluster ``graph`` with MCL and tag every node with its cluster number.

    ``cluster_rate`` is the MCL inflation. The number of clusters is stored
    in ``graph.graph['Total_AS']`` and each node receives an ``AS_N``
    attribute (0-based cluster number). With ``draw`` the clusters are
    plotted. Nothing is returned; ``graph`` is modified in place.
    """
    adjacency = nx.to_scipy_sparse_matrix(graph)
    mcl_result = mc.run_mcl(adjacency, inflation=cluster_rate)
    clusters = mc.get_clusters(mcl_result)

    total = len(clusters)
    print("Number of AS: " + str(total))
    graph.graph['Total_AS'] = total

    # NOTE(review): cluster members are used directly as node ids, which
    # presumably assumes the nodes are labelled 0..n-1 - confirm.
    for as_number, members in enumerate(clusters):
        for n_id in members:
            graph.add_node(n_id, AS_N=as_number)

    if draw:
        mc.draw_graph(adjacency, clusters,
                      node_size=10, with_labels=False, edge_color="black", width=0.2)
        plt.show()
def community_detection_per_batch(edge_index,
                                  batch,
                                  num_nodes,
                                  edge_attr=None,
                                  method='mcl'):
    """Detect communities separately inside every batch element.

    Args:
        edge_index (Tensor): Edge index
        batch (Tensor): batch index of every node
        num_nodes (int): Number of nodes
        edge_attr (Tensor, optional): Edge attributes, used as edge weights
            when given. Defaults to None.
        method (str, optional): 'mcl' or 'louvain'. Defaults to 'mcl'.

    Raises:
        ValueError: Requires a valid clustering method ('mcl' or 'louvain')

    Returns:
        cluster Tensor on the device of ``edge_index``; cluster ids are
        offset per batch element so that ids of different batch elements
        never coincide.
    """
    # make the networkX graph
    g = nx.Graph()
    g.add_nodes_from(range(num_nodes))

    for iedge, (i, j) in enumerate(edge_index.transpose(0, 1).tolist()):
        if edge_attr is None:
            g.add_edge(i, j)
        else:
            g.add_edge(i, j, weight=edge_attr[iedge])

    num_batch = max(batch) + 1
    all_index = range(num_nodes)
    # Built once; it does not change between batch elements.
    node_ids = torch.tensor(all_index)
    cluster, ncluster = [], 0

    for iB in range(num_batch):

        index = node_ids[batch == iB].tolist()
        subg = g.subgraph(index)

        # detect the communities using Louvain method
        if method == 'louvain':
            c = community.best_partition(subg)
            cluster += [v + ncluster for v in c.values()]

        # detect communities using MCL
        elif method == 'mcl':
            matrix = nx.to_scipy_sparse_matrix(subg)
            result = mc.run_mcl(matrix)  # run MCL with default parameters
            mc_clust = mc.get_clusters(result)  # get clusters

            index = np.zeros(subg.number_of_nodes()).astype('int')
            for ic, c in enumerate(mc_clust):
                index[list(c)] = ic + ncluster
            cluster += index.tolist()

        else:
            raise ValueError('Clustering method %s not supported' % method)

        # Next free id. Using max(cluster) without the +1 made the first
        # cluster of the next batch element share an id with the last
        # cluster of this one.
        ncluster = max(cluster) + 1 if cluster else 0

    device = edge_index.device
    return torch.tensor(cluster).to(device)
Exemple #21
0
def community_detection(edge_index, num_nodes, edge_attr=None, method='mcl'):
    """Cluster the nodes of the edge graph with Louvain or MCL.

    ``edge_attr``, when given, provides the edge weights. The result is a
    tensor on the device of ``edge_index``. A ValueError is raised for any
    ``method`` other than 'louvain' or 'mcl'.
    """
    # Nodes go in first so that nodes without edges are kept.
    net = nx.Graph()
    net.add_nodes_from(range(num_nodes))

    for position, (u, v) in enumerate(edge_index.transpose(0, 1).tolist()):
        if edge_attr is not None:
            net.add_edge(u, v, weight=edge_attr[position])
        else:
            net.add_edge(u, v)

    target_device = edge_index.device

    if method == 'louvain':
        best = community.best_partition(net)
        return torch.tensor(list(best.values())).to(target_device)
    elif method == 'mcl':
        sparse_adj = nx.to_scipy_sparse_matrix(net)
        mcl_out = mc.run_mcl(sparse_adj)  # default MCL parameters
        groups = mc.get_clusters(mcl_out)

        # Nodes that appear in no cluster keep label 0.
        node_labels = np.zeros(num_nodes).astype('int')
        for group_id, group in enumerate(groups):
            node_labels[list(group)] = group_id

        return torch.tensor(node_labels).to(target_device)
    else:
        raise ValueError('Clustering method %s not supported' % method)
Exemple #22
0
    def cluster(self, collection, vectorizer, sbert=False) -> List[Cluster]:
        """Group the articles of ``collection`` with Markov clustering.

        Each article is vectorized from "<title> <text>", a graph is built
        with ``self.temporal_graph`` from the vectors and article times, and
        MCL (default parameters) is run on it.

        Args:
            collection: object whose ``articles()`` yields items with
                ``title``, ``text`` and ``time`` attributes.
            vectorizer: with ``sbert`` an encoder exposing ``encode``;
                otherwise an object with ``transform`` / ``fit_transform``.
            sbert: choose the ``encode`` code path.

        Returns:
            One ``Cluster`` per MCL cluster, largest cluster first.
        """
        articles = list(collection.articles())
        texts = ['{} {}'.format(a.title, a.text) for a in articles]
        if sbert:
            X = vectorizer.encode(texts,
                                  batch_size=128,
                                  show_progress_bar=True,
                                  device='cpu',
                                  num_workers=24)
        else:
            try:
                X = vectorizer.transform(texts)
            except Exception:
                # presumably the vectorizer is not fitted yet; narrowed
                # from a bare except so Ctrl-C still propagates.
                X = vectorizer.fit_transform(texts)

        times = [a.time for a in articles]

        print('temporal graph...')
        S = self.temporal_graph(X, times)
        print('run markov clustering...')
        result = mc.run_mcl(S)
        print('done')

        idx_clusters = mc.get_clusters(result)
        idx_clusters.sort(key=len, reverse=True)

        print(f'times: {len(set(times))} articles: {len(articles)} '
              f'clusters: {len(idx_clusters)}')

        # Decide once from X itself: the per-cluster row list is a plain
        # Python list, for which issparse() is always False.
        vectors_are_sparse = sparse.issparse(X)

        clusters = []
        for c in idx_clusters:
            c_vectors = [X[i] for i in c]
            c_articles = [articles[i] for i in c]

            if vectors_are_sparse:
                Xc = sparse.vstack(c_vectors)
                centroid = sparse.csr_matrix(Xc.mean(axis=0))
            else:
                Xc = np.vstack(c_vectors)
                centroid = np.mean(Xc, axis=0)
            clusters.append(Cluster(c_articles, c_vectors, centroid=centroid))

        return clusters
Exemple #23
0
def export_cluster_dirs(encode_data,
                        name_data,
                        matrix_form=False,
                        thresh=5,
                        inflation=1.5):
    """Cluster the data with MCL and copy each cluster's files into a folder.

    Args:
        encode_data: an adjacency matrix when ``matrix_form`` is true,
            otherwise a sequence of node positions from which a random
            geometric graph (radius ``thresh``) is built.
        name_data: per-node source file paths; the part after the first
            'RTW' marker is used in the copied file's name.
        matrix_form: whether ``encode_data`` is already a matrix.
        thresh: radius of the random geometric graph.
        inflation: MCL inflation.

    The "cluster" directory in the working directory is recreated, and
    every cluster with more than one member gets a numbered sub-directory
    (starting at 1). Nothing is returned.
    """
    import os
    import shutil

    print('building graph')
    if matrix_form:
        numnodes = encode_data.shape[0]
        matrix = encode_data
    else:
        numnodes = len(encode_data)
        positions = {i: encode_data[i] for i in range(numnodes)}
        # use networkx to generate the graph
        network = nx.random_geometric_graph(numnodes, thresh, pos=positions)

        # then get the adjacency matrix (in sparse form)
        matrix = nx.to_scipy_sparse_matrix(network)

    print('runing mcl')
    result = mc.run_mcl(matrix, inflation=inflation)
    clusters = mc.get_clusters(result)

    # Best-effort removal of a previous export (it may not exist).
    shutil.rmtree("cluster", ignore_errors=True)
    os.makedirs("cluster")

    cluster_no = 1
    for members in clusters:
        if len(members) == 1:
            # singletons are not exported and do not consume a number
            continue
        cluster_dir = "cluster/{}".format(cluster_no)
        os.makedirs(cluster_dir)
        for node in members:
            pos = name_data[node].find('RTW')
            shutil.copyfile(
                name_data[node],
                os.path.join(cluster_dir,
                             str(node) + name_data[node][pos + 4:]))
        cluster_no += 1
Exemple #24
0
def clusters_mcl(evd, inf=1.1):
    """
    Cluster genes with MCL on a network built from an events table.

    Parameters
    ----------
    evd : pandas.DataFrame
        Must provide the columns "in_gene", "out_gene" and "branch_support";
        every row becomes an edge.
    inf : float
        MCL inflation. The default is 1.1.

    Returns
    -------
    pandas.DataFrame
        Columns "node" (gene name) and "cluster" (cluster index as a string).
    """
    # MCL clustering: create network
    evou_e = evd[["in_gene", "out_gene", "branch_support"]]
    evou_n = networkx.convert_matrix.from_pandas_edgelist(
        evou_e, source="in_gene", target="out_gene", edge_attr="branch_support")
    # Graph.node was a deprecated alias that newer networkx releases dropped;
    # Graph.nodes() is available everywhere.
    evou_n_nodelist = list(evou_n.nodes())
    evou_m = networkx.to_scipy_sparse_matrix(evou_n, nodelist=evou_n_nodelist)

    # MCL clustering: run clustering
    mcl_m = markov_clustering.run_mcl(evou_m, inflation=inf)
    mcl_c = markov_clustering.get_clusters(mcl_m)

    # MCL clustering: save output (one row per clustered node)
    cluster_ids = [cid for cid, members in enumerate(mcl_c) for _ in members]
    node_idx = [idx for members in mcl_c for idx in members]
    clu = pd.DataFrame({
        "node": [evou_n_nodelist[idx] for idx in node_idx],
        "cluster": cluster_ids,
    }, columns=["node", "cluster"])
    clu["cluster"] = clu["cluster"].astype(str)

    return clu
def markov_cluster(mark_array):
    """Run MCL (inflation 1.5) on a square matrix and print the clusters.

    Prints the clusters, the shape of the cluster array and a final status
    line. Nothing is returned.
    """
    # Send square matrix to markov clustering algorithm
    result = mc.run_mcl(mark_array, inflation=1.5)
    clusters = mc.get_clusters(result)
    print("clusters: ", clusters)

    # Clusters usually differ in size; dtype=object keeps the conversion
    # working on NumPy versions that reject ragged input, with the same
    # resulting shape.
    cluster_array = np.asarray(clusters, dtype=object)
    print("size of cluster: ", cluster_array.shape)

    print("Successful MCL")
Exemple #26
0
def cluster_plot(adjacent,
                 title,
                 pos,
                 inflation=1.3,
                 filename=None,
                 labels=None,
                 label_flag=True,
                 node_size=150,
                 figsize=(6, 6),
                 use_nodeaslabel=False,
                 width=1):
    """
    Cluster ``adjacent`` with MCL and plot the clusters.

    ``pos`` may be None, in which case a spring layout (100 iterations) is
    computed. ``labels`` defaults to the node itself (``use_nodeaslabel``)
    or to the node's cluster number. The figure is saved to ``filename``
    when given, otherwise shown without blocking.

    Returns the clusters (after ``complete_cluster``), the layout and the
    labels that were used.
    """
    plt.subplots(1, 1, sharey=False, sharex=False, figsize=figsize)
    mcl_result = mc.run_mcl(adjacent, inflation=inflation)
    raw_clusters = mc.get_clusters(mcl_result)
    plt.title(title)

    graph = nx.Graph(adjacent)
    clusters = complete_cluster(raw_clusters, graph.nodes())

    if pos is None:
        pos = nx.spring_layout(graph, iterations=100)
    mc.draw_graph(adjacent,
                  clusters,
                  pos=pos,
                  with_labels=False,
                  edge_color="silver",
                  node_size=node_size,
                  width=width)

    if labels is None:
        if use_nodeaslabel:
            labels = {node: node for node in graph.nodes()}
        else:
            labels = {node: cluster_id
                      for cluster_id, members in enumerate(clusters)
                      for node in members}

    if label_flag:
        nx.draw_networkx_labels(graph, pos, labels=labels)

    if filename is None:
        plt.show(block=False)
    else:
        plt.savefig(filename, bbox_inches='tight')
    return clusters, pos, labels
Exemple #27
0
def cluster_main(args: argparse.Namespace):
    """
    Cluster the graph of a PAF file with MCL and write the assignments.

    Reads ``args.inpaf``, builds the graph with ``args.min_cov`` and runs
    MCL with ``args.expansion`` / ``args.inflation``. One
    "<cluster number>\\t<member>" line per node is printed to
    ``args.outfile`` (cluster numbers start at 1). When ``args.plot`` is
    set, the clusters are also plotted.
    """
    graph = paf_to_graph(PAF.from_file(args.inpaf), min_cov=args.min_cov)

    # Fix the node order so matrix indices can be mapped back to names.
    nodes = list(graph.nodes())
    matrix = nx.to_scipy_sparse_matrix(graph, nodelist=nodes)
    mcl_result = mc.run_mcl(matrix,
                            expansion=args.expansion,
                            inflation=args.inflation)
    clusters = mc.get_clusters(mcl_result)

    named_clusters = [[nodes[idx] for idx in members] for members in clusters]

    cluster_no = 1
    for members in named_clusters:
        for member in members:
            print(f"{cluster_no}\t{member}", file=args.outfile)
        cluster_no += 1

    if args.plot is not None:
        plot_clusters(args.plot, matrix, clusters, args.plot_height,
                      args.plot_width, args.plot_dpi)
Exemple #28
0
def test_clustering_structure(n_runs=20):
    """
    Compare k-means on a t-SNE embedding, MCL and Louvain over several runs.

    Each run regenerates the distance matrix via ``ensemble_density_huge``
    and scores the three labelings against ``true`` with NMI. ``true``,
    ``adj`` and ``G`` are not defined here and must exist in the enclosing
    scope. Means and standard deviations are printed; the three lists of
    per-run NMI values are returned as (GT, MCL, Louvain).
    """
    nmis_gt, nmis_mcl, nmis_louvain = [], [], []
    for run in range(n_runs):
        print("Run number {0}".format(run))
        ensemble_density_huge("file.csv", "\\t")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t",
                                 header=None).values
        dist_dense = dist_dense[:, :-1]  # drop the last column
        scaler = QuantileTransformer(n_quantiles=10)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        embedding = TSNE(metric="precomputed").fit_transform(dist_dense_scaled)
        kmeans = KMeans(n_clusters=len(set(true)))
        labels_dense_kmeans = kmeans.fit_predict(embedding)

        # MCL with default parameters; nodes in no cluster keep label 0.
        clusters_mcl = [0] * len(adj)
        mcl_result = mc.run_mcl(adj)
        for cluster_id, members in enumerate(mc.get_clusters(mcl_result)):
            for node in members:
                clusters_mcl[node] = cluster_id

        partition = louvain.best_partition(G)
        labels_louvain = list(partition.values())

        nmis_gt.append(
            nmi(labels_dense_kmeans, true, average_method="arithmetic"))
        nmis_mcl.append(nmi(clusters_mcl, true, average_method="arithmetic"))
        nmis_louvain.append(
            nmi(labels_louvain, true, average_method="arithmetic"))

    print("GT : {0}, {1}".format(np.mean(nmis_gt), np.std(nmis_gt)))
    print("MCL : {0}, {1}".format(np.mean(nmis_mcl), np.std(nmis_mcl)))
    print("Louvain : {0}, {1}".format(np.mean(nmis_louvain),
                                      np.std(nmis_louvain)))
    return (nmis_gt, nmis_mcl, nmis_louvain)
def RunMCL(graph):
    """Run Markov clustering once on ``graph`` (inflation 2).

    Returns the clusters as lists of tokens together with their modularity.
    When the clusters do not contain exactly as many entries as the graph
    has nodes, both values come from ``EnforceOneToOneMapping`` instead.
    """
    # adjacency matrix plus the index -> token lookup
    adj_mat, idx_to_token_mapping = AdjacencyMatrix(graph)
    dense_adj = adj_mat.toarray()  # sparse -> numpy array
    converged = mc.run_mcl(dense_adj, inflation=2)
    index_clusters = mc.get_clusters(converged)

    # token representation of the clusters
    token_clusters = [[idx_to_token_mapping[idx] for idx in members]
                      for members in index_clusters]

    # a mismatch means some node is repeated or missing in the clusters
    clustered_total = sum(len(members) for members in token_clusters)
    if graph.number_of_nodes() == clustered_total:
        modularity = Modularity(graph, token_clusters)
    else:
        token_clusters, modularity = EnforceOneToOneMapping(graph, token_clusters)

    return token_clusters, modularity
Exemple #30
0
def find_clusters(embeddings, similarity_threshold, inflation):
    """
    Identify clusters within embedded data.

    Distances are derived from the embeddings, converted to a similarity
    matrix using the threshold, and clustered with Markov clustering.

    :param embeddings: embedded data
    :type embeddings: numpy array-like
    :param similarity_threshold: Can be used to tune the clustering
                                 performance.
    :type similarity_threshold: real number
    :param inflation: Markov clustering inflation. Used to control the
                      granularity of the clustering. Low values give
                      fewer, larger clusters. Higher values give more,
                      smaller clusters.
    :type inflation: real number
    :return: The identified clusters, largest first
    :rtype: list
    """
    distances = distances_from_embeddings(embeddings)
    similarity = similarity_from_dists(distances, similarity_threshold)
    mcl_result = mc.run_mcl(similarity, inflation=inflation)
    found = mc.get_clusters(mcl_result)
    return sorted(found, key=len, reverse=True)