Example #1
import networkx as nx
import numpy as np


# Assumes get_partitions and modularity_density are defined elsewhere in the
# surrounding module.
def compute_eval_metrics(G, array_connection, clustering, labels=None):
    if labels is None:
        # Unsupervised metrics only: modularity Q, modularity density D,
        # and the number of communities found.
        return (nx.community.quality.modularity(G,
                                                get_partitions(G, clustering)),
                modularity_density(array_connection, clustering,
                                   np.unique(clustering)),
                len(np.unique(clustering)))
    else:
        from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score

        # Align the ground-truth labels and predicted clusters by node key,
        # since the two mappings may not share an ordering.
        aux = []
        aux_c = []
        for key in G.nodes:
            aux.append(labels[key])
            aux_c.append(clustering[key])

        clustering = aux_c
        labels = np.array(aux, dtype=int)

        # Unsupervised metrics plus agreement with the ground truth
        # (adjusted Rand index and adjusted mutual information).
        return ((nx.community.quality.modularity(G,
                                                 get_partitions(G,
                                                                clustering)),
                 modularity_density(array_connection, clustering,
                                    np.unique(clustering)),
                 len(np.unique(clustering))),
                (adjusted_rand_score(labels, clustering),
                 adjusted_mutual_info_score(labels, clustering)))
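A minimal usage sketch: evaluating an arbitrary two-block labelling of the karate club graph. This assumes get_partitions and modularity_density are available from the same module as above; the split point is illustrative.

import networkx as nx
import numpy as np

G = nx.karate_club_graph()
adj = nx.to_scipy_sparse_matrix(G)
clustering = np.zeros(len(G), dtype=int)
clustering[17:] = 1                       # arbitrary two-community labelling
Q, D, n_comms = compute_eval_metrics(G, adj, clustering)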
Example #2
import networkx as nx
import numpy as np
# Assumes fine_tuned_clustering_q, fine_tuned_clustering_qds, and metrics come
# from the modularity-density code used elsewhere in this file.


def detect(rel_df, resolution=0.0):
    df = rel_df.copy(deep=True)
    # Normalize column names so 'Weight'/'weight' etc. all match below
    df = df.rename(mapper=lambda name: name.lower(), axis='columns')

    G_original = nx.from_pandas_edgelist(df, edge_attr=['weight', 'change'])
    # Relabel to consecutive integers so node ids align with matrix rows
    G = nx.relabel.convert_node_labels_to_integers(G_original)

    # Ensure everyone is present, even if no edge points towards them
    # if nodes is not None:
    #     for node in nodes:
    #         if not G.has_node(node):
    #             G.add_node(node)

    adj = nx.to_scipy_sparse_matrix(G)

    # Toggle between maximizing modularity density (QDS) and modularity Q
    density = False

    if density:
        c = fine_tuned_clustering_qds(G)
    else:
        c = fine_tuned_clustering_q(G, r=resolution)

    Q = metrics.modularity_r(adj, c, np.unique(c), r=resolution)
    D = metrics.modularity_density(adj, c, np.unique(c))

    print("Communities:", c)
    print("#comms: ", len(np.unique(c)), 'given by', np.unique(c))
    print("Modularity Q:", Q)
    print("Modularity D:", D)
    print("Mode: ", 'D' if density else 'Q')

    # Attach each node's community label; the enumeration order matches the
    # integer relabelling used to compute 'c'
    for i, node in enumerate(G_original.nodes()):
        G_original.add_node(node, modularity_class=str(c[i]))

    # print(G_original.nodes())

    return {
        'graph': G_original,
        'communities': c,
        'Q': Q,
        'D': D,
    }
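A hedged usage sketch for detect() with a toy edge list; the column names and values below are illustrative (they get lower-cased inside the function) and the graph is deliberately tiny.

import pandas as pd

toy = pd.DataFrame({'Source': ['a', 'b', 'c', 'd'],
                    'Target': ['b', 'c', 'a', 'a'],
                    'Weight': [1.0, 2.0, 1.5, 0.5],
                    'Change': [0, 1, 0, 1]})
result = detect(toy, resolution=0.0)
print(result['communities'], result['Q'], result['D'])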
def forced_split_communities_qds(adj, c, cluster_size, normalize,
                                 evd_method, tolerence, seed):
    """Force splits the communities in graph, if the size of the first_community
       is greater than the threshold, such that the splitting least
       compromizes modularity density.

    Parameters
    ----------
    adj : SciPy sparse matrix (csr or csc)
        The N x N Adjacency matrix of the graph.
    c : Integer array
        Current array of community labels for the nodes in the graph as
        ordered by the adjacency matrix.
    cluster_size : integer
        Threshold/maximum size (number of nodes) of a cluster.
    normalize : bool
        Whether the normalized Laplacian matrix is used.
    evd_method : string
        Method of eigenvalue computation. It should be one of 'tracemin'
        (TraceMIN), 'lanczos' (Lanczos iteration) and 'lobpcg' (LOBPCG).
    tolerence : float
        Tolerance of relative residual in eigenvalue computation.
    seed : integer, random_state, or None
        Indicator of random number generation state.

    Returns
    -------
    Integer array
        Array of community labels, as a result of splitting, for the nodes
        in the graph as ordered by the adjacency matrix.

    """

    # Array of unique community labels
    unique_clusters = np.unique(c)

    # Tracks the nodes in each community
    dict_bool = {}

    # Tracks the clusters that are connected to each community
    dict_connected = {}

    for label in unique_clusters:
        # Track the nodes in each community
        dict_bool[label] = (c == label)

        # Initialize each key to an empty set
        dict_connected[label] = set()

    # Track the clusters that are connected to each community
    for comm1 in unique_clusters[:-1]:
        # index of the community 'comm1'
        i = np.where(unique_clusters == comm1)[0][0]
        bool_1 = dict_bool[comm1]
        adj_comm1 = adj[bool_1]

        # Track the clusters that are connected to community 'comm1'
        for comm2 in unique_clusters[i+1:]:
            bool_2 = dict_bool[comm2]
            zero = np.zeros(len(c), dtype=int)
            zero[bool_2] = 1

            # Check if 'comm2' is connected to 'comm1'
            if ((adj_comm1.dot(zero)).sum()) != 0:
                dict_connected[comm1].add(comm2)
                dict_connected[comm2].add(comm1)

    # Create a copy of cluster labels
    c_new = c.copy()

    # Split each community, whose size is greater than the threshold
    for cluster_num in unique_clusters:

        bool_r = dict_bool[cluster_num]

        # Sparse adjacency matrix corresponding to 'cluster_num'
        sub_adj = adj[bool_r].T[bool_r]

        # Subgraph constructed from sparse adjacency matrix of 'cluster_num'
        g = nx.from_scipy_sparse_matrix(sub_adj)
        # Number of nodes in 'g'
        len_g = len(g)

        # Skip singleton communities, communities at or below the size
        # threshold, and communities containing disconnected modules
        if len_g == 1 or len_g <= cluster_size or not nx.is_connected(g):

            if not nx.is_connected(g):
                print("Warning: check your data, as an earlier iteration "
                      "resulted in a cluster with internal "
                      "disconnected components")
            continue
        else:

            # Create an array of community labels for the
            # nodes in 'cluster_num'
            c_sub = np.zeros(len_g, dtype=int)

            # indices of the nodes in 'sub_adj'
            sub_index = np.arange(len_g)

            # Determine the fiedler_vector of subgraph 'g'
            f_vector = fiedler_vector(g, weight='weight', normalized=normalize,
                                      tol=tolerence,
                                      method=evd_method, seed=seed)

            # Rearrange the nodes of 'sub_adj' in decreasing order of their
            # Fiedler vector entries
            nodeIds = [i for val, i in sorted(zip(f_vector, sub_index),
                                              reverse=True)]

            # Initialize the communities corresponding to
            # bipartitioning of 'cluster_num'
            first_community = []
            second_community = []
            second_community.extend(nodeIds)

            # Records the splitting information
            split_info = {}

            # Create a copy of the latest cluster labels
            c_latest = c_new.copy()

            # Create a copy of 'dict_bool'
            dict_bool_copy = dict_bool.copy()

            # Possible splits of 'cluster_num' based on the Fiedler vector
            for j in range(len(nodeIds)-1):

                # Split the 'cluster_num' into two clusters
                first_community.append(nodeIds[j])
                second_community.remove(nodeIds[j])

                # Graph induced by nodes in 'first_community'
                g1 = g.subgraph(first_community)

                # Graph induced by nodes in 'second_community'
                g2 = g.subgraph(second_community)

                # Proceed only if both halves induce connected subgraphs
                if nx.is_connected(g1) and nx.is_connected(g2):
                    # Relabel the cluster labels of nodes in 'cluster_num'
                    c_sub[first_community] = cluster_num
                    new_label = max(c_new) + 1
                    c_sub[second_community] = new_label

                    # Array of the union of connected clusters of the
                    # split communities of 'cluster_num'
                    conn_clusters = \
                        np.array(list(((dict_connected[cluster_num]) |
                                 set([cluster_num, new_label]))))

                    # Update the cluster labels in 'c_latest'
                    c_latest[bool_r] = c_sub

                    # Update the boolean array of the split communities
                    # of 'cluster_num'
                    dict_bool_copy[cluster_num] = (c_latest == cluster_num)
                    dict_bool_copy[new_label] = (c_latest == new_label)

                    # Calculate the modularity density after
                    # splitting 'cluster_num'
                    div_metric = modularity_density(adj,
                                                    c_latest,
                                                    np.unique(c_sub),
                                                    dict_bool_copy,
                                                    conn_clusters)

                    # Record the split
                    split_info[div_metric] = j

            # Delete to save memory
            del c_latest
            del dict_bool_copy

            # Check if at least one instance of splitting 'cluster_num' exists
            # that does not result in disconnected modules
            if len(split_info) > 0:
                # Split 'cluster_num' based on the division that
                # least compromises modularity density
                best_split = split_info[max(split_info.keys())]
                c_sub[nodeIds[0:best_split+1]] = cluster_num
                c_sub[nodeIds[best_split+1:]] = max(c_new) + 1

                # Update 'c_new' with new community labels as a
                # result of splitting 'cluster_num'
                c_new[bool_r] = c_sub
            else:
                print("No split possible for cluster num: {}, as any further "
                      "split results in disconnected modules"
                      .format(cluster_num))

    # Array of community labels, as a result of splitting, for the nodes
    # in the graph as ordered by the adjacency matrix
    return c_new
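A sketch of calling forced_split_communities_qds directly (in the surrounding code it is presumably driven by fine_tuned_clustering_qds rather than called by hand); the parameter values simply follow the docstring and are not prescriptive.

import networkx as nx
import numpy as np

G = nx.karate_club_graph()
adj = nx.to_scipy_sparse_matrix(G)
c0 = np.zeros(adj.shape[0], dtype=int)   # start from a single community
c_split = forced_split_communities_qds(adj, c0, cluster_size=20,
                                       normalize=False, evd_method='lanczos',
                                       tolerence=1e-08, seed=100)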
def constrained_merge_communities_qds(adj, c, cluster_size):
    """Merges the communities in graph if the merging improves modularity density,
       under the condition that the merging does not result in a community
       size greater than the threshold.

    Parameters
    ----------
    adj : SciPy sparse matrix (csr or csc)
        The N x N Adjacency matrix of the graph.
    c : Integer array
        Current array of community labels for the nodes in the graph as
        ordered by the adjacency matrix.
    cluster_size : integer
        Threshold/maximum size (number of nodes) of a cluster.

    Returns
    -------
    Integer array
        Array of community labels, as a result of merging, for the nodes
        in the graph as ordered by the adjacency matrix.

    """

    # Array of unique community labels
    unique_clusters = np.unique(c)

    # Tracks the nodes in each community
    dict_bool = {}

    # Tracks the clusters that are connected to each community
    dict_connected = {}

    for label in unique_clusters:
        # Track the nodes in each community
        dict_bool[label] = (c == label)

        # Initialize each key to an empty set
        dict_connected[label] = set()

    # Records the merging information
    merging_info = {}

    # Tracks communities that are connected to at least one other community
    unique_clusters2 = []
    for comm in unique_clusters:
        bool_1 = dict_bool[comm]

        zero = np.zeros(adj.shape[0], dtype=int)
        zero[~bool_1] = 1

        # Check if the community 'comm' is connected to at least one
        # other community and its size is less than the threshold
        if ((adj[bool_1].dot(zero)).sum() != 0) & \
           (np.count_nonzero(bool_1) < cluster_size):
            # Record the community 'comm'
            unique_clusters2.append(comm)

    # Convert the list of community labels to array
    unique_clusters2 = np.array(unique_clusters2)

    # Track the clusters that are connected to each community
    for comm1 in unique_clusters2[:-1]:
        # index of the community 'comm1'
        i = np.where(unique_clusters2 == comm1)[0][0]
        bool_1 = dict_bool[comm1]
        adj_comm1 = adj[bool_1]

        # Track the clusters that are connected to community 'comm1'
        for comm2 in unique_clusters2[i+1:]:
            bool_2 = dict_bool[comm2]
            zero = np.zeros(len(c), dtype=int)
            zero[bool_2] = 1

            # Check if 'comm2' is connected to 'comm1'
            if ((adj_comm1.dot(zero)).sum()) != 0:
                dict_connected[comm1].add(comm2)
                dict_connected[comm2].add(comm1)

    # Determine the contribution of each community to modularity density
    comm_metric = np.array([modularity_density(adj, c,
                           [cluster_num], dict_bool,
                           np.array(list(dict_connected[cluster_num])))
                           for cluster_num in unique_clusters2])

    # Record the improvement in modularity density for
    # each pair of connected clusters
    for comm1 in unique_clusters2[:-1]:
        # Modularity density for community 'comm1'
        metric_1 = comm_metric[unique_clusters2 == comm1][0]

        # index of the community 'comm1'
        i = np.where(unique_clusters2 == comm1)[0][0]
        bool_1 = dict_bool[comm1]

        # Prospective merger communities of 'comm1'
        for comm2 in unique_clusters2[i+1:]:

            # boolean indices of 'comm2'
            bool_2 = dict_bool[comm2]

            # Consider merging only if 'comm2' is connected to 'comm1', and
            # the merging results in a cluster size less than
            # or equal to the threshold
            if (comm2 in dict_connected[comm1]) & \
               (np.count_nonzero(bool_1 | bool_2) <= cluster_size):

                # Create a copy of cluster labels
                c_latest = c.copy()

                # Create a copy of 'dict_bool'
                dict_bool_copy = dict_bool.copy()

                # Modularity density value for community 'comm2'
                metric_2 = comm_metric[unique_clusters2 == comm2][0]

                # Label of the merged community
                merged_label = min(comm1, comm2)

                # Update the array of community labels to determine the
                # new value of the Modularity density (as a result of merging)
                c_latest[bool_1 | bool_2] = merged_label

                # Update the boolean array of the merged community
                dict_bool_copy[merged_label] = (bool_1 | bool_2)

                # Array of connected clusters of the new merged community
                conn_clusters = np.array(list(((dict_connected[comm1] |
                                         dict_connected[comm2]) -
                                         set([comm1, comm2]))))

                # Calculate the difference in modularity density for
                # merging 'comm1' and 'comm2'
                div_metric = modularity_density(adj, c_latest,
                                                np.array([merged_label]),
                                                dict_bool_copy,
                                                conn_clusters) - \
                                               (metric_1 + metric_2)

                # Record the above merge only if it improves modularity density
                if div_metric > 0:
                    merging_info[div_metric] = (comm1, comm2)

                # Delete to save memory
                del c_latest
                del dict_bool_copy

    # Tracks communities which have already merged
    comms_list = []
    # Create a copy of cluster labels
    c_new = c.copy()

    # Check if at least one instance of merging exists that
    # improves modularity density
    if len(merging_info) > 0:
        # Sort the merging_info in the descending order of 'div_metric'
        for div_metric in sorted(merging_info.keys(), reverse=True):

            # Consider each pair of clusters, which improve
            # modularity density when merged
            (comm1, comm2) = merging_info[div_metric]

            # Check if 'comm1' or 'comm2' has already been merged
            if comm1 not in comms_list and comm2 not in comms_list:
                # Merge the pair of communities
                comms_list.extend([comm1, comm2])

                # Label of the merged community
                c_new[dict_bool[comm1] | dict_bool[comm2]] = min(comm1, comm2)

    # Array of community labels, as a result of merging,
    # for the nodes in the graph as ordered by the adjacency matrix
    return c_new
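Companion sketch, reusing adj and c_split from the splitting sketch above: merge back under the same size cap and compare community counts before and after.

c_merged = constrained_merge_communities_qds(adj, c_split, cluster_size=20)
print(len(np.unique(c_split)), '->', len(np.unique(c_merged)))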

# Script driver. The imports below assume the modularitydensity package
# layout; adjust the paths if these functions live elsewhere.
import numpy as np
import pandas as pd
import networkx as nx
from modularitydensity import metrics
from modularitydensity.fine_tuned_modularity import fine_tuned_clustering_q


def mapname(name):
    # Log each column name as it is normalized to lower case
    print(name)
    return name.lower()


df = pd.read_csv('cc9_rel_undirected_nozeroes.csv')
df = df.rename(mapper=mapname, axis='columns')
print(df)
G = nx.from_pandas_edgelist(df, edge_attr=['weight', 'change'])
# G = nx.les_miserables_graph()
G = nx.relabel.convert_node_labels_to_integers(G)

print(G)

adj = nx.to_scipy_sparse_matrix(G)

# nx.connected_component_subgraphs was removed in NetworkX 2.4; iterate over
# connected components and induce subgraphs instead
for comp in nx.connected_components(G):
    gr = G.subgraph(comp)
    # Nodes of the subgraph 'gr'
    nodes_gr = list(gr)
    print(nodes_gr)

c = fine_tuned_clustering_q(G)
print(c)
Q = metrics.modularity_r(adj, c, np.unique(c), r=0)
D = metrics.modularity_density(adj, c, np.unique(c))

print("Modularity Q:", Q)
print("Modularity D:", D)