Ejemplo n.º 1
0
    def extract_with_density(self,
                             document: str) -> Sequence[Tuple[str, float]]:
        """Extract as keywords the nodes of the k-core selected by a density criterion

        The k-cores of the document's graph-of-words are visited from the highest k
        down to the lowest one, the density of each of them is computed, and the
        elbow method applied to that sequence of densities selects the k-core to keep.

        :param document: document from which the graph-of-words is built
        :return: (token, core number) pairs for the nodes of the selected k-core,
            sorted by decreasing core number; an empty list when the graph-of-words
            has no node
        """
        gow = self.builder.compute_gow_from_document(document)
        if len(gow.nodes) == 0:
            return []

        graph = gow.to_graph()

        # Core number of every node, weighted or not depending on the builder
        core_fn = core_number_weighted if self.builder.weighted else nx_core_number
        kcore_number = core_fn(graph)

        # Distinct core values, highest first: rank 0 is the k-core with the largest k
        descending_ks = sorted(set(kcore_number.values()), reverse=True)

        rank_to_k = {}   # rank in the descending sequence -> k
        core_by_k = {}   # k -> k-core subgraph
        densities = []   # (rank, density of the k-core)
        for rank, k in enumerate(descending_ks):
            core = k_core(graph, k=k, core_number=kcore_number)
            core_by_k[k] = core
            rank_to_k[rank] = k
            densities.append((rank, density(core)))

        # The elbow method returns a rank, which is mapped back to its k-core
        best_graph = core_by_k[rank_to_k[elbow(densities)]]

        # One (token, core number) pair per node of the selected k-core
        keywords = [
            (self.builder.get_token_(best_graph.nodes[node]['label']),
             kcore_number[node])
            for node in best_graph.nodes
        ]
        keywords.sort(key=lambda pair: pair[1], reverse=True)
        return keywords
Ejemplo n.º 2
0
# [4] Component Size
# Def: a connected component of a simple graph G is a subgraph in which every
#   vertex is connected to every other vertex of the subgraph by a path.
# Def: the component size is the number of nodes in the connected component
#   that contains i.
# Outputs an int.
# NOTE(review): C is presumably the component containing i; it is not defined
#   in this section — confirm against the code that builds it.

component_size = C.number_of_nodes()
number_connected_components = connected_components.number_connected_components(
    G)

# [5] Component Density
# Def: the number of edges in the graph divided by the total number of possible
#   edges the graph might have.
# Outputs a float.
# NOTE(review): the assignment below rebinds the name `component_density` to the
#   value returned by `component_density.density(C)`, so `component_density.density`
#   is no longer reachable under that name afterwards — confirm this is intended.

component_density = component_density.density(C)

# [6] Geodesic Distance
# Def: the number of edges on the shortest path between two vertices. We want
#   the average geodesic distance for the component containing i.
# geodesic_distance.closeness_centrality(C)
#   Outputs a dictionary of nodes with closeness centrality as the value, which
#       is the reciprocal of the definition used in the LaTeX document.
# We want: the average geodesic distance in the component where the initial
#   infection is first introduced.

# Outputs a float representing the average geodesic distance in the component
#   where the initial infection is first introduced.
# The AGD is computed by summing all geodesic lengths and dividing by the number
#   of geodesics.

average_geodesic_dist = geodesic_distance.average_shortest_path_length(C)
 # Build one row of summary statistics (`data`) per graph file in "output".
 for fname in os.listdir("output"):
     print(fname)
     try:
         G = read_dot(os.path.join("output", fname))
         nx.draw(G)
     except Exception:
         # Best-effort: a file that cannot be read or drawn is reported and
         # skipped. Only Exception is caught so that Ctrl-C (KeyboardInterrupt)
         # and SystemExit still stop the loop.
         print("cannot load graph")
         continue
     if G.number_of_nodes() == 0:
         print("Cannot read binary file")
         continue

     # File name followed by basic size figures
     data = [fname, G.number_of_nodes(), G.number_of_edges(), density(G)]

     # Each centrality result is reduced by properties_of_array and appended
     deg_centrality = degree_centrality(G)
     data.extend(properties_of_array(deg_centrality))
     cln_centrality = closeness_centrality(G)
     data.extend(properties_of_array(cln_centrality))
     btn_centrality = betweenness_centrality(G)
     data.extend(properties_of_array(btn_centrality))

     # Size of each per-source entry returned by shortest_path
     # NOTE(review): presumably the number of nodes reachable from each source;
     # this assumes shortest_path(G) returns a dict — confirm.
     st_path = shortest_path(G)
     deg = [len(val) for val in st_path.values()]
     d = np.array(deg)
     data.extend(
         [np.min(d),
          np.max(d),
          np.median(d),
          np.mean(d),
          np.std(d)])
Ejemplo n.º 4
0
def _plot_node_values(values, title, ylabel=None):
    """Plot one value per node, label the figure and show it.

    `values` is materialised into a list first: a dict view (``d.values()``)
    is not a sequence, and handing it to ``plt.plot`` directly is unreliable.
    """
    plt.plot(list(values))
    plt.title(title)
    plt.xlabel("Nodo")
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.show()


def ver_medidas(G):
    """Print and plot a series of structural measures of the graph G.

    Scalar measures are printed; per-node measures are plotted, one figure per
    measure, and each figure is shown before continuing. Nothing is returned.

    :param G: graph to analyse; its nodes are expected to carry a "crime"
        attribute, which is used for the attribute assortativity
    """
    print(function.info(G))

    # Minimum number of nodes that must be removed to disconnect G
    print("Numero minimo de nodos que deben ser removidos para desconectar G :"
          + str(approximation.node_connectivity(G)))

    # Average clustering coefficient of G (approximation module)
    print("average clustering coefficient of G: "
          + str(approximation.average_clustering(G)))

    # Density of the graph
    print("Densidad de G: " + str(function.density(G)))

    # Assortativity measures the similarity of connections in the graph with
    # respect to the node degree. Positive values of r indicate a correlation
    # between nodes of similar degree, negative values indicate correlations
    # between nodes of different degree.
    print("degree assortativity:"
          + str(assortativity.degree_assortativity_coefficient(G)))

    # Assortativity measures the similarity of connections in the graph with
    # respect to the given attribute.
    print("assortativity for node attributes: "
          + str(assortativity.attribute_assortativity_coefficient(G, "crime")))

    # Average neighbor degree of each node
    _plot_node_values(assortativity.average_neighbor_degree(G).values(),
                      "Grado promedio vecindad", ylabel="Grado")

    # Degree centrality of each node
    _plot_node_values(centrality.degree_centrality(G).values(),
                      "Grado de centralidad", ylabel="Centralidad")

    # Clustering coefficient of each node
    _plot_node_values(cluster.clustering(G).values(),
                      "coeficiente de agrupamiento")

    # Mean clustering coefficient
    print("Coeficiente de agrupamiento de G:"
          + str(cluster.average_clustering(G)))

    # Center of the graph: the subgraph induced by the set of vertices of
    # minimum eccentricity. The eccentricity of a vertex v is the maximum
    # distance from v to any other vertex of G along shortest paths.
    print("Centro de G:" + str(distance_measures.center(G)))

    # Diameter of the graph: the maximum eccentricity.
    print("Diametro de G:" + str(distance_measures.diameter(G)))

    # Eccentricity of each node: the maximum distance from the node to all
    # other nodes in G.
    _plot_node_values(distance_measures.eccentricity(G).values(),
                      "Excentricidad de cada Nodo")

    # Periphery: the set of nodes with eccentricity equal to the diameter.
    print("Periferia de G:")
    print(distance_measures.periphery(G))

    # Radius: the minimum eccentricity.
    print("Radio de G:" + str(distance_measures.radius(G)))

    # PageRank computes a ranking of the nodes of G based on the structure of
    # the incoming links. It was originally designed as an algorithm to rank
    # web pages.
    _plot_node_values(link_analysis.pagerank_alg.pagerank(G).values(),
                      "Puntaje de cada Nodo")

    # Small-world coefficient (sigma).
    # A graph is commonly classified as small-world if sigma > 1.
    print("Coeficiente de Small World: " + str(smallworld.sigma(G)))

    # The small-world coefficient (omega) ranges between -1 and 1.
    # Values close to 0 mean that G features small-world characteristics.
    # Values close to -1 mean G has a lattice shape whereas values close
    # to 1 mean G is a random graph.
    print("Omega coeficiente: " + str(smallworld.omega(G)))
def compute_directed_graph_metrics(G):
    """Compute a flat dictionary of metrics describing the directed graph G.

    The result holds, in this order: edge count, density and reciprocity;
    statistics of the in-degrees, then of the out-degrees; the correlation
    between out- and in-degrees; the dyad metrics; the fraction of connected
    node pairs; the clustering coefficients; the centralization metrics.

    G must be exactly an nx.DiGraph (subclasses are rejected by the assert).
    `reciprocity` is None when G has no edge.
    """
    assert type(G) is nx.DiGraph

    n_edges = len(G.edges)

    # per-node degrees and number of nodes for each distinct degree value
    in_degrees = np.array([deg for _, deg in G.in_degree()])
    out_degrees = np.array([deg for _, deg in G.out_degree()])
    _, in_degrees_k_freq = np.unique(in_degrees, return_counts=True)
    _, out_degrees_k_freq = np.unique(out_degrees, return_counts=True)

    out_in_degrees_corr = numeric_attribute_correlation(
        G, dict(G.out_degree), dict(G.in_degree))

    # dyad census and the metrics derived from it
    dyad_freq = dyadic_census(G)
    dyad_metrics = compute_dyad_metrics(dyad_freq)

    # reciprocity (based on networkx definition), undefined without edges
    if n_edges > 0:
        twice_d2 = 2 * dyad_freq["2"]
        reciprocity = twice_d2 / (dyad_freq["1"] + twice_d2)
    else:
        reciprocity = None

    # clustering
    global_clustering = transitivity(G)
    local_clustering_mean = average_clustering(G)

    # fraction of connected node pairs (path of any length)
    f_connected_node_pairs = fraction_of_connected_node_pairs(G)

    # centralization
    cent_metrics = centralization_metrics(G, prefix="_di")

    metrics = {
        "n_edges_di": n_edges,
        "density_di": density(G),
        "reciprocity": reciprocity,
    }

    # identical set of statistics for the in-degrees, then the out-degrees
    degree_families = (
        ("in_degrees", in_degrees, in_degrees_k_freq),
        ("out_degrees", out_degrees, out_degrees_k_freq),
    )
    for name, degs, k_freq in degree_families:
        metrics[f"{name}_mean"] = safe(np.mean, degs)
        metrics[f"{name}_var"] = safe(np.var, degs)
        metrics[f"{name}_hidx"] = safe(h_index, degs)
        metrics[f"{name}_gini"] = safe(gini, degs + eps)
        metrics[f"{name}_f0"] = safe(np.mean, (degs == 0))
        metrics[f"{name}_pk_ent"] = entropy(k_freq)
        metrics[f"{name}_pk_gini"] = gini(k_freq)

    # degree assortativity
    metrics["out_in_degrees_corr"] = out_in_degrees_corr
    # dyad metrics
    metrics.update(dyad_metrics)
    # fraction of connected node pairs with path of any length
    metrics["f_connected_node_pairs_di"] = f_connected_node_pairs
    # clustering coefficients
    metrics["global_clustering_di"] = global_clustering
    metrics["local_clustering_mean_di"] = local_clustering_mean
    # centralization
    metrics.update(cent_metrics)

    return metrics
def compute_undirected_graph_metrics(G):
    """Compute a flat dictionary of metrics describing the undirected graph G.

    The result holds: edge count and density; degree statistics; the fraction
    of connected node pairs; clustering coefficients; centralization and
    modularity metrics; the fraction of nodes in the largest connected
    component (CC1) and its algebraic connectivity; the number of connected
    components of size >= k for every k in CC_k_thresholds; and size, density
    and component count of the k-core / k-truss for every k in k_core_ks /
    k_truss_ks.

    G must be exactly an nx.Graph (subclasses are rejected by the assert).
    `algebraic_connectivity_CC1` is None when CC1 has at most two nodes or when
    its computation fails.
    """
    assert type(G) is nx.Graph

    # degrees stats
    degrees = np.array([i for _, i in G.degree])
    degrees_k_freq = np.unique(degrees, return_counts=True)[1]
    degrees_corr = numeric_attribute_correlation(G, dict(G.degree),
                                                 dict(G.degree))

    # clustering
    global_clustering = transitivity(G)
    local_clustering_mean = average_clustering(G)

    # fraction of connected node pairs (any path len)
    f_connected_node_pairs = fraction_of_connected_node_pairs(G)

    # centralization
    cent_metrics = centralization_metrics(G, prefix="_ud")

    # modularity
    modularity_metrics = compute_modularity_metrics(G)

    # connected components, computed once and reused for both the largest
    # component and the component-size counts
    CC = list(connected_components(G))

    # largest CC
    CC1_nodes = max(CC, key=len)
    CC1 = G.subgraph(CC1_nodes).copy()
    f_CC1_nodes = len(CC1) / len(G)

    # algebraic_connectivity of the largest CC (fixed seed)
    algebraic_connectivity_CC1 = None
    if len(CC1) > 2:
        try:
            algebraic_connectivity_CC1 = algebraic_connectivity(CC1, seed=0)
        except Exception:
            # Best-effort metric: a failed computation is reported as None.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            algebraic_connectivity_CC1 = None

    # number of connected components with at least k nodes
    CC_sizes = np.array([len(cc_i) for cc_i in CC])

    CC_metrics = {}
    for k in CC_k_thresholds:
        CC_metrics[f"n_CC_{k}"] = np.sum(CC_sizes >= k)

    # k-core
    k_core_metrics = {}
    G_core_number = core_number(G)

    for k in k_core_ks:
        k_core_subgraph = k_core(G, k=k, core_number=G_core_number)
        k_core_metrics[f"core_{k}_n_nodes"] = len(k_core_subgraph.nodes)
        k_core_metrics[f"core_{k}_n_edges"] = len(k_core_subgraph.edges)
        k_core_metrics[f"core_{k}_density"] = density(k_core_subgraph)
        k_core_metrics[f"core_{k}_n_CC"] = len(
            list(connected_components(k_core_subgraph)))

    # k-truss
    k_truss_metrics = {}

    for k in k_truss_ks:
        k_truss_subgraph = k_truss(G, k=k)
        k_truss_metrics[f"truss_{k}_n_nodes"] = len(k_truss_subgraph.nodes)
        k_truss_metrics[f"truss_{k}_n_edges"] = len(k_truss_subgraph.edges)
        k_truss_metrics[f"truss_{k}_density"] = density(k_truss_subgraph)
        k_truss_metrics[f"truss_{k}_n_CC"] = len(
            list(connected_components(k_truss_subgraph)))

    metrics = {
        "n_edges_ud":
        len(G.edges()),
        "density_ud":
        density(G),
        # degree stats
        "degrees_mean":
        safe(np.mean, degrees),
        "degrees_var":
        safe(np.var, degrees),
        "degrees_hidx":
        safe(h_index, degrees),
        "degrees_gini":
        safe(gini, degrees + eps),
        "degrees_f0":
        safe(np.mean, (degrees == 0)),
        "degrees_corr":
        degrees_corr,
        "degrees_pk_ent":
        entropy(degrees_k_freq),
        "degrees_pk_gini":
        gini(degrees_k_freq),
        # fraction of connected node pairs with path of any length
        "f_connected_node_pairs_ud":
        f_connected_node_pairs,
        # clustering coefficients
        "global_clustering_ud":
        global_clustering,
        "local_clustering_mean_ud":
        local_clustering_mean,
        # centralization
        **cent_metrics,
        # modularity
        **modularity_metrics,
        # fraction of nodes in the largest CC
        "f_CC1_nodes":
        f_CC1_nodes,
        # algebraic connectivity of the largest CC
        "algebraic_connectivity_CC1":
        algebraic_connectivity_CC1,
        # connected components
        **CC_metrics,
        # k-core
        **k_core_metrics,
        # k-truss
        **k_truss_metrics
    }

    return metrics