Esempio n. 1
0
def clustering(categories):
    """
    Plot the average clustering coefficient per year for each category.

    For every category the graph is grown year by year (the graph returned
    for one year is passed on as the starting graph of the next year) and
    the average clustering coefficient of that graph is recorded.

    Args:
        categories: Iterable of category names. Each name is used to format
            GRAPH_FILES and to look up its line color in COLORS.

    Side effects:
        Saves the figure as "edges_clustering.png" at the path produced by
        PLOT_DIR.format(...) and prints that path.
    """
    _, ax = plt.subplots()
    for cat in categories:
        # Start from an empty graph; it is extended year after year.
        graph = nx.Graph()
        cl_coeff = []
        for year in YEARS:
            # Add edges of current year to graph. With count_isolated=True
            # build_graph returns a (graph, isolated) pair; the second
            # element is not needed for the clustering coefficient.
            graph, _ = build_graph(GRAPH_FILES.format(cat, year),
                                   G=graph,
                                   count_isolated=True)
            # Average clustering coefficient:
            # For node n, how much are n's neighbors connected
            cl_coeff.append(nx.algorithms.cluster.average_clustering(graph))

        # Plot results
        ax.plot(YEARS, cl_coeff, label=cat, color=COLORS[cat])

    ax.legend(loc="upper left")
    ax.set_xlabel("Year")
    ax.set_ylabel("Average clustering coefficient")
    # PLOT_DIR is a format template (see the print below in the original
    # code), so the file name must be inserted with .format(); plain string
    # concatenation would save the plot somewhere else than reported.
    out_path = PLOT_DIR.format("edges_clustering.png")
    plt.savefig(out_path)
    print("Plot saved to '{}'".format(out_path))
Esempio n. 2
0
def sample_unconnected(cat, year, num_samples):
    """
    Sample keyword pairs that are not currently connected and don't get a
    connection in the current year either.

    The graph of the following year (``year + 1``) is built and used as the
    reference for which connections exist; a pair is only accepted when that
    graph has no edge between the two keywords.

    Args:
        cat: Category name used to format GRAPH_FILES.
        year: Current year; must be smaller than YEAR_END.
        num_samples: Number of distinct unordered pairs to draw.

    Returns:
        A numpy array built from the list of sampled (n1, n2) tuples.

    Raises:
        AssertionError: If ``year`` is not smaller than YEAR_END.
        ValueError: If the graph has too few nodes to form ``num_samples``
            distinct pairs at all.
    """
    assert year < YEAR_END
    # build graph for following year for easy checking which connections
    # exist or are established in the current year
    graph = build_graph(GRAPH_FILES.format(cat, year + 1))
    nodes = list(graph.nodes)

    # Without this guard the rejection loop below could never terminate
    # (or random.choice would fail on an empty node list).
    # NOTE(review): this is only a necessary condition; a very dense graph
    # can still have fewer unconnected pairs than requested.
    max_pairs = len(nodes) * (len(nodes) - 1) // 2
    if num_samples > max_pairs:
        raise ValueError(
            "cannot sample {} distinct pairs from {} nodes".format(
                num_samples, len(nodes)))

    samples = set()
    while len(samples) < num_samples:
        n1, n2 = random.choice(nodes), random.choice(nodes)
        # Skip self-pairs and pairs already sampled in reverse order
        # (same-order duplicates are absorbed by the set itself).
        if n1 == n2 or (n2, n1) in samples:
            continue
        # Skip pairs that are connected in the reference graph.
        if graph.has_edge(n1, n2):
            continue
        samples.add((n1, n2))
    return np.array(list(samples))
def graph_size(categories, verbose=False):
    """
    Plot the number of nodes per year for a list of keyword categories.

    For each category the graph is extended year by year (the graph of the
    previous year is handed to build_graph again) and its node count is
    recorded after every year.

    Args:
        categories: Iterable of category names; each is used to format
            GRAPH_FILES and to look up its line color in COLORS.
        verbose: If true, print a header per category and one
            "year<TAB>node count" line per year.

    Side effects:
        Saves the figure to PLOT_DIR.format("graph_size.png") and prints
        that path.
    """
    _, axes = plt.subplots()
    for category in categories:
        # Every category starts over with an empty graph.
        keyword_graph = nx.Graph()
        node_counts = []
        if verbose:
            print("-----------" + category + "-----------")
        for year in YEARS:
            # Extend the graph with the edges of the current year.
            keyword_graph = build_graph(GRAPH_FILES.format(category, year),
                                        keyword_graph)
            n_nodes = len(keyword_graph)
            node_counts.append(n_nodes)
            if verbose:
                print("{}\t{}".format(year, n_nodes))
        # One line per category.
        axes.plot(YEARS, node_counts, label=category, color=COLORS[category])

    axes.legend(loc="upper left")
    axes.set_xlabel("Year")
    axes.set_ylabel("Nodes in the keyword graph")

    target = PLOT_DIR.format("graph_size.png")
    plt.savefig(target)
    print("Plot saved to " + target)
Esempio n. 4
0
def density(categories):
    """
    Plot the graph density per year for each category.

    Density is defined as (2 * #edges) / (#nodes * (#nodes - 1)): 0 for a
    graph without edges, 1 for a complete graph.

    Args:
        categories: Iterable of category names; each is used to format
            GRAPH_FILES and to look up its line color in COLORS.

    Side effects:
        Saves the figure as "density.png" at the path produced by
        PLOT_DIR.format(...) and prints that path.
    """
    _, ax = plt.subplots()
    for cat in categories:
        # Start from an empty graph; it is extended year after year.
        graph = nx.Graph()
        # Named differently from the function to avoid shadowing it.
        densities = []
        for year in YEARS:
            # Add edges of current year to graph
            graph = build_graph(GRAPH_FILES.format(cat, year), G=graph)
            # Density: (2 * #edges) / (#nodes * (#nodes -1))
            # -> Completion of a graph. Density 0 for no edges, 1 for complete graphs
            densities.append(nx.density(graph))

        # Plot results
        ax.plot(YEARS, densities, label=cat, color=COLORS[cat])

    ax.legend(loc="upper left")
    ax.set_xlabel("Year")
    ax.set_ylabel("Density")
    # PLOT_DIR is a format template (the print below already used it that
    # way), so the file name is inserted with .format() instead of being
    # appended; otherwise the saved path differs from the reported one.
    out_path = PLOT_DIR.format("density.png")
    plt.savefig(out_path)
    print("Plot saved to '{}'".format(out_path))
def _max_edges(n_nodes):
    """
    Return the maximum number of edges a simple undirected graph can have,
    depending on its number of nodes: n * (n - 1) / 2.

    The result comes from a true division, so it is a float for integer
    input.
    """
    # Every node can pair with every other node; each pair is counted twice.
    ordered_pairs = n_nodes * (n_nodes - 1)
    return ordered_pairs / 2


# Script entry point: for every category (plus the combined "all_keys"
# graph) plot the sampled average shortest path length per year.
if __name__ == "__main__":
    # NOTE(review): logfile is not used or closed in the visible part of
    # this script - presumably helpers write to it as a module-level name
    # and it is closed further down; confirm.
    logfile = open("shortest_path.log", mode="w")
    # Number of samples is taken from the command line arguments.
    num_samples = parse_args().samples
    fig, ax = plt.subplots()
    for cat in CATEGORIES + ["all_keys"]:
        av_sample_path = list()  # to save average shortest path between nodes
        used_years = list()  # in some years there might be too little data
        for year in YEARS:
            # A fresh graph is built for every year (no graph is passed in).
            graph = build_graph(GRAPH_FILES.format(cat, year), add_nodes=False)
            # Years whose graph has no nodes are skipped entirely.
            if len(graph) > 0:
                # Build largest connected component:
                # (there are usually some isolated nodes or small clusters)
                c_sub = graph.subgraph(_largest_component(graph))
                current_av_path = sample_shortest_paths(
                    c_sub, num_samples=num_samples)
                # A result of -1 is not plotted - presumably the helper's
                # marker for "no usable value"; verify against
                # sample_shortest_paths.
                if current_av_path != -1:
                    av_sample_path.append(current_av_path)
                    used_years.append(year)

        # One line per category, restricted to the years that gave a value.
        ax.plot(used_years, av_sample_path, label=cat, color=COLORS[cat])
        print("category {} done".format(cat))

    ax.legend(loc="upper left")
    ax.set_xlabel("Year")
Esempio n. 6
0
def new_cnx_convergence(kw_pairs, cat, year, mode):
    """
    Trace the graph distance of keyword pairs over the years and save the
    result as a plotly Sankey diagram.

    For every year in YEARS the graph of category ``cat`` is built and each
    keyword pair is assigned to one distance class of that year:
    "not in graph" (at least one keyword is missing), "unconnected" (no
    path), the shortest path length "0" ... str(THRESHOLD - 1), or
    ">=THRESHOLD". Each (year, class) combination is one Sankey node; a link
    counts how many pairs moved from one node to the node of the next year.

    Args:
        kw_pairs: Sequence of (keyword, keyword) pairs to observe.
        cat: Category name used to format GRAPH_FILES and the file name.
        year: Year used in the plot title and the output file name.
        mode: Tag that is only used in the output file name.

    Side effects:
        Writes "pairs_before_cnx_<cat>_<year>_<mode>.html" to the path
        produced by PLOT_DIR.format(...) and prints that path.
    """
    # Distance classes of a single year, in label order.
    distances = ["not in graph", "unconnected"
                 ] + [str(i)
                      for i in range(THRESHOLD)] + [">=" + str(THRESHOLD)]
    # One label per (year, class); the year is added for readability. The
    # label of class d in the year at position y is labels[y * len(distances) + d].
    labels = [d + " : " + str(y) for y in YEARS for d in distances]

    # For each keyword pair: the label index of its class in every year.
    pair_dists = [[] for _ in range(len(kw_pairs))]

    for year_idx, y_chunk in enumerate(YEARS):
        graph = build_graph(GRAPH_FILES.format(cat, y_chunk))
        # Offset of this year's block inside 'labels'. It must come from the
        # year index; the pair index uses a separate loop variable so that
        # it cannot overwrite it.
        year_offset = year_idx * len(distances)
        for pair_idx, (n1, n2) in enumerate(kw_pairs):
            if n1 not in graph or n2 not in graph:
                d = 0  # "not in graph"
            else:
                try:
                    path = nx.shortest_path_length(graph, n1, n2)
                except nx.NetworkXNoPath:
                    d = 1  # "unconnected"
                else:
                    # First two indices are taken by 'not in graph' and
                    # 'unconnected'; everything from THRESHOLD upwards
                    # shares the last class.
                    d = 2 + min(path, THRESHOLD)
            # Save index that matches the right label in 'labels'
            pair_dists[pair_idx].append(year_offset + d)

    # Convert the results for plotly sankey diagrams: count how many pairs
    # go from one node (source) to the node of the following year (target).
    transitions = {}
    for pair_dist in pair_dists:
        for source_target in zip(pair_dist, pair_dist[1:]):
            transitions[source_target] = transitions.get(source_target, 0) + 1

    # plotly expects three parallel lists: source, target and value.
    sources = [source for source, _ in transitions]
    targets = [target for _, target in transitions]
    values = list(transitions.values())

    # Save results
    fig = go.Figure(data=[
        go.Sankey(
            node=dict(pad=15,
                      thickness=20,
                      line=dict(color="black", width=0.5),
                      label=labels,
                      color="blue"),
            # source/target indices correspond to positions in 'labels'
            link=dict(source=sources, target=targets, value=values))
    ])

    fig.update_layout(
        title_text=
        "Distance of {} sampled keyword pairs that get connected in {}".format(
            len(kw_pairs), year),
        font_size=10)

    out_path = PLOT_DIR.format(
        "pairs_before_cnx_{}_{}_{}.html".format(cat, year, mode))
    plotly.io.write_html(fig, out_path, include_plotlyjs="cdn")
    print("Saved as " + out_path)