def clustering(categories):
    """
    Plot the average clustering coefficient per year for each category.

    For every category one graph object is reused across all years: it is
    handed to build_graph as ``G`` together with that year's edge file, and
    the average clustering coefficient of the returned graph is recorded.

    Args:
        categories: iterable of category names. Each name is inserted into
            GRAPH_FILES.format(cat, year) and used as a key of COLORS.

    Side effects:
        Saves the figure to PLOT_DIR.format("edges_clustering.png") and
        prints that path.
    """
    fig, ax = plt.subplots()
    for cat in categories:
        # Start from an empty graph; the years are added one after another.
        graph = nx.Graph()
        cl_coeff = []
        for year in YEARS:
            # With count_isolated=True build_graph returns a pair; the second
            # element (isolated nodes) is not needed for this plot.
            graph, _ = build_graph(GRAPH_FILES.format(cat, year),
                                   G=graph,
                                   count_isolated=True)
            # Average clustering coefficient:
            # for node n, how strongly are n's neighbors connected
            cl_coeff.append(
                nx.algorithms.cluster.average_clustering(graph))
        # Plot results
        ax.plot(YEARS, cl_coeff, label=cat, color=COLORS[cat])
    ax.legend(loc="upper left")
    ax.set_xlabel("Year")
    ax.set_ylabel("Average clustering coefficient")
    # PLOT_DIR is a format template (see the message below), so the file
    # name has to be inserted with .format() rather than appended with '+'.
    out_path = PLOT_DIR.format("edges_clustering.png")
    plt.savefig(out_path)
    print("Plot saved to '{}'".format(out_path))
def sample_unconnected(cat, year, num_samples):
    """
    Samples a number of keyword pairs that are not currently connected and
    don't get a connection in the current year either.

    The graph file of ``year + 1`` is loaded and every candidate pair is
    checked against it, so a pair is only accepted if that graph has no
    edge between the two keywords.

    Args:
        cat: category name, inserted into GRAPH_FILES.
        year: current year; must be smaller than YEAR_END because the graph
            of the following year is loaded.
        num_samples: number of distinct unordered pairs to draw.

    Returns:
        numpy array built from the list of sampled (n1, n2) tuples.

    Raises:
        AssertionError: if ``year`` is not smaller than YEAR_END.
        ValueError: if the graph does not contain ``num_samples``
            unconnected pairs (the sampling loop could never finish).
    """
    assert year < YEAR_END
    # build graph for following year for easy checking which connections
    # exist or are established in the current year
    graph = build_graph(GRAPH_FILES.format(cat, year + 1))
    nodes = list(graph.nodes)

    # Upper bound on the pairs that can be drawn at all.
    # NOTE(review): assumes build_graph returns a simple undirected graph
    # (nx.Graph) -- confirm. Self-loops are excluded from the edge count
    # because n1 == n2 is never sampled.
    n_nodes = len(nodes)
    self_loops = sum(1 for u, v in graph.edges if u == v)
    connected_pairs = graph.number_of_edges() - self_loops
    free_pairs = n_nodes * (n_nodes - 1) // 2 - connected_pairs
    if num_samples > free_pairs:
        raise ValueError(
            "Cannot sample {} unconnected pairs: only {} available".format(
                num_samples, free_pairs))

    samples = set()
    while len(samples) < num_samples:
        n1, n2 = random.choice(nodes), random.choice(nodes)
        # Reject self-pairs, pairs already sampled in reverse order and
        # pairs that are connected in the following year's graph.
        if n1 == n2 or (n2, n1) in samples or graph.has_edge(n1, n2):
            continue
        samples.add((n1, n2))
    return np.array(list(samples))
def graph_size(categories, verbose=False):
    """
    Plot the number of graph nodes per year, one line per keyword category.

    Args:
        categories: iterable of category names; each is inserted into
            GRAPH_FILES and used as a key of COLORS.
        verbose: if True, print a header per category and one
            "year<TAB>node count" line per year.

    Side effects:
        Saves the figure to PLOT_DIR.format("graph_size.png") and prints
        that path.
    """
    fig, ax = plt.subplots()
    for cat in categories:
        if verbose:
            print("-----------" + cat + "-----------")
        # One graph per category; it is passed back into build_graph for
        # every year so the years are processed on the same object.
        graph = nx.Graph()
        node_counts = []
        for year in YEARS:
            graph = build_graph(GRAPH_FILES.format(cat, year), graph)
            size = len(graph)
            node_counts.append(size)
            if verbose:
                print("{}\t{}".format(year, size))
        # One line per category
        ax.plot(YEARS, node_counts, label=cat, color=COLORS[cat])
    ax.set_xlabel("Year")
    ax.set_ylabel("Nodes in the keyword graph")
    ax.legend(loc="upper left")
    target = PLOT_DIR.format("graph_size.png")
    plt.savefig(target)
    print("Plot saved to " + target)
def density(categories):
    """
    Plots the density of a graph per year, defined as:
    (2 * #edges) / (#nodes * (#nodes - 1))

    For every category one graph object is reused across all years: it is
    handed to build_graph as ``G`` together with that year's edge file, and
    the density of the returned graph is recorded.

    Args:
        categories: iterable of category names. Each name is inserted into
            GRAPH_FILES.format(cat, year) and used as a key of COLORS.

    Side effects:
        Saves the figure to PLOT_DIR.format("density.png") and prints that
        path.
    """
    fig, ax = plt.subplots()
    for cat in categories:
        # Start from an empty graph; the years are added one after another.
        graph = nx.Graph()
        # Named 'densities' so the list does not shadow this function.
        densities = []
        for year in YEARS:
            # Add edges of current year to graph
            graph = build_graph(GRAPH_FILES.format(cat, year), G=graph)
            # Density: (2 * #edges) / (#nodes * (#nodes - 1))
            # -> Completion of a graph. Density 0 for no edges, 1 for
            #    complete graphs.
            # NOTE(review): self-loops would push nx.density above 1;
            # presumably build_graph strips them -- confirm.
            densities.append(nx.density(graph))
        # Plot results
        ax.plot(YEARS, densities, label=cat, color=COLORS[cat])
    ax.legend(loc="upper left")
    ax.set_xlabel("Year")
    ax.set_ylabel("Density")
    # PLOT_DIR is a format template (see the message below), so the file
    # name has to be inserted with .format() rather than appended with '+'.
    out_path = PLOT_DIR.format("density.png")
    plt.savefig(out_path)
    print("Plot saved to '{}'".format(out_path))
def _max_edges(n_nodes):
    """
    Returns the maximum number of edges a graph can have depending on its
    number of nodes.

    The result of the true division is a float (e.g. 6.0 for 4 nodes).
    """
    return (n_nodes * (n_nodes - 1)) / 2


if __name__ == "__main__":
    # Plot, per category, the sampled average shortest path length inside
    # the largest connected component of each year's graph.
    #
    # The log file is managed by a context manager so it is closed once all
    # categories are processed. At module level the with-statement still
    # binds the global name 'logfile'.
    # NOTE(review): 'logfile' is not referenced in this block; presumably a
    # helper writes to it -- confirm.
    with open("shortest_path.log", mode="w") as logfile:
        num_samples = parse_args().samples
        fig, ax = plt.subplots()
        for cat in CATEGORIES + ["all_keys"]:
            av_sample_path = []  # average shortest path between nodes
            used_years = []  # in some years there might be too little data
            for year in YEARS:
                graph = build_graph(GRAPH_FILES.format(cat, year),
                                    add_nodes=False)
                if len(graph) > 0:
                    # Build largest connected component:
                    # (there are usually some isolated nodes or small
                    # clusters)
                    c_sub = graph.subgraph(_largest_component(graph))
                    current_av_path = sample_shortest_paths(
                        c_sub, num_samples=num_samples)
                    # -1 is treated as "no usable value for this year";
                    # such years are left out of the plot.
                    if current_av_path != -1:
                        av_sample_path.append(current_av_path)
                        used_years.append(year)
            ax.plot(used_years, av_sample_path, label=cat, color=COLORS[cat])
            print("category {} done".format(cat))
    ax.legend(loc="upper left")
    ax.set_xlabel("Year")
def _pair_distance_bucket(graph, n1, n2):
    """
    Return the position of a keyword pair inside one year's label group.

    0 -> at least one keyword is not in the graph
    1 -> both keywords are in the graph but no path connects them
    2 + d -> shortest path length d, for d < THRESHOLD
    2 + THRESHOLD -> shortest path length >= THRESHOLD
    """
    if n1 not in graph or n2 not in graph:
        return 0
    try:
        path = nx.shortest_path_length(graph, n1, n2)
    except nx.NetworkXNoPath:
        return 1
    # first two indices are taken by 'not in graph' and 'unconnected'
    return 2 + min(path, THRESHOLD)


def new_cnx_convergence(kw_pairs, cat, year, mode):
    """
    Until the year of connection the shortest path for each given keyword
    pair is observed.

    For every year in YEARS the graph of that year is built and each pair
    is assigned to a distance bucket. The change of bucket between
    consecutive years is counted over all pairs and written as a plotly
    Sankey diagram.

    Args:
        kw_pairs: sequence of (n1, n2) keyword pairs.
        cat: category name, inserted into GRAPH_FILES and the file name.
        year: year in which the pairs get connected; only used for the
            plot title and the file name.
        mode: only used as part of the output file name.

    Side effects:
        Writes PLOT_DIR.format("pairs_before_cnx_<cat>_<year>_<mode>.html")
        and prints that path.
    """
    # construct list of labels
    distances = ["not in graph", "unconnected"
                 ] + [str(i) for i in range(THRESHOLD)
                      ] + [">=" + str(THRESHOLD)]
    # add years for readability; the label of bucket d in the year at
    # position k of YEARS therefore sits at index k * len(distances) + d
    labels = [d + " : " + str(y) for y in YEARS for d in distances]

    # first the label index of each keyword pair is saved for each year
    pair_dists = [[] for _ in kw_pairs]
    for year_idx, y_chunk in enumerate(YEARS):
        graph = build_graph(GRAPH_FILES.format(cat, y_chunk))
        # The year offset must come from the YEARS loop, not from the pair
        # loop; both loops therefore use distinct index names.
        year_offset = year_idx * len(distances)
        for pair_idx, (n1, n2) in enumerate(kw_pairs):
            bucket = _pair_distance_bucket(graph, n1, n2)
            # Save index that matches the right label in 'labels'
            pair_dists[pair_idx].append(year_offset + bucket)

    # convert the results for plotly sankey diagrams:
    # count how many pairs move from one label (source) to another label
    # (target) between two consecutive years
    link_counts = {}
    for pair_dist in pair_dists:
        for transition in zip(pair_dist, pair_dist[1:]):
            link_counts[transition] = link_counts.get(transition, 0) + 1
    # -> data is always saved as a triple (source, target, value); all
    # three lists are derived from the same dict so they stay aligned
    dist_changes = {
        "source": [source for source, _ in link_counts],
        "target": [target for _, target in link_counts],
        "value": list(link_counts.values()),
    }

    # Save results
    fig = go.Figure(data=[
        go.Sankey(
            node=dict(pad=15,
                      thickness=20,
                      line=dict(color="black", width=0.5),
                      label=labels,
                      color="blue"),
            link=dict(
                source=dist_changes["source"],  # indices refer to 'labels'
                target=dist_changes["target"],
                value=dist_changes["value"]))
    ])
    fig.update_layout(
        title_text=
        "Distance of {} sampled keyword pairs that get connected in {}".format(
            len(kw_pairs), year),
        font_size=10)
    out_path = PLOT_DIR.format("pairs_before_cnx_{}_{}_{}.html".format(
        cat, year, mode))
    plotly.io.write_html(fig, out_path, include_plotlyjs="cdn")
    print("Saved as " + out_path)