import networkx as nx
import numpy as np
import matplotlib.pyplot as plt


def export_KG(G, filename):
    # Attach onion layer and PageRank scores as node attributes; onion_layers
    # requires an undirected graph, so it is computed on an undirected copy.
    cores = nx.onion_layers(G.to_undirected())
    ranks = nx.pagerank(G)
    nx.set_node_attributes(G, cores, "core")
    nx.set_node_attributes(G, ranks, "pagerank")

    # Compute a layout and collect per-node colors (onion layer), sizes
    # (PageRank), and labels ("index:'token'" when available).
    node_colors = []
    node_sizes = []
    node_attri = {}
    pos_attrs = {}
    pos_nodes = nx.kamada_kawai_layout(G)
    for ndx, coords in pos_nodes.items():
        pos_attrs[ndx] = (coords[0], coords[1])
        node = G.nodes[ndx]
        node_colors.append(node["core"])
        node_sizes.append(node["pagerank"])
        if "index" in node:
            node_attri[ndx] = "{}:'{}'".format(node["index"], node["token"])
        else:
            node_attri[ndx] = ""

    # Min-max normalize sizes and colors, then map the colors through a colormap.
    node_sizes = np.array(node_sizes)
    node_sizes = (node_sizes - node_sizes.min()) / (node_sizes.max() - node_sizes.min())
    node_sizes *= 150
    node_colors = np.array(node_colors)
    node_colors = (node_colors - node_colors.min()) / (node_colors.max() - node_colors.min())
    node_colors = [plt.cm.rainbow(x) for x in node_colors]

    # Edge widths come from the "weight" attribute (0.1 when missing),
    # min-max normalized as well.
    edges_width = []
    for _, _, data in G.edges(data=True):
        edges_width.append(data.get("weight", 0.1))
    edges_width = np.array(edges_width)
    edges_width = (edges_width - edges_width.min()) / (edges_width.max() - edges_width.min())

    # Draw the graph and its labels, then write the figure to disk.
    nx.draw(G, pos_nodes, arrowsize=3, width=edges_width,
            node_size=node_sizes, node_color=node_colors)
    nx.draw_networkx_labels(G, pos_attrs, labels=node_attri,
                            font_weight='bold', font_size=8)
    plt.savefig(filename, bbox_inches='tight')
    plt.clf()
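# Hypothetical usage sketch of export_KG (the graph, attribute values, and filename
# below are illustrative, not from the original source): the function expects a
# directed graph whose nodes may carry "index"/"token" attributes and whose edges
# may carry a "weight" attribute.
KG = nx.DiGraph()
KG.add_node(0, index=0, token="graph")
KG.add_node(1, index=1, token="theory")
KG.add_node(2, index=2, token="onion")
KG.add_node(3)                       # a node without "index" gets an empty label
KG.add_edge(0, 1, weight=2.0)
KG.add_edge(1, 2, weight=0.5)
KG.add_edge(2, 0, weight=1.0)
KG.add_edge(0, 3)                    # an unweighted edge falls back to width 0.1
export_KG(KG, "kg.png")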
def test_onion_layers(self):
    layers = nx.onion_layers(self.G)
    nodes_by_layer = [
        sorted(n for n in layers if layers[n] == val) for val in range(1, 7)
    ]
    assert nodes_equal(nodes_by_layer[0], [21])
    assert nodes_equal(nodes_by_layer[1], [17, 18, 19, 20])
    assert nodes_equal(nodes_by_layer[2], [10, 12, 13, 14, 15, 16])
    assert nodes_equal(nodes_by_layer[3], [9, 11])
    assert nodes_equal(nodes_by_layer[4], [1, 2, 4, 5, 6, 8])
    assert nodes_equal(nodes_by_layer[5], [3, 7])
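# Minimal sketch (my own example, not part of the test suite above): onion_layers
# takes an undirected graph without self-loops and returns a dict mapping each node
# to its onion layer, a finer-grained ordering than the plain core number.
import networkx as nx

G = nx.barbell_graph(4, 2)        # two K4 cliques joined by a 2-node path
print(nx.onion_layers(G))         # the path nodes end up in the shallowest layer
print(nx.core_number(G))          # compare with the coarser k-core numbers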
    # (tail of clust(): mean silhouette, cluster count, labels, per-sample silhouettes, fitted pipeline)
    return (silh.mean(), n, labels, silh, pipe)


core = nx.k_core(nx.Graph(G))

# Capitalize all occurrences of keywords for easy display on the output
# TODO, make matching case insensitive
pattern = re.compile(f"\\b({tz.pipe(keywords, tz.pluck(0), '|'.join)})\\b")
nice_pars = nice_pars.apply(lambda x: re.sub(pattern, lambda m: m.group().upper(), x))
# TODO, add [[]] around our keywords

core_nodes = list(core.nodes)
core_pars = np.array(nice_pars)[core_nodes]
core_vecs = vecs[core_nodes]
sil_u, n, lab, sil, p = clust(nx.adjacency_matrix(core), core_vecs, 8)
len(lab), len(sil)   # notebook sanity check: one label and one silhouette score per core node
layers = nx.onion_layers(core)
len(core.nodes)

# TODO, drop items of silhouette <= 0
df = pd.DataFrame(data=[{"Label": par, "Cluster ID": cid, "Silhouette Score": ss}
                        for par, cid, ss in zip(core_pars, lab, sil)])
df['Cluster ID'] = df.apply(lambda row: "T" + str(row['Cluster ID']), axis=1)

# Footer rows: one row linking each cluster to the root, then the root row itself.
for cluster_id in df['Cluster ID'].unique():
    df = df.append({"Label": cluster_id, "Cluster ID": NAME_OF_TEXT, "Silhouette Score": None},
                   ignore_index=True)
df = df.append({"Label": NAME_OF_TEXT, "Cluster ID": None, "Silhouette Score": None},
               ignore_index=True)
df.to_csv("out.csv", index=False)
def main(args):
    name_of_pdf_dir = os.path.basename(args.directory_with_pdfs)
    all_text = get_all_pdf_text_concatenated(args.directory_with_pdfs)

    # Split the text into paragraphs and join wrapped lines.
    pars = pd.Series(all_text.split('\n\n')).str.replace('\n', ' ')
    pars.str.len().apply(lambda x: np.log2(x + 1)).astype(int).value_counts()  # TODO, is this being stored anywhere?
    text_keywords = keywords(all_text, scores=True, lemmatize=True, words=args.num_keywords)

    # Keep paragraphs of a reasonable length and truncate the long ones.
    lower_bound_chars, upper_bound_chars = args.lower_bound_chars, args.upper_bound_chars
    word_count = int((lower_bound_chars + upper_bound_chars) / (2 * (avg_word_len + 1)))
    lens = pars.str.len()  # paragraph lengths
    nice_pars = pars[lens >= lower_bound_chars]  # paragraphs we want to use
    nice_pars = nice_pars.apply(
        partial(text_reduce_return, upper_bound_chars=upper_bound_chars, max_word_count=word_count)
    )

    # Embed the paragraphs and build a neighborhood graph: the longest edge of the
    # minimum spanning tree of the cosine-distance matrix is used as the radius.
    vecs = emb(tuple(nice_pars), args.tfhub_sentence_encoder_url).numpy()
    D = sk.metrics.pairwise_distances(vecs, metric='cosine')  # pairwise cosine distances
    R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()   # radius: longest MST edge
    G = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')
    core = nx.k_core(nx.Graph(G))

    # Capitalize all occurrences of keywords for easy display on the output
    # TODO, make matching case insensitive
    pattern = re.compile(f"\\b({tz.pipe(text_keywords, tz.pluck(0), '|'.join)})\\b")
    nice_pars = nice_pars.apply(
        lambda x: re.sub(pattern, lambda m: m.group().upper(), x))
    # TODO, add [[]] around our keywords for zettelkasten

    # Cluster the k-core paragraphs and keep their silhouette scores.
    core_nodes = list(core.nodes)
    core_pars = np.array(nice_pars)[core_nodes]
    core_vecs = vecs[core_nodes]
    sil_u, n, lab, sil, p = clust(nx.adjacency_matrix(core), core_vecs, 8)
    layers = nx.onion_layers(core)

    df = pd.DataFrame(
        data=[{"Label": par, "Cluster ID": cid, "Silhouette Score": ss}
              for par, cid, ss in zip(core_pars, lab, sil)])
    df = df[df["Silhouette Score"] > 0]
    df['Cluster ID'] = df.apply(lambda row: "T" + str(row['Cluster ID']), axis=1)

    # Add a footer to the dataframe so the CSV export is imported correctly by gsheet's
    # tree map plotter: one row linking each cluster to the root, then the root row itself.
    for cluster_id in df['Cluster ID'].unique():
        df = df.append({"Label": cluster_id, "Cluster ID": name_of_pdf_dir, "Silhouette Score": None},
                       ignore_index=True)
    df = df.append({"Label": name_of_pdf_dir, "Cluster ID": None, "Silhouette Score": None},
                   ignore_index=True)

    df.to_csv(args.output_filename, index=False)
    return {
        "text_keywords": text_keywords
    }
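# Standalone sketch of the graph-construction step in main() above (random vectors
# stand in for the sentence embeddings; everything else follows the same calls):
# cosine distances -> longest minimum-spanning-tree edge as radius -> radius
# neighbor graph -> k-core -> onion layers.
import numpy as np
import networkx as nx
import scipy.sparse.csgraph
from sklearn import metrics, neighbors

vecs = np.random.default_rng(0).normal(size=(50, 16))
D = metrics.pairwise_distances(vecs, metric='cosine')        # dense cosine-distance matrix
R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()      # radius large enough to keep the graph connected
A = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')
core = nx.k_core(nx.Graph(A))                                # densest neighborhood of the point cloud
print(len(core), "core nodes,", max(nx.onion_layers(core).values()), "onion layers")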
# read test data
df_test = pd.read_csv(root / 'test.csv', dtype={'authorID': np.int64})
n_test = df_test.shape[0]

# load the graph
G = nx.read_edgelist(root / 'collaboration_network.edgelist', delimiter=' ', nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

# compute structural features for each node
core_number = nx.core_number(G)              # dict: node -> core number
onion_number = nx.onion_layers(G)            # dict: node -> onion layer
avg_neighbor_degree = nx.average_neighbor_degree(G)
degree_centrality = nx.degree_centrality(G)
clustering = nx.clustering(G)

# create the training matrix: each node is represented as a vector of the structural
# features computed above (degree, core number, average neighbor degree, onion layer,
# degree centrality, clustering coefficient)
X_train_ = np.zeros((n_train, 6))
y_train_ = np.zeros(n_train)
for i, row in df_train.iterrows():
    node = row['authorID']
    X_train_[i, 0] = G.degree(node)
    X_train_[i, 1] = core_number[node]
    X_train_[i, 2] = avg_neighbor_degree[node]
    X_train_[i, 3] = onion_number[node]
    X_train_[i, 4] = degree_centrality[node]