def test_from_numpy_array_parallel_edges(self):
    """Tests that the :func:`networkx.from_numpy_array` function
    interprets integer weights as the number of parallel edges when
    creating a multigraph.

    """
    A = np.array([[1, 1], [1, 2]])
    # First, with a simple graph, each integer entry in the adjacency
    # matrix is interpreted as the weight of a single edge in the graph.
    expected = nx.DiGraph()
    edges = [(0, 0), (0, 1), (1, 0)]
    expected.add_weighted_edges_from([(u, v, 1) for (u, v) in edges])
    expected.add_edge(1, 1, weight=2)
    actual = nx.from_numpy_array(A, parallel_edges=True,
                                 create_using=nx.DiGraph())
    assert_graphs_equal(actual, expected)
    actual = nx.from_numpy_array(A, parallel_edges=False,
                                 create_using=nx.DiGraph())
    assert_graphs_equal(actual, expected)
    # Now each integer entry in the adjacency matrix is interpreted as the
    # number of parallel edges in the graph if the appropriate keyword
    # argument is specified.
    edges = [(0, 0), (0, 1), (1, 0), (1, 1), (1, 1)]
    expected = nx.MultiDiGraph()
    expected.add_weighted_edges_from([(u, v, 1) for (u, v) in edges])
    actual = nx.from_numpy_array(A, parallel_edges=True,
                                 create_using=nx.MultiDiGraph())
    assert_graphs_equal(actual, expected)
    expected = nx.MultiDiGraph()
    expected.add_edges_from(set(edges), weight=1)
    # The sole self-loop (edge 0) on vertex 1 should have weight 2.
    expected[1][1][0]['weight'] = 2
    actual = nx.from_numpy_array(A, parallel_edges=False,
                                 create_using=nx.MultiDiGraph())
    assert_graphs_equal(actual, expected)
def test_from_numpy_matrix_type(self):
    A = np.matrix([[1]])
    G = nx.from_numpy_matrix(A)
    assert_equal(type(G[0][0]['weight']), int)

    A = np.matrix([[1]]).astype(np.float)
    G = nx.from_numpy_matrix(A)
    assert_equal(type(G[0][0]['weight']), float)

    A = np.matrix([[1]]).astype(np.str)
    G = nx.from_numpy_matrix(A)
    assert_equal(type(G[0][0]['weight']), str)

    A = np.matrix([[1]]).astype(np.bool)
    G = nx.from_numpy_matrix(A)
    assert_equal(type(G[0][0]['weight']), bool)

    A = np.matrix([[1]]).astype(np.complex)
    G = nx.from_numpy_matrix(A)
    assert_equal(type(G[0][0]['weight']), complex)

    A = np.matrix([[1]]).astype(np.object)
    assert_raises(TypeError, nx.from_numpy_matrix, A)

    G = nx.cycle_graph(3)
    A = nx.adj_matrix(G).todense()
    H = nx.from_numpy_matrix(A)
    assert_true(all(type(m) == int and type(n) == int for m, n in H.edges()))
    H = nx.from_numpy_array(A)
    assert_true(all(type(m) == int and type(n) == int for m, n in H.edges()))
def identity_conversion(self, G, A, create_using):
    assert A.sum() > 0
    GG = nx.from_numpy_array(A, create_using=create_using)
    self.assert_equal(G, GG)
    GW = nx.to_networkx_graph(A, create_using=create_using)
    self.assert_equal(G, GW)
    GI = create_using.__class__(A)
    self.assert_equal(G, GI)
def test_from_numpy_array_dtype(self):
    dt = [('weight', float), ('cost', int)]
    A = np.array([[(1.0, 2)]], dtype=dt)
    G = nx.from_numpy_array(A)
    assert_equal(type(G[0][0]['weight']), float)
    assert_equal(type(G[0][0]['cost']), int)
    assert_equal(G[0][0]['cost'], 2)
    assert_equal(G[0][0]['weight'], 1.0)
def test_symmetric(self):
    """Tests that a symmetric array has edges added only once to an
    undirected multigraph when using :func:`networkx.from_numpy_array`.

    """
    A = np.array([[0, 1], [1, 0]])
    G = nx.from_numpy_array(A, create_using=nx.MultiGraph())
    expected = nx.MultiGraph()
    expected.add_edge(0, 1, weight=1)
    assert_graphs_equal(G, expected)
def test_from_numpy_array_type(self):
    A = np.array([[1]])
    G = nx.from_numpy_array(A)
    assert_equal(type(G[0][0]['weight']), int)

    A = np.array([[1]]).astype(np.float)
    G = nx.from_numpy_array(A)
    assert_equal(type(G[0][0]['weight']), float)

    A = np.array([[1]]).astype(np.str)
    G = nx.from_numpy_array(A)
    assert_equal(type(G[0][0]['weight']), str)

    A = np.array([[1]]).astype(np.bool)
    G = nx.from_numpy_array(A)
    assert_equal(type(G[0][0]['weight']), bool)

    A = np.array([[1]]).astype(np.complex)
    G = nx.from_numpy_array(A)
    assert_equal(type(G[0][0]['weight']), complex)

    A = np.array([[1]]).astype(np.object)
    assert_raises(TypeError, nx.from_numpy_array, A)
def get_graph_from_prob_matrix(p_mat: np.ndarray, thresh: float = None) -> nx.Graph:
    """
    Generates a NetworkX graph from a matrix of edge probabilities.
    :param p_mat: matrix of edge probabilities
    :param thresh: if given, a fixed threshold is used instead of random sampling
    :return: the sampled graph
    """
    n = p_mat.shape[0]  # number of rows / nodes

    if thresh is not None:
        rand_mat = np.ones((n, n)) * thresh
    else:
        rand_mat = np.random.rand(n, n)

    sampled_mat = rand_mat <= p_mat
    # sampled_mat = sampled_mat * sampled_mat.T  # to make sure it is symmetric
    sampled_mat = sampled_mat.astype(int)
    np.fill_diagonal(sampled_mat, 0)  # zero out the diagonal
    g = nx.from_numpy_array(sampled_mat, create_using=nx.Graph())
    return g
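# --- Usage sketch (not part of the original code) ---
# A minimal, assumed example of calling get_graph_from_prob_matrix: with no
# `thresh`, each entry p_mat[i, j] is the probability that edge (i, j) survives
# the Bernoulli sampling step. The probability matrix below is made up for
# illustration.
import numpy as np

p_mat = np.array([[0.0, 0.9, 0.1],
                  [0.9, 0.0, 0.5],
                  [0.1, 0.5, 0.0]])
g = get_graph_from_prob_matrix(p_mat)
print(g.number_of_nodes(), g.number_of_edges())  # 3 nodes; edge count varies per draw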
def calc_fluidC(adj_matrix, nr_communities_range=(5, 40)):
    nx_G = nx.from_numpy_array(adj_matrix)
    for nr in range(nr_communities_range[0], nr_communities_range[1] + 1):
        # search for optimal communities
        communities = nx.algorithms.community.asyn_fluid.asyn_fluidc(nx_G, nr, seed=0)
        number_communities = max(communities, key=lambda x: communities[x]) + 1
        community_list = []
        for i in range(number_communities):
            grp_list = []
            for grp in communities:
                if communities[grp] == i:
                    grp_list.append(grp)
            else:
                if grp_list:
                    community_list.append(grp_list)
        return community_list
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []
    sentences = read_article(file_name)
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentence)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
    print("Summarize Text: \n", ". ".join(summarize_text))


# generate_summary("msft.txt", 2)
def fuzzy_geom_graph(size, radius, deg, ret_coords=True, force_connected=True):
    for _ in range(10000):
        # sample coordinates
        x, y = coords = np.random.rand(2, size) / radius

        # build the adjacency matrix
        adj = np.zeros((size, size)).astype(bool)
        for i, (xi, yi, di) in enumerate(zip(x, y, deg)):
            # sample neighbors based on Euclidean distance
            p = np.exp(-np.sqrt((xi - x) ** 2 + (yi - y) ** 2))
            other_nodes = [k for k in range(size) if k != i]
            p = p[other_nodes]
            p /= p.sum()
            neighbors = np.random.choice(other_nodes, size=di, replace=False, p=p)
            adj[i, neighbors] = True
        adj |= adj.T

        G = nx.from_numpy_array(adj)
        if not force_connected or nx.is_connected(G):
            return G

    print('failed graph generation fuzzy_geom_graph')
def get_bridge_bonds_matrix(self):
    """
    Returning a boolean matrix of size (n_defined_atoms, n_defined_atoms)
    representing whether the bonds of the molecular graph are bridges.
    """
    # Converting the molecular graph to a NetworkX object
    nx_mol_graph = nx.from_numpy_array(self.get_adjacency_matrix())

    # Initialization of the output matrix of bridge bonds
    output_bridges_matrix = np.full((self.get_n_atoms(), self.get_n_atoms()), False)

    # Extracting the list of bridges in the molecular simple graph
    bridges_list = list(nx.bridges(nx_mol_graph))

    for bridge in bridges_list:
        output_bridges_matrix[bridge[0], bridge[1]] = True
        output_bridges_matrix[bridge[1], bridge[0]] = True

    return output_bridges_matrix
def draw_graph(adj=None, G=None, marginals=None, draw_edge_color=False,
               title=None, node_size=300, node_labels=None):
    node_color = marginals
    if G is None:
        assert adj is not None, "you have to provide either the adjacency matrix or the graph"
        G = nx.from_numpy_array(adj)
    edge_color = G.number_of_edges() * [1]
    n = G.number_of_nodes()
    if adj is not None:
        edges = adj[np.triu_indices(n, 1)]  # strict upper triangle inds
        if draw_edge_color:
            edge_color = edges[edges != 0].ravel().astype(float).tolist()
    if node_labels is not None:
        node_dict = dict([(i, str(node_labels[i])) for i in range(n)])
    else:
        node_dict = None
    nx.draw(G, node_color=marginals, edge_color=edge_color, label=title,
            node_size=node_size, labels=node_dict)
    plt.show()
def generate_summary(file_name, top_n):
    stop_words = stopwords.words('english')
    summarize_text = []
    sentences = ra.read_article(file_name)
    sentence_similarity_martix = sm.build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    if top_n > len(ranked_sentence):
        print("Entered number of sentences is greater than the actual summary")
        return
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
    print("Summarize Text: \n", ". ".join(summarize_text))
def generate_summary(self):
    stop_words = stopwords.words('english')
    summarize_text = []
    sentences = self.read_article()
    sentence_similarity_martix = self.build_similarity_matrix(
        sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(
        sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    for i in range(self.top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
    summary = "".join(summarize_text)
    return summary
def generate_summary(file_name):
    """
    The main function to generate a summary by finding the similarity among
    sentences and ranking them.
    :param file_name: filename and path
    :return: Summarized text
    """
    summarize_text = []
    sentences = read_article(file_name)
    if len(sentences) > 4:
        num_sentences = int(len(sentences) / 3)
    else:
        num_sentences = len(sentences)
    sentence_similarity_martix = build_similarity_matrix(sentences)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    for i in range(num_sentences):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
    final_text = '. '.join(summarize_text)
    return final_text
def generate_summary(text, n=5):
    summary = []

    # Sentence-tokenize the text
    sentences = process_input(text)

    # Generate the similarity matrix
    sim_mx = make_similiarity_matrix(sentences)

    # Rank sentences in the matrix
    sim_graph = nx.from_numpy_array(sim_mx)
    scores = nx.pagerank(sim_graph)

    # Sort by rank and pick the top-ranked sentences
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    for i in range(n):
        summary += [ranked[i][1]]

    return '. '.join(summary)
def extract_links(n, connections, link_cap):
    A = np.zeros((n, n))

    for a, c in zip(A, connections):
        a[c] = 1

    G = nx.from_numpy_array(A, create_using=nx.DiGraph())
    edges = list(G.edges)
    capacities_links = []
    # The edges 0-2 and 2-0 can both exist. They are duplicated (up and down)
    # and must have the same capacity.
    for e in edges:
        if str(e[0]) + ':' + str(e[1]) in link_cap:
            capacity = link_cap[str(e[0]) + ':' + str(e[1])]
            capacities_links.append(capacity)
        elif str(e[1]) + ':' + str(e[0]) in link_cap:
            capacity = link_cap[str(e[1]) + ':' + str(e[0])]
            capacities_links.append(capacity)
        else:
            print("ERROR IN THE DATASET!")
            exit()
    return edges, capacities_links
def erdos_renyi_graph(n, k_avg):
    r"""
    Generates an Erdos-Renyi random graph by randomly connecting pairs of nodes,
    $i$ and $j$, with a probability $p$ corresponding to the specified average
    degree, $\langle k \rangle$.

    Parameters
    ----------
    n (int): number of nodes
    k_avg (float): desired average degree of the resulting network

    Returns
    -------
    g (nx.Graph): a networkx graph
    """
    a = np.triu(np.random.rand(n, n) < k_avg / (n - 1), 1)
    g = nx.from_numpy_array(np.array(a + a.T, dtype=int))

    return g
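# --- Usage sketch (assumed, not from the original source) ---
# Draw one sample from the generator above and check that the empirical mean
# degree is close to the requested k_avg; the numbers are illustrative only.
g = erdos_renyi_graph(n=1000, k_avg=6.0)
mean_degree = 2 * g.number_of_edges() / g.number_of_nodes()
print("empirical <k> = {:.2f} (target 6.0)".format(mean_degree))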
def get_articulation_points_vector(self):
    """
    Returning a boolean vector representing whether the atoms of the molecular
    graph are articulation points, i.e. vertices whose removal would split the
    graph into additional connected components.
    :return:
    """
    # Articulation points vector initialization
    art_points_vector = np.zeros((self.get_n_atoms(),))

    # Converting the molecular graph to a NetworkX object
    nx_mol_graph = nx.from_numpy_array(self.get_adjacency_matrix())

    # Computing articulation points
    art_points_ids = nx.articulation_points(nx_mol_graph)

    # Setting output vector
    for art_points_id in art_points_ids:
        art_points_vector[art_points_id] = 1

    return art_points_vector
def summarize(self, paragraph, mode="clustering", keep_sentences=5):
    origin_sentence = sent_tokenize(paragraph)
    sentences = self.clearner.preprocessing(paragraph)
    sent_vectors = self.vectorizer.vectorize(sentences)  # row vector
    if mode == "clustering":
        kmeans = KMeans(n_clusters=keep_sentences)
        kmeans = kmeans.fit(sent_vectors)
        avg = []
        for j in range(keep_sentences):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                   sent_vectors)
        # top_sentences = sorted(range(n_clusters), key=lambda k: avg[k])
        top_sentences = sorted(closest)
    elif mode == "lsa":
        # input: column vector
        sent_vectors_t = sent_vectors.T
        U, S, VT = np.linalg.svd(sent_vectors_t)
        saliency_vec = np.dot(np.square(S), np.square(VT))
        top_sentences = saliency_vec.argsort()[-keep_sentences:][::-1]
        top_sentences.sort()
    else:
        sim_mat = np.zeros([len(sentences), len(sentences)])
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(
                        sent_vectors[i].reshape(1, -1),
                        sent_vectors[j].reshape(1, -1))[0][0]
        nx_graph = nx.from_numpy_array(sim_mat)
        scores = list(nx.pagerank(nx_graph).values())
        top_sentences = np.argsort(scores)[-keep_sentences:][::-1]
        top_sentences.sort()
    summary = " ".join([origin_sentence[i] for i in top_sentences])
    return summary, top_sentences
def generate_summary(file_or_url_string, out_file_string, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read the text and split it
    # article = read_article(file_or_url_string)
    article = read_file_or_web_contents(file_or_url_string)
    # article = read_article_single_or_multi_lines(file_or_url_string)
    sentences = process_sentences(article)

    # Step 2 - Generate the similarity matrix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in the similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the ranks and pick the top sentences
    sentences_list = enumerate(sentences)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Of course, output the summarized text
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Summarize Text: ")
    print(summarize_text)

    # Step 6 - Output to a file
    with open(out_file_string, 'w') as outfile:
        # for item in summarize_text:
        #     outfile.write("%s\n" % item)
        outfile.write("\n".join(str(item) for item in summarize_text))

    return summarize_text
def find_minimum_spanning_tree(graph: nx.Graph) -> nx.Graph:
    """
    find_minimum_spanning_tree is used to find the minimum spanning tree of the given graph
    :param graph: nx.Graph in which we are looking for the minimum spanning tree
    :return: minimum spanning tree as a graph
    :rtype: nx.Graph
    """
    adj_matrix = nx.to_numpy_array(graph)
    no_nodes = adj_matrix.shape[0]
    selected = np.zeros(no_nodes)
    minimum_spanning_tree = np.zeros((no_nodes, no_nodes))
    no_edge = 0
    selected[0] = True
    while no_edge < no_nodes - 1:
        minimum = sys.maxsize
        vertex_begin = 0
        vertex_end = 0
        for first_dim_iter in range(no_nodes):
            if selected[first_dim_iter]:
                for second_dim_iter in range(no_nodes):
                    if not selected[second_dim_iter] and adj_matrix[
                            first_dim_iter][second_dim_iter]:
                        if minimum > adj_matrix[first_dim_iter][
                                second_dim_iter]:
                            minimum = adj_matrix[first_dim_iter][
                                second_dim_iter]
                            vertex_begin = first_dim_iter
                            vertex_end = second_dim_iter
        minimum_spanning_tree[vertex_begin][vertex_end] = adj_matrix[
            vertex_begin][vertex_end]
        minimum_spanning_tree[vertex_end][vertex_begin] = adj_matrix[
            vertex_begin][vertex_end]
        selected[vertex_end] = True
        no_edge += 1
    return nx.from_numpy_array(minimum_spanning_tree)
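# --- Sanity-check sketch (assumed usage, not part of the original code) ---
# Compare the hand-rolled Prim-style routine above with NetworkX's built-in
# minimum_spanning_tree on a small random weighted graph: both trees should
# have the same total edge weight even if they break ties differently.
import numpy as np
import networkx as nx

rng = np.random.default_rng(0)
g = nx.complete_graph(6)
for u, v in g.edges():
    g[u][v]['weight'] = float(rng.integers(1, 10))

mst_custom = find_minimum_spanning_tree(g)
mst_builtin = nx.minimum_spanning_tree(g)
print(mst_custom.size(weight='weight'), mst_builtin.size(weight='weight'))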
def test_held_karp_ascent_asymmetric_3():
    """
    Tests the ascent method using a truly asymmetric graph with a fractional
    solution for which the solution has been brute forced.

    In this graph there are two different optimal, integral solutions (which
    are also the overall ATSP solutions) to the Held-Karp relaxation. However,
    this particular graph has two different tours of optimal value and the
    possible solutions in the held_karp_ascent function are not stored in an
    ordered data structure.
    """
    import networkx.algorithms.approximation.traveling_salesman as tsp

    np = pytest.importorskip("numpy")
    G_array = np.array([
        [0, 1, 5, 2, 7, 4],
        [7, 0, 7, 7, 1, 4],
        [4, 7, 0, 9, 2, 1],
        [7, 2, 7, 0, 4, 4],
        [5, 5, 4, 4, 0, 3],
        [3, 9, 1, 3, 4, 0],
    ])

    solution1_edges = [(0, 3), (1, 4), (2, 5), (3, 1), (4, 2), (5, 0)]
    solution2_edges = [(0, 3), (3, 1), (1, 4), (4, 5), (2, 0), (5, 2)]

    G = nx.from_numpy_array(G_array, create_using=nx.DiGraph)
    opt_hk, z_star = tsp.held_karp_ascent(G)

    assert round(opt_hk, 2) == 13.00
    # Check that the z_stars are the same
    solution1 = nx.DiGraph()
    solution1.add_edges_from(solution1_edges)
    solution2 = nx.DiGraph()
    solution2.add_edges_from(solution2_edges)
    assert nx.utils.edges_equal(z_star.edges, solution1.edges) or nx.utils.edges_equal(
        z_star.edges, solution2.edges)
def extract_links(num_nodes, connections, link_capacity_dict):
    """
    :param num_nodes:
    :param connections:
    :param link_capacity_dict:
    :return:
    """
    # An adjacency matrix representation of a graph
    grph_adjcny_mtrx = np.zeros((num_nodes, num_nodes))

    for adjacencies, connection_set in zip(grph_adjcny_mtrx, connections):
        # For row n of the adjacency matrix, corresponding to node n, set a value of 1 at each
        # position in the row corresponding to the other nodes that node n has a link to.
        adjacencies[connection_set] = 1

    # Given the adjacency matrix, construct a directed graph.
    graph = nx.from_numpy_array(grph_adjcny_mtrx, create_using=nx.DiGraph())

    # From the graph, "Edges are represented as links between nodes ...". The links is a list
    # of tuples representing the connections_lists from node n to node m, and node m to node n.
    links = list(graph.edges)

    # The link_capacities is a list of the capacities of the links, in the order of the
    # connections_lists in links.
    link_capacities = []

    # The links are duplicated from n to m and m to n, so they must have the same capacity in both
    # directions. The link_capacity_dict keys are of the form n:m and m:n.
    for link in links:
        if str(link[0]) + ':' + str(link[1]) in link_capacity_dict:
            capacity = link_capacity_dict[str(link[0]) + ':' + str(link[1])]
            link_capacities.append(capacity)
        elif str(link[1]) + ':' + str(link[0]) in link_capacity_dict:
            capacity = link_capacity_dict[str(link[1]) + ':' + str(link[0])]
            link_capacities.append(capacity)
        else:
            raise Exception(
                'Error in dataset - link not found in link capacities - ', link)

    return links, link_capacities
def cooc_2d(self):
    """
    Calculates the number of co-occurrences of the index phrases in the text
    (output as the matrix ``cooc_mat``), as well as records the first time a
    co-occurrence occurred in the text (``dist_mat``).
    """
    dim = len(self.index_labels)
    cooc_mat = np.zeros((dim, dim))
    first_cooc = np.zeros((dim, dim))
    sentences = self.windows
    timeline = {}
    for sent, num in tqdm_notebook(
            list(zip(sentences, range(1, len(sentences) + 1)))):
        joined_sent = ' '.join(sent)
        for i in range(dim):
            if self.index_labels[i] in joined_sent:
                cooc_mat[i, i] += 1
                if first_cooc[i, i] == 0:
                    first_cooc[i, i] = num
                for j in range(i + 1, dim):
                    if self.index_labels[j] in joined_sent:
                        cooc_mat[(np.array([i, j]), np.array([j, i]))] += 1
                        if first_cooc[i, j] == 0:
                            first_cooc[(np.array([i, j]), np.array([j, i]))] = num
                            timeline[tuple(self.index_labels[np.array([i, j])])] = num
    first_cooc[first_cooc == 0] = np.inf
    self.cutoff = len(sentences)
    self.dist_mat = first_cooc
    self.cooc_mat = cooc_mat
    self.timeline = timeline

    # make graph
    G = nx.from_numpy_array(cooc_mat)
    G.remove_edges_from(nx.selfloop_edges(G))
    name_mapping = {i: label for i, label in enumerate(self.index_labels)}
    nx.relabel_nodes(G, name_mapping, copy=False)
    self.graph = G
    return cooc_mat, first_cooc, timeline
def get_smoothnes_kNN(embeddings, energies, K):
    """
    kNN based graph for smoothness calc
    Args:
        embeddings ([type]): [description]
        energies ([type]): [description]
        K ([type]): [description]

    Returns:
        [type]: [description]
    """
    N = embeddings.shape[0]
    energies = energies.reshape(N, 1)

    # get kNN graph
    print("getting kNN graph")
    A_mat = kneighbors_graph(embeddings, n_neighbors=K, mode='connectivity').todense()
    # make symmetric
    A_mat = A_mat + A_mat.T
    A_mat = A_mat.clip(0, 1)
    nn_graph = nx.from_numpy_array(A_mat)

    print("computing combinatorial graph laplacian")
    L_mat = nx.laplacian_matrix(nn_graph).todense()

    # compute smoothness index
    print("computing smoothness value")
    lap_smooth = np.matmul(L_mat, energies)
    lap_smooth = np.matmul(energies.T, lap_smooth)
    signal_dot = np.matmul(energies.T, energies)
    lap_smooth = lap_smooth / signal_dot

    print("smoothness for K={}: {}".format(K, lap_smooth.item()))
    return lap_smooth.item()
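# --- Usage sketch (made-up data; assumes the function above and its imports are in scope) ---
# Compute the Laplacian smoothness of a random scalar signal over a 5-NN graph
# built from random 8-dimensional embeddings.
import numpy as np

embeddings = np.random.rand(50, 8)
energies = np.random.rand(50)
smoothness = get_smoothnes_kNN(embeddings, energies, K=5)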
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []
    path = 'Add/path'
    for filename in os.listdir(path):
        if fnmatch.fnmatch(filename, '*.story'):
            print("Filename", filename)
            file_name = os.path.join(path, filename)

            # Step 1 - Read the text and split it
            sentences = read_article(file_name)

            # Step 2 - Generate the similarity matrix across sentences
            sentence_similarity_martix = build_similarity_matrix(
                sentences, stop_words)

            # Step 3 - Rank sentences in the similarity matrix
            sentence_similarity_graph = nx.from_numpy_array(
                sentence_similarity_martix)
            scores = nx.pagerank(sentence_similarity_graph,
                                 alpha=0.85,
                                 personalization=None,
                                 max_iter=10000,
                                 tol=1e-06,
                                 nstart=None,
                                 weight='weight',
                                 dangling=None)

            # Step 4 - Sort the ranks and pick the top sentences
            ranked_sentence = sorted(
                ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
            print("Indexes of top ranked_sentence order are ", ranked_sentence)

            for i in range(top_n):
                summarize_text.append(" ".join(ranked_sentence[i][1]))

            # Step 5 - Output the summarized text
            print('\n')
            print("Summarize Text: \n", ". ".join(summarize_text))
            print('\n\n')
def DocumentSimalarity(Filepath, CSVFILE, Topic):
    df = pd.read_csv(CSVFILE, low_memory=False)
    df = df[df['topic'] == Topic]
    df = df[df['pdf_json_files'].notnull()]
    PubPaths = df['pdf_json_files'].tolist()
    df.loc[df['pmc_json_files'].notnull(), 'Full PMC'] = True
    df.loc[df['pdf_json_files'].notnull(), 'Full PDF'] = True
    df.loc[df['pmc_json_files'].notnull(), 'Full PDF'] = False  #### look at PMC if both PMC and PDF are available
    # df.loc[df['pmc_json_files'].isnull(), 'Full PMC'] = False
    Titles = df[df['Full PMC'] == True]['title'].tolist()
    Titles.extend(df[df['Full PDF'] == True]['title'].tolist())
    PubPaths = df[df['Full PMC'] == True]['pmc_json_files'].tolist()
    PDFPapers = df[df['Full PDF'] == True]['pdf_json_files'].tolist()
    PubPaths.extend(PDFPapers)
    # del df
    Corpus = []
    nlpsci = InitSciSpacy()
    nlpsci.disable_pipes("parser", "ner")
    for pub in PubPaths:
        BodyText = []
        pub = pub.split("; ")
        for p in pub:
            # p = p.replace(" ", "")
            print(p)
            SkimmedText = SkimallText(Filepath + p, nlpsci)
            if len(SkimmedText) == 0:
                continue
            BodyText.extend(SkimmedText)  #### Full text, not just the abstract
        Doc = ".".join(BodyText)
        Corpus.append(Doc)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf = tfidf_vectorizer.fit_transform(Corpus)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    SimilarityArray = cosine_similarity(tfidf, tfidf)
    print(len(PubPaths), SimilarityArray.shape)
    nx_graph = nx.from_numpy_array(SimilarityArray)  #### Convert the similarity matrix into a graph
    scores = nx.pagerank(nx_graph)
    return scores, Titles
def value_graph_laplacians():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    N = len(det_pis)
    print('n pis: {}'.format(N))
    for i in range(1):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
                  for pi in det_pis]
        Vs = np.stack(values).reshape((N, n_states))
        A = graph.mdp_topology(det_pis)
        W = np.exp(-np.linalg.norm(Vs[None, :, :] - Vs[:, None, :], ord=np.inf, axis=-1) + 1e-8)
        # mVs = np.mean(Vs, axis=0)  # n_states
        # W = np.dot((Vs - mVs), (Vs - mVs).T)
        adj = W * A
        G = nx.from_numpy_array(adj)
        pos = nx.spectral_layout(G)  # , iterations=500)

        plt.figure(figsize=(16, 16))
        nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
        plt.savefig('figs/value_graphs/{}-value_graph-{}-{}.png'.format(i, n_states, n_actions))
        plt.close()

        u, v = graph_laplacian_spectra(adj)

        plt.figure(figsize=(8, 8))
        plt.bar(range(len(u)), u)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

        plt.figure(figsize=(16, 16))
        n = 5
        for j in range(n * n):
            plt.subplot(n, n, j + 1)
            nx.draw(G, pos, node_color=u[10 * j] * v[10 * j], node_size=150)
        plt.savefig('figs/value_graphs/{}-spectra.png'.format(i, n_states, n_actions))
        plt.close()
def sample(prediction_step, n_balls, _delta_T=0.001, sample_freq=100):
    """
    This function generates training data for IVP (initial value problem) prediction.
    The sampling time interval is (_delta_T * sample_freq), i.e. a regular interval.
    Input:
        --prediction_step: int
        --n_balls
        --_delta_T: minimum time interval of the simulation
        --sample_freq:
    Notices:
        (_delta_T * sample_freq) is the time interval of the output data
    """
    # The starting sample time is chosen at random to increase sample diversity.
    sample_t0 = np.random.choice(range(10, 500))
    T = (sample_t0 + prediction_step) * sample_freq
    model = SpringSim(n_balls=n_balls, _delta_T=_delta_T)
    pos, vel, adj = model.sample_trajectory(T, sample_freq)
    pos = pos[sample_t0:]
    vel = vel[sample_t0:]

    G = nx.from_numpy_array(adj)
    edge_index = torch.LongTensor(np.array(G.edges()).T)
    edge_index = tg.utils.to_undirected(edge_index)

    pos_0 = torch.Tensor(pos[0])
    vel_0 = torch.Tensor(vel[0])
    pos_res = torch.Tensor(pos[1:])
    vel_res = torch.Tensor(vel[1:])
    delta_t = torch.arange(prediction_step) * (_delta_T * sample_freq)

    data = Data(num_nodes=n_balls,
                edge_index=edge_index,
                pos_0=pos_0.transpose(0, 1),
                pos_res=pos_res.transpose(1, 2),
                vel_0=vel_0.transpose(0, 1),
                vel_res=vel_res.transpose(1, 2),
                delta_t=delta_t)
    return data
def summarize(content, isFile):
    sentences = []
    if isFile:
        sentences = read_data(content)
    else:
        sentences.append(sent_tokenize(content))
    word_embeddings = get_word_embeddings()
    output = []
    for item in sentences:
        cleaned_sentences = clean_sentences(item)
        sentence_vectors = get_sentence_vectors(cleaned_sentences, word_embeddings)

        # similarity matrix
        sim_mat = np.zeros([len(item), len(item)])
        for i in range(len(item)):
            for j in range(len(item)):
                if i != j:
                    sim_mat[i][j] = \
                        cosine_similarity(sentence_vectors[i].reshape(1, 100),
                                          sentence_vectors[j].reshape(1, 100))[0, 0]

        nx_graph = nx.from_numpy_array(sim_mat)
        scores = nx.pagerank(nx_graph)

        ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(item)), reverse=True)

        # Specify number of sentences to form the summary
        sn = 3

        # Generate summary
        result = []
        for i in range(sn):
            # print(ranked_sentences[i][1])
            if len(ranked_sentences) > i:
                result.append(ranked_sentences[i][1])
        output.append(result)
    return output
def text_rank(sentences: list, word_embeddings: dict) -> dict:
    """
    Input: List, Dict
    Output: Dict

    Takes a list of sentences and GloVe word embeddings as input and returns a
    dictionary with each sentence index as key and its rank as value. The
    ranking is done based on the PageRank algorithm.
    """
    # Clean sentences for the PageRank algorithm.
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [remove_stopwords(r) for r in clean_sentences]

    # Replace each word with its GloVe embedding. The sentence vector is the
    # average of the embeddings of all words in that sentence.
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum(
                [word_embeddings.get(w, np.zeros((100,))) for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)

    # Initialize a similarity matrix for pairs of sentences
    sim_mat = np.zeros([len(sentences), len(sentences)])

    # Calculate the cosine similarity for each pair of sentences
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 100),
                    sentence_vectors[j].reshape(1, 100))[0, 0]

    # Create a PageRank graph using the similarity matrix
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    return scores
def generate_summary(text, top_n):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read the text and split it
    sentences = format_text(text)

    # Step 2 - Generate the similarity matrix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in the similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the ranks and pick the top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Of course, output the summarized text
    print("Summarize Text: ", ".\n\nSAD ".join(summarize_text))
def ExponentialTwoGraph(size: int) -> nx.DiGraph:
    """Generate a graph topology in which each node is connected only to nodes
    whose index differs from its own by a power of 2.

    Example: An ExponentialTwoGraph with 12 nodes:

    .. plot::
        :context: close-figs

        >>> import networkx as nx
        >>> from bluefog.common import topology_util
        >>> G = topology_util.ExponentialTwoGraph(12)
        >>> nx.draw_circular(G)
    """
    assert size > 0
    x = np.array([1.0 if i & (i - 1) == 0 else 0 for i in range(size)])
    x /= x.sum()
    topo = np.empty((size, size))
    for i in range(size):
        topo[i] = np.roll(x, i)
    G = nx.from_numpy_array(topo, create_using=nx.DiGraph)
    return G
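# --- Usage sketch (assumed, mirroring the doctest above) ---
# For 12 nodes, node 0 should point to itself (the diagonal weight is nonzero)
# and to the nodes at power-of-two offsets 1, 2, 4, and 8 on the ring.
G = ExponentialTwoGraph(12)
print(sorted(G.successors(0)))  # expected: [0, 1, 2, 4, 8]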
def as_graph(self, directed=True):
    if self.normalized_difference.ndim > 2:
        raise MarkovError("You can only graph one-step chains.")

    try:
        import networkx as nx
    except ImportError:
        nx = None

    if nx is None:
        print("Please install networkx with `pip install networkx`.")
        return

    if directed:
        alg = nx.DiGraph
    else:
        alg = nx.Graph

    G = nx.from_numpy_array(self.normalized_difference, create_using=alg)
    nx.set_node_attributes(G, self._state_dict, 'state')
    return G
def test_get_metrics(metric):
    """
    Test various wrappers for getting nx graph metrics
    """
    base_dir = str(Path(__file__).parent/"examples")
    est_path = f"{base_dir}/miscellaneous/sub-0021001_rsn-Default_nodetype-parc_model-sps_template-MNI152_T1_thrtype-DENS_thr-0.19.npy"
    in_mat = np.load(est_path)
    G = nx.from_numpy_array(in_mat)
    ci = np.ones(in_mat.shape[0])
    metric_list_names = []
    net_met_val_list_final = []
    if metric == 'participation':
        metric_list_names, net_met_val_list_final = \
            netstats.get_participation(in_mat, ci, metric_list_names,
                                       net_met_val_list_final)
        assert len(metric_list_names) == len(netstats.participation_coef(in_mat, ci)) + 1
        assert len(net_met_val_list_final) == len(netstats.participation_coef(in_mat, ci)) + 1
    elif metric == 'diversity':
        metric_list_names, net_met_val_list_final = \
            netstats.get_diversity(in_mat, ci, metric_list_names,
                                   net_met_val_list_final)
        assert len(metric_list_names) == np.shape(netstats.diversity_coef_sign(in_mat, ci))[1] + 1
        assert len(net_met_val_list_final) == np.shape(netstats.diversity_coef_sign(in_mat, ci))[1] + 1
    elif metric == 'local_efficiency':
        metric_list_names, net_met_val_list_final = \
            netstats.get_local_efficiency(G, metric_list_names,
                                          net_met_val_list_final)
        assert len(metric_list_names) == len(netstats.local_efficiency(G)) + 1
        assert len(net_met_val_list_final) == len(netstats.local_efficiency(G)) + 1
    elif metric == 'comm_centrality':
        metric_list_names, net_met_val_list_final = \
            netstats.get_comm_centrality(G, metric_list_names,
                                         net_met_val_list_final)
        assert len(metric_list_names) == len(nx.algorithms.communicability_betweenness_centrality(G)) + 1
        assert len(net_met_val_list_final) == len(nx.algorithms.communicability_betweenness_centrality(G)) + 1
    elif metric == 'rich_club_coeff':
        metric_list_names, net_met_val_list_final = \
            netstats.get_rich_club_coeff(G, metric_list_names,
                                         net_met_val_list_final)
        assert len(metric_list_names) == len(nx.algorithms.rich_club_coefficient(G)) + 1
        assert len(net_met_val_list_final) == len(nx.algorithms.rich_club_coefficient(G)) + 1