def get_hits(self):
    """
    Gets the authority and hub scores for each node.
    Not included in features_df because it can take some time to calculate.
    """
    # nx.hits returns (hubs, authorities); merge the two into one dataframe on "node".
    hubs, authorities = nx.hits(self.graph, max_iter=1000)
    return sort_dict_values(
        authorities, ["node", "hits_authority"], "hits_authority"
    ).merge(
        sort_dict_values(hubs, ["node", "hits_hub"], "hits_hub"), on="node")
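# Usage sketch (hedged): assuming `net` is an instance of this class (`net` is an
# illustrative name, not part of the repo):
#
#     hits_df = net.get_hits()
#     hits_df.head()  # columns: node, hits_authority, hits_hub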
def get_centrality(self):
    """
    Gets the eigenvector centrality of each node.
    """
    return sort_dict_values(
        nx.eigenvector_centrality(self.graph, weight="weight"),
        ["node", "centrality"],
        "centrality")
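# Note: nx.eigenvector_centrality uses power iteration and raises
# PowerIterationFailedConvergence if it does not converge within max_iter
# (default 100). A hedged fallback for large graphs, should that happen:
#
#     try:
#         scores = nx.eigenvector_centrality(self.graph, weight="weight")
#     except nx.PowerIterationFailedConvergence:
#         scores = nx.eigenvector_centrality(
#             self.graph, weight="weight", max_iter=1000)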
def get_reciprocity(self):
    """
    Gets the reciprocity score for each node.

    Note: Reciprocity in the context of Wikipedia articles can be a
    misleading metric. The intended use of this method is to be called in
    the `get_adjusted_reciprocity` method, which accounts for how many
    connections a node has.
    """
    return sort_dict_values(
        nx.reciprocity(self.graph, self.graph.nodes),
        ["node", "reciprocity"],
        "reciprocity")
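# The `get_adjusted_reciprocity` referenced above is not shown in this section.
# A minimal sketch of one plausible adjustment (an assumption, not necessarily
# the repo's actual implementation): scale each node's reciprocity by its
# degree share, so low-degree nodes with a single reciprocal edge do not
# dominate the ranking.
#
#     def get_adjusted_reciprocity(self):
#         merged = self.get_reciprocity().merge(self.get_degrees(), on="node")
#         merged["adjusted_reciprocity"] = (
#             merged["reciprocity"] * merged["degree"] / merged["degree"].max())
#         return merged.sort_values("adjusted_reciprocity", ascending=False)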
def get_dispersion(self, comparison_node=None, max_nodes=25_000):  # deprecated
    """
    Gets the dispersion of the central node compared to each other node.
    Deprecated, and not included in features_df because it can take a long
    time to calculate.
    """
    if not comparison_node:
        comparison_node = self.entry
    if max_nodes is None or len(self.graph.nodes) <= max_nodes:
        return sort_dict_values(
            nx.dispersion(self.graph, u=comparison_node),
            ["node", "dispersion"],
            "dispersion")
    # If the network is too large, perform the calculation on the ego graph
    # of the entry node instead.
    ego = self.create_ego()
    return sort_dict_values(
        nx.dispersion(ego, u=comparison_node),
        ["node", "dispersion"],
        "dispersion")
def get_degrees(self):
    """
    Gets the degree of each node: the total count of its edges, both
    inbound and outbound.
    """
    return sort_dict_values(
        dict(self.graph.degree()),
        ["node", "degree"],
        "degree")
def get_shared_categories_with_source(self):
    """
    Counts the categories each node shares with the entry node.
    """
    cat_matches = {}
    for node in self.graph.nodes:
        cat_matches[node] = compare_categories(
            self.entry, node, self.categories, starting_count=0)
    return sort_dict_values(
        cat_matches,
        ["node", "category_matches_with_source"],
        "category_matches_with_source",
        ascending=False)
def get_primary_nodes(self):
    """
    Marks a node as a primary node if it appears in the article introduction
    or the See Also section. Primary nodes are considered to be more related
    to the main topics than others.
    """
    # Start every graph node at 0, then count appearances in
    # self.primary_nodes so that duplicates (e.g. linked in both the intro
    # and See Also) get a heavier weight.
    primary_nodes = {node: 0 for node in self.graph.nodes}
    for node in self.primary_nodes:
        if node in primary_nodes:
            primary_nodes[node] += 1
    return sort_dict_values(
        primary_nodes,
        ["node", "primary_link"],
        "primary_link",
        ascending=False)
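# Worked example (illustrative names): if self.primary_nodes is
# ["Graph theory", "PageRank", "Graph theory"], the returned scores are
# Graph theory -> 2, PageRank -> 1, and every other graph node -> 0.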
def get_jaccard_similarity(self):
    """
    Calculates the Jaccard similarity score for each node compared to the
    entry node, using the sets of nodes that link to each article.
    """
    entry_in_edges = {x[0] for x in self.graph.in_edges(nbunch=self.entry)}
    jaccard_scores = {}
    for node in self.graph.nodes:
        target_in_edges = {x[0] for x in self.graph.in_edges(nbunch=node)}
        in_edge_intersect = len(entry_in_edges & target_in_edges)
        in_edge_union = len(entry_in_edges | target_in_edges)
        # Guard against division by zero when neither article has inbound links.
        jaccard_scores[node] = (
            in_edge_intersect / in_edge_union if in_edge_union else 0)
    return sort_dict_values(
        jaccard_scores,
        ["node", "jaccard_similarity"],
        "jaccard_similarity",
        ascending=False)
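# Worked example (illustrative): if the entry is linked from {A, B, C} and a
# target node is linked from {B, C, D}, the intersection has 2 nodes and the
# union has 4, so the target's Jaccard similarity is 2 / 4 = 0.5.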
def get_shared_neighbors_with_entry_score(self):
    """
    Scores each node by the number of neighbors it shares with the entry
    node over the total number of unique neighbors across both nodes.
    """
    entry_neighbors = set(nx.all_neighbors(self.graph, self.entry))
    shared_neighbors_score = {}
    for node in self.graph.nodes:
        target_neighbors = set(nx.all_neighbors(self.graph, node))
        # By inclusion-exclusion, |A| + |B| - |A union B| == |A intersect B|.
        shared_neighbors = len(entry_neighbors & target_neighbors)
        # Score is neighbors shared over how many unique neighbors could
        # possibly have been shared.
        shared_neighbors_score[node] = (
            shared_neighbors / len(entry_neighbors | target_neighbors))
    return sort_dict_values(
        shared_neighbors_score,
        ["node", "shared_neighbors_with_entry_score"],
        "shared_neighbors_with_entry_score",
        ascending=False)
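# Worked example (illustrative): if the entry's neighbors are {A, B, C} and a
# node's neighbors are {B, C, D, E}, then 3 + 4 - 5 = 2 neighbors are shared
# and the score is 2 / 5 = 0.4. Note that, unlike get_jaccard_similarity,
# this score uses nx.all_neighbors and so counts both in- and out-edges.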
def get_dominator_counts(self, source=None):
    """
    Gets the local dominator score for each node.
    Not included in features_df because it can take some time to calculate.
    """
    if not source:
        source = self.entry
    dom_dict = nx.algorithms.dominance.immediate_dominators(
        self.graph, start=source)
    # Count how many nodes each node immediately dominates.
    dom_counts = {}
    for dominator in dom_dict.values():
        dom_counts[dominator] = dom_counts.get(dominator, 0) + 1
    # Nodes that dominate nothing still get an explicit score of 0.
    for node in self.graph.nodes:
        if node not in dom_counts:
            dom_counts[node] = 0
    return sort_dict_values(
        dom_counts,
        ["node", "immediate_dominator_count"],
        "immediate_dominator_count")
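# Background (hedged sketch): nx.immediate_dominators maps every node reachable
# from `start` to the last node that all paths from `start` must pass through.
# Counting how often each node appears as an immediate dominator gives the
# local dominator score above. A toy check on a diamond graph:
#
#     g = nx.DiGraph([("s", "a"), ("s", "b"), ("a", "c"), ("b", "c")])
#     nx.immediate_dominators(g, "s")
#     # {'s': 's', 'a': 's', 'b': 's', 'c': 's'}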