Example #1
0
 def get_hits(self):
     """
     Compute the HITS hub and authority score of every node in the graph.

     The two score tables are built separately with sort_dict_values and then
     joined on the "node" column, authority table on the left. This is kept out
     of features_df because the calculation can take some time.
     """
     # networkx returns the pair as (hubs, authorities).
     hub_scores, authority_scores = nx.algorithms.link_analysis.hits_alg.hits(
         self.graph, max_iter=1000)

     authority_table = sort_dict_values(authority_scores,
                                        ['node', 'hits_authority'],
                                        'hits_authority')
     hub_table = sort_dict_values(hub_scores, ['node', 'hits_hub'],
                                  'hits_hub')
     return authority_table.merge(hub_table, on="node")
Example #2
0
 def get_centrality(self):
     """
     Compute the eigenvector centrality of every node in the graph.

     Edge strengths are read from the "weight" edge attribute. The scores are
     handed to sort_dict_values with the columns "node" and "centrality".
     """
     centrality_by_node = nx.eigenvector_centrality(self.graph,
                                                    weight="weight")
     column_names = ["node", "centrality"]
     return sort_dict_values(centrality_by_node, column_names, "centrality")
Example #3
0
 def get_reciprocity(self):
     """
     Compute the reciprocity score of every node in the graph.

     Note: reciprocity in the context of Wikipedia articles can be a misleading
     metric. This method is intended to be called from
     `get_adjusted_reciprocity`, which accounts for how many connections a node
     has.
     """
     # Passing the node collection makes networkx return one score per node.
     reciprocity_by_node = nx.algorithms.reciprocity(self.graph,
                                                     self.graph.nodes)
     column_names = ['node', 'reciprocity']
     return sort_dict_values(reciprocity_by_node, column_names, 'reciprocity')
Example #4
0
    def get_dispersion(self,
                       comparison_node=None,
                       max_nodes=25_000):  # deprecated
        """
        Compute the dispersion between `comparison_node` and every other node.

        Deprecated, and kept out of features_df because it can take a long time
        to calculate.

        comparison_node: node to measure from; any falsy value falls back to
            self.entry.
        max_nodes: size limit for running on the full graph. When the graph has
            more nodes than this, the graph returned by self.create_ego() is
            used instead. None disables the limit.
        """
        comparison_node = comparison_node or self.entry

        # If the network is too large, fall back to the graph from create_ego()
        # (the entry node's ego graph, per the original author's note).
        within_limit = max_nodes is None or len(self.graph.nodes) <= max_nodes
        target_graph = self.graph if within_limit else self.create_ego()

        dispersion_scores = nx.dispersion(target_graph, u=comparison_node)
        return sort_dict_values(dispersion_scores, ['node', 'dispersion'],
                                'dispersion')
Example #5
0
 def get_degrees(self):
     """
     Report the degree of every node in the graph.

     The degree is whatever self.graph.degree() reports for the node; on a
     directed networkx graph that counts inbound and outbound edges together.
     """
     degree_by_node = dict(self.graph.degree())
     column_names = ["node", "degree"]
     return sort_dict_values(degree_by_node, column_names, "degree")
Example #6
0
 def get_shared_categories_with_source(self):
     """
     Score every node by its category overlap with the entry node.

     Each node is passed to compare_categories together with self.entry and
     self.categories, starting from a count of 0. The results are handed to
     sort_dict_values with ascending=False.
     """
     matches_by_node = {
         node: compare_categories(self.entry,
                                  node,
                                  self.categories,
                                  starting_count=0)
         for node in self.graph.nodes
     }
     column_names = ['node', 'category_matches_with_source']
     return sort_dict_values(matches_by_node,
                             column_names,
                             'category_matches_with_source',
                             ascending=False)
Example #7
0
    def get_primary_nodes(self):
        """
        Flag every graph node as a primary node (1) or not (0).

        A node is primary when it is contained in self.primary_nodes. Per the
        original author's description these are the links that appear in the
        article introduction or the See Also section, and they are considered
        more related to the main topic than other nodes.

        NOTE(review): an earlier version had a branch meant to give heavier
        weight to nodes listed in both sections. It could never influence the
        result, because graph nodes are unique and the value was always
        overwritten afterwards, so it has been removed. The flag is, and always
        was, binary. If weighting is wanted, count occurrences in
        self.primary_nodes instead, after confirming what type it is.
        """
        primary_nodes = {
            node: 1 if node in self.primary_nodes else 0
            for node in self.graph.nodes
        }
        return sort_dict_values(primary_nodes, ["node", "primary_link"],
                                "primary_link",
                                ascending=False)
Example #8
0
    def get_jaccard_similarity(self):
        """
        Calculates the Jaccard similarity score for each node compared to the entry node.

        The sets compared are the sources of each node's inbound edges: the
        score is the number of sources shared with the entry node divided by
        the number of distinct sources across both nodes.

        When neither the entry node nor the compared node has any inbound
        edge, the union is empty and the score is defined as 0.0 rather than
        raising ZeroDivisionError.
        """
        # Source node of every edge pointing at the entry node.
        entry_in_edges = {
            edge[0] for edge in self.graph.in_edges(nbunch=self.entry)
        }
        jaccard_scores = {}
        for node in self.graph.nodes:
            target_in_edges = {
                edge[0] for edge in self.graph.in_edges(nbunch=node)
            }
            in_edge_union = len(entry_in_edges | target_in_edges)
            if not in_edge_union:
                # No inbound edges on either side: nothing to compare.
                jaccard_scores[node] = 0.0
                continue

            in_edge_intersect = len(entry_in_edges & target_in_edges)
            jaccard_scores[node] = in_edge_intersect / in_edge_union

        return sort_dict_values(jaccard_scores, ["node", "jaccard_similarity"],
                                "jaccard_similarity",
                                ascending=False)
Example #9
0
    def get_shared_neighbors_with_entry_score(self):
        """
        Score each node by the share of neighbors it has in common with the entry node.

        The score is the number of neighbors shared with the entry node OVER
        the number of distinct neighbors across both nodes. Neighbors are taken
        from nx.all_neighbors, i.e. both predecessors and successors on a
        directed graph.

        When neither node has any neighbor the denominator would be zero; the
        score is then defined as 0.0 rather than raising ZeroDivisionError.
        """
        entry_neighbors = set(nx.all_neighbors(self.graph, self.entry))
        shared_neighbors_score = {}
        for node in self.graph.nodes:
            target_neighbors = set(nx.all_neighbors(self.graph, node))
            # How many unique neighbors could possibly have been shared.
            possible_neighbors = len(entry_neighbors | target_neighbors)
            if not possible_neighbors:
                # Both nodes are isolated: nothing could be shared.
                shared_neighbors_score[node] = 0.0
                continue

            shared_neighbors = len(entry_neighbors & target_neighbors)
            shared_neighbors_score[node] = (shared_neighbors /
                                            possible_neighbors)

        return sort_dict_values(shared_neighbors_score,
                                ["node", "shared_neighbors_with_entry_score"],
                                "shared_neighbors_with_entry_score",
                                ascending=False)
Example #10
0
    def get_dominator_counts(self, source=None):
        """
        Count, for every node, how many nodes have it as their immediate dominator.

        Dominators are computed by networkx starting from `source`; any falsy
        `source` falls back to self.entry. Nodes that are nobody's immediate
        dominator get a count of 0. This is kept out of features_df because it
        can take some time to calculate.
        """
        source = source or self.entry

        immediate_doms = nx.algorithms.dominance.immediate_dominators(
            self.graph, start=source)

        # Tally how often each node appears as an immediate dominator.
        dom_counts = {}
        for dominator in immediate_doms.values():
            dom_counts[dominator] = dom_counts.get(dominator, 0) + 1

        # Make sure every graph node is present, even with no dominated nodes.
        for node in self.graph.nodes:
            dom_counts.setdefault(node, 0)

        return sort_dict_values(dom_counts,
                                ['node', 'immediate_dominator_count'],
                                'immediate_dominator_count')