Example #1
 def test_scipy_pagerank(self):
     G = self.G
     p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08)
     for n in G:
         assert_almost_equal(p[n], G.pagerank[n], places=4)
     personalize = dict((n, random.random()) for n in G)
     p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08,
                                 personalization=personalize)
Example #2
 def test_scipy_pagerank(self):
     G = self.G
     p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08)
     for n in G:
         assert_almost_equal(p[n], G.pagerank[n], places=4)
     personalize = dict((n, random.random()) for n in G)
     p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08,
                                 personalization=personalize)
Example #3
    def test_scipy_pagerank(self):
        G = self.G
        p = nx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08)
        for n in G:
            assert p[n] == pytest.approx(G.pagerank[n], abs=1e-4)
        personalize = {n: random.random() for n in G}
        p = nx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08, personalization=personalize)

        nstart = {n: random.random() for n in G}
        p = nx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08, nstart=nstart)
        for n in G:
            assert p[n] == pytest.approx(G.pagerank[n], abs=1e-4)
Example #4
    def test_scipy_pagerank(self):
        G = self.G
        try:
            import scipy
        except ImportError:
            raise SkipTest("scipy not available.")
        p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08)
        for n in G:
            assert_almost_equal(p[n], G.pagerank[n], places=4)
        personalize = dict((n, random.random()) for n in G)
        p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08, personalization=personalize)

        assert_raises(networkx.NetworkXError, networkx.pagerank_scipy, G, max_iter=0)
Example #5
    def test_scipy_pagerank(self):
        G = self.G
        p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08)
        for n in G:
            assert almost_equal(p[n], G.pagerank[n], places=4)
        personalize = {n: random.random() for n in G}
        p = networkx.pagerank_scipy(
            G, alpha=0.9, tol=1.0e-08, personalization=personalize
        )

        nstart = {n: random.random() for n in G}
        p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08, nstart=nstart)
        for n in G:
            assert almost_equal(p[n], G.pagerank[n], places=4)
Example #6
    def test_scipy_pagerank(self):
        G=self.G
        try:
            import scipy
        except ImportError:
            raise SkipTest('scipy not available.')
        p=networkx.pagerank_scipy(G,alpha=0.9,tol=1.e-08)
        for n in G:
            assert_almost_equal(p[n],G.pagerank[n],places=4)
        personalize = dict((n,random.random()) for n in G)
        p=networkx.pagerank_scipy(G,alpha=0.9,tol=1.e-08,
                                  personalization=personalize)

        assert_raises(networkx.NetworkXError,networkx.pagerank_scipy,G,
                      max_iter=0)
Example #7
def ExtractSentence(text,k):
    "根据文本内容获得句子重要性排名"
    print('开始句子重要性排名')

    sent_tokens = nlp.sent_tokenize(text)

    # Possible extension: add constraints, e.g. drop a sentence if it contains fewer entities than a threshold
    sent_tokens = filter_sent(sent_tokens,1)

    # Build the graph structure
    text_graph = graph_construct(sent_tokens)

    # There are three PageRank variants: the plain one, a numpy-based one, and the scipy sparse-matrix one used below
    print('start to calculate')
    #cal_gr_page_rank = nx.pagerank(text_graph,weight='weight')
    cal_gr_page_rank = nx.pagerank_scipy(text_graph)
    print('ended')

    # Sort by the final score and take the top K; to be extended so that the selected sentences total no more than 250 words
    sents = sorted(cal_gr_page_rank,key = cal_gr_page_rank.get, reverse=True)

    kth = get_sum_sents(sents,250)
    #topK

    str_tmp_list = []
    for sidx in range(kth):
        str_tmp = sents[sidx]
        str_tmp += '[%.4f]'%(cal_gr_page_rank[sents[sidx]])
        str_tmp_list.append(str_tmp)
    print_score(str_tmp_list)

    return ' '.join(sents[:kth])
Example #8
    def candidate_weighting(self, threshold=0.25, method='average'):
        """ Candidate weight calculation using random walk.

            Args:
                threshold (float): the minimum similarity for clustering,
                    defaults to 0.25.
                method (str): the linkage method, defaults to average.
        """

        # cluster the candidates
        self.topic_clustering(threshold=threshold, method=method)

        # build the topic graph
        self.build_topic_graph()

        # compute the word scores using random walk
        w = nx.pagerank_scipy(self.graph)

        # loop through the topics
        for i, topic in enumerate(self.topics):

            # get first occurring candidate from topic
            offsets = [self.candidates[t].offsets[0] for t in topic]
            first = offsets.index(min(offsets))
            self.weights[topic[first]] = w[i]
Example #9
def compute_centrality(star_dict, edge_dict):
    
    #build up a nx graph
    galaxy = networkx.Graph()
    for v, vertex in star_dict.iteritems():
        galaxy.add_node(v)
    
    for v, neighbors in edge_dict.iteritems():
        for n in neighbors:
            galaxy.add_edge(v,n)
            
    print "betweenness"
    betweenness_map = networkx.current_flow_betweenness_centrality(galaxy)
    betweenness_map = normalize(betweenness_map)
    
    for key, value in betweenness_map.iteritems():
        star_dict[key]['betweenness'] = value
        
    print "closeness"
    closeness_map = networkx.current_flow_closeness_centrality(galaxy)
    closeness_map = normalize(closeness_map)
    
    for key, value in closeness_map.iteritems():
        star_dict[key]['closeness'] = value
        

    print "pagerank"
    pagerank_map = networkx.pagerank_scipy(galaxy)
    pagerank_map = normalize(pagerank_map)
    
    for key, value in pagerank_map.iteritems():
        star_dict[key]['pagerank'] = value
Example #10
def main():
	print '- updating pagerank :'

	# DB-CONNECT
	conn = sqlite3.connect(db_path)
	c = conn.cursor()
	
	# DB-EXECUTE
	# get subgraph
	r = c.execute("SELECT blog_name, source_title FROM subgraph")
	graph = {key : value.split() for (key, value) in r}
	
	G = nx.DiGraph(graph)
	pr = nx.pagerank_scipy(G, alpha=0.85)
	
	# normalise
	ranks = pr.values()
	rank_min, rank_max = min(ranks), max(ranks)
	for k in pr: pr[k] = round(((pr[k] - rank_min) / (rank_max - rank_min)), 4)
	
	# update table
	for blog in pr:
		c.execute("UPDATE tumblr_model SET pagerank=? WHERE blog_name=?", [pr[blog], blog])
	
	# DB-COMMIT AND CLOSE
	conn.commit()
	conn.close()
	
	# sorting, optional
	pr_sorted = sorted(pr.items(), key=operator.itemgetter(1))
	print "    %s is the most popular domain in the network"%pr_sorted[-1:][0][0] 
	print '' 
Example #11
    def candidate_weighting(self, window=10, pos=None, normalized=False):
        """ Candidate weight calculation using random walk.

            Args:
                window (int): the window within the sentence for connecting two
                    words in the graph, defaults to 10.
                pos (set): the set of valid pos for words to be considered as
                    nodes in the graph, defaults to (NN, NNS, NNP, NNPS, JJ,
                    JJR, JJS).
                normalized (False): normalize keyphrase score by their length,
                    defaults to False
        """

        # define default pos tags set
        if pos is None:
            pos = set(['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

        # build the word graph
        self.build_word_graph(window=window, pos=pos)

        # compute the word scores using random walk
        w = nx.pagerank_scipy(self.graph)

        # loop through the candidates
        for k in self.candidates.keys():
            tokens = self.candidates[k].lexical_form
            self.weights[k] = sum([w[t] for t in tokens])
            if normalized:
                self.weights[k] /= len(tokens)
Example #12
    def get_keyphrases(self, document, speakers = None, use_main = False, topX = 5, maxlen = 50, include_scores = False):
        """ Get keyphrases from a document using LexRank
        
        Speakers, use_main use case similar to in keynet.py """
        self.document = document
        
        if speakers:
            main, others = parse_doc_speakers(self.document, speakers)
            if use_main:
                self.document = main
            else:
                self.document = others
                
        self.init_counts()
        
        network_graph = self._build_graph()

        ranked = nx.pagerank_scipy(network_graph)
        ranked = [(val, text) for val, text in ranked.items()]
                
        sort_ranked = sorted(ranked, key=lambda t: t[1], reverse=True)

        if maxlen:
            sort_ranked = [t for t in sort_ranked if len(t[0].split()) < maxlen]

        if not include_scores:
            sort_ranked = [s[0] for s in sort_ranked]
            
        return sort_ranked[:topX]
Example #13
    def candidate_weighting(self, window=10, pos=None, normalized=False):
        """ Candidate weight calculation using random walk.

            Args:
                window (int): the window within the sentence for connecting two
                    words in the graph, defaults to 10.
                pos (set): the set of valid pos for words to be considered as
                    nodes in the graph, defaults to (NN, NNS, NNP, NNPS, JJ,
                    JJR, JJS).
                normalized (False): normalize keyphrase score by their length,
                    defaults to False
        """

        # define default pos tags set
        if pos is None:
            pos = set(['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

        # build the word graph
        self.build_word_graph(window=window, pos=pos)

        # compute the word scores using random walk
        w = nx.pagerank_scipy(self.graph, alpha=0.85, weight='weight')

        # loop through the candidates
        for k in self.candidates.keys():
            tokens = self.candidates[k].lexical_form
            self.weights[k] = sum([w[t] for t in tokens])
            if normalized:
                self.weights[k] /= len(tokens)
Example #14
    def findBestChilds(self, nodes, k=4):
        n = len(nodes)
        node_list = dict()
        i = 0
        for node in nodes:
            node_list[i] = node
            i += 1

        self.stateGraph = np.zeros(shape=(n, n), dtype=np.byte)

        [self.buildSubGraph(i, n, node_list) for i in range(n)]

        try:
            self.logger.debug(len(self.stateGraph))
            h = (nx.pagerank_scipy(nx.Graph(self.stateGraph),
                                   max_iter=100,
                                   tol=1e-07))

            res = list(sorted(h, key=h.__getitem__, reverse=True))

            important = res[:k]
        except:
            self.logger.error('Graph is empty')
            self.logger.error(sys.exc_info())

        dereffed_list = set([self.sub(i, node_list) for i in important])
        dereffed_list.discard(0)
        dereffed_list.discard(1)
        return list(dereffed_list)
Example #15
def textrank_tagger(tokenized, w2v_model):
    """
    TextRank based on cosine similarity between TF-IDF-reweighted w2v sentence sums.
    """
    idf_weights = idf_weight_dict(tokenized)

    original_indices, sent_representations = w2v_sentence_sums_tfidf(
        tokenized, w2v_model, idf_weights)

    distance_matrix = pairwise_distances(sent_representations, metric='cosine')
    similarity_matrix = np.subtract(1, distance_matrix)

    # Use PageRank algorithm on similarity matrix
    nx_graph = nx.from_numpy_matrix(similarity_matrix)

    # Convergence of graph (tolerance from TextRank paper)
    scores = nx.pagerank_scipy(nx_graph, max_iter=100, tol=1e-04)

    # For now, the number of summary-worthy sentences is set to ~33% of the sentences.
    cutoff = len(tokenized) // 3
    sorted_sentences = sorted([(scores[i], original_indices[i])
                               for i, s in enumerate(tokenized)],
                              reverse=True)
    summary_indices = [index for score, index in sorted_sentences[:cutoff]]
    labels = [
        1 if i in summary_indices else 0 for i, _ in enumerate(tokenized)
    ]

    return labels
Example #16
def text_summary(doc, sent_count):
    """
    Summarizes given text using word vectors and graph-based ranking.

    Args:
        doc: a spacy.Doc object
        sent_count: number (/ratio) of sentences in the summary
    Returns:
        Text summary
    """
    sents = list(doc.sents)
    sent_graph = networkx.Graph()
    sent_graph.add_nodes_from(idx for idx, sent in enumerate(sents))

    for i, j in it.combinations(sent_graph.nodes_iter(), 2):
        # Calculate cosine similarity of two sentences transformed to the interval [0,1]
        similarity = (sents[i].similarity(sents[j]) + 1) / 2
        if similarity != 0:
            sent_graph.add_edge(i, j, weight=similarity)

    sent_ranks = networkx.pagerank_scipy(sent_graph)

    if 0 < sent_count < 1:
        sent_count = round(sent_count * len(sent_ranks))
    sent_count = int(sent_count)

    top_indices = top_keys(sent_count, sent_ranks)

    # Return the key sentences in chronological order
    top_sents = map(lambda i: sents[i], sorted(top_indices))

    return format_output(doc, list(top_sents))
Example #17
    def get_keyphrases(self, document, include_scores=False, maxlen=None):
        """ Get keyphrases from a document using LexRank
        
        Speakers, use_main use case similar to in keynet.py """

        # Incorporate document being considered
        self.document = document

        # Initialize document counts, tf-idf scores
        self.init_counts()

        # Build graph of sentences, edges are cossim
        network_graph = self._build_graph()

        # Run PageRank on the graph
        ranked = nx.pagerank_scipy(network_graph)
        ranked = [(val, text) for val, text in ranked.items()]

        # Sort results by score
        sort_ranked = sorted(ranked, key=lambda t: t[1], reverse=True)

        # Keep only results up to some maximum length in tokens
        if maxlen:
            sort_ranked = [
                t for t in sort_ranked if len(t[0].split()) < maxlen
            ]

        # For outputting without scores
        if not include_scores:
            sort_ranked = [s[0] for s in sort_ranked]

        return sort_ranked
Example #18
    def candidate_weighting(self, doc: Doc) -> List[Tuple[Candidate, float]]:
        """Compute the weighted score of each keyword candidate.

        Args:
            doc (Doc): doc.

        Returns:
            list of tuples, candidate with a score.
        """
        res = []
        C = doc._.kw_candidates
        G = self.build_graph(doc)
        W = nx.pagerank_scipy(G, alpha=self.cfg["alpha"], tol=self.cfg["tol"], weight="weight")

        for i, topic in nx.get_node_attributes(G, "C").items():
            offsets = [C[t].offsets[0] for t in topic]
            if self.cfg["heuristic"] == "frequent":
                freq = [len(C[t].surface_forms) for t in topic]
                indexes = [j for j, f in enumerate(freq) if f == max(freq)]
                indexes_offsets = [offsets[j] for j in indexes]
                most_frequent = offsets.index(min(indexes_offsets))
                res.append((C[topic[most_frequent]], W[i]))
            else:
                first = offsets.index(min(offsets))
                res.append((C[topic[first]], W[i]))

        res.sort(key=lambda x: x[1], reverse=True)
        return res
Example #19
 def test_rank_time(self):
     from pygrank.algorithms.pagerank import PageRank as ranker
     from pygrank.algorithms.utils import preprocessor
     import scipy.stats
     nx_time = list()
     test_time = list()
     repeats = 50
     for _ in range(repeats):
         G = create_test_graph()
         tic = time.clock()
         ranker(to_scipy=preprocessor('col')).rank(G)
         test_time.append(time.clock()-tic)
         tic = time.clock()
         nx.pagerank_scipy(G)
         nx_time.append(time.clock()-tic)
     self.assertLessEqual(scipy.stats.ttest_ind(nx_time, test_time)[1], 0.001, msg="PageRank time comparable to nx with p-value<0.001")
Example #20
def pre_calculate(X, k=100, ntop=50, calculate_important=None):
    """ Calculate the k-nearest neighbors matrix
        Calculate Hubs or Pagerank for each points
    """
    from sklearn.neighbors import NearestNeighbors
    model = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
    model.fit(X)
    distances, indices = model.kneighbors()

    if calculate_important is None:
        top_important = []
    else:
        nn = model.kneighbors_graph(mode='distance')
        g = nx.from_scipy_sparse_matrix(nn)
        if calculate_important == 'pagerank':
            pageranks = nx.pagerank_scipy(g)
            top_important = sorted(pageranks, key=pageranks.get, reverse=True)
        elif calculate_important == 'hubs':
            hubs, authorities = nx.hits_scipy(g)
            top_important = sorted(hubs, key=hubs.get, reverse=True)

    return {
        'distances': distances.tolist(),
        'neighbors': list(map(lambda s: list(map(str, s)), indices)),
        'importantPoints': list(map(str, top_important[:ntop])),
        'infoMsg': 'Dataset size: {}'.format(X.shape)
    }
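
A minimal usage sketch for the helper above, with random illustrative data, assuming scikit-learn and an older NetworkX release that still ships pagerank_scipy and from_scipy_sparse_matrix:

import numpy as np

# Hypothetical input: 200 points in 8 dimensions.
X = np.random.RandomState(0).rand(200, 8)

# Ask for the 5 most "important" points according to PageRank on the kNN graph.
info = pre_calculate(X, k=10, ntop=5, calculate_important='pagerank')
print(info['importantPoints'])
print(info['infoMsg'])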
Example #21
    def candidate_weighting(self, window=10, pos=None, normalized=False):
        """Keyphrase candidate ranking using the weighted variant of the
        TextRank formulae. Candidates are scored by the sum of the scores of
        their words.

        Args:
            window (int): the window within the sentence for connecting two
                words in the graph, defaults to 10.
            pos (set): the set of valid pos for words to be considered as nodes
                in the graph, defaults to ('NOUN', 'PROPN', 'ADJ').
            normalized (False): normalize keyphrase score by their length,
                defaults to False.
        """

        if pos is None:
            pos = {'NOUN', 'PROPN', 'ADJ'}

        # build the word graph
        self.build_word_graph(window=window, pos=pos)

        # compute the word scores using random walk
        w = nx.pagerank_scipy(self.graph,
                              alpha=0.85,
                              tol=0.0001,
                              weight='weight')

        # loop through the candidates
        for k in self.candidates.keys():
            tokens = self.candidates[k].lexical_form
            self.weights[k] = sum([w[t] for t in tokens])
            if normalized:
                self.weights[k] /= len(tokens)

            # use position to break ties
            self.weights[k] += self.candidates[k].offsets[0] * 1e-8
Example #22
    def run_pagerank(self):

        self.extract_filenamedict()
        self.get_corr_matrix(self.corr_method)
        pid_filter = self.get_pid_filter()

        adj_mat = np.multiply(self.corr_mat, pid_filter)
        adj_mat_values = adj_mat.values

        st = time.time()
        g = nx.DiGraph()
        for i,ni in enumerate(adj_mat.index):
            #print(i)
            for j,nj in enumerate(adj_mat.columns):

                if adj_mat_values[i][j] !=0:

                    g.add_edge(ni,nj, weight = adj_mat_values[i][j])

        cost = time.time()- st
        print('Time Cost to build the graph:', cost)

        #run pagerank
        st = time.time()
        self.pr = nx.pagerank_scipy(g, alpha=self.alpha, personalization=self.filename2score, max_iter=300, tol=1.0e-12)
        #print (pr)
        print ('Time Cost to run pagerank:', time.time() - st)
Example #23
    def get_summary(self,
                    docs,
                    topK=5,
                    stopwords=None,
                    with_importance=False,
                    standard_name=True):
        import networkx as nx

        def sent_sim1(words1, words2):
            if len(words1) <= 1 or len(words2) <= 1:
                return 0.0
            return (len(set(words1) & set(words2))) / (np.log2(len(words1)) +
                                                       np.log2(len(words2)))

        # With standard_name, similarity can be computed from entity-linking results, which is more accurate
        sents = [
            self.seg(doc.strip(),
                     standard_name=standard_name,
                     stopwords=stopwords) for doc in docs
        ]
        sents = [sent for sent in sents if len(sent) > 0]
        G = nx.Graph()
        for u, v in combinations(range(len(sents)), 2):
            G.add_edge(u, v, weight=sent_sim1(sents[u], sents[v]))

        pr = nx.pagerank_scipy(G)
        pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True)
        if with_importance:
            return [(docs[i], imp) for i, imp in pr_sorted[:topK]]
        else:
            return [docs[i] for i, rank in pr_sorted[:topK]]
Example #24
def rooted_pagerank(G, root, alpha=0.85, beta=0, weight='weight'):
    """Return the rooted PageRank of all nodes with respect to node `root`

    Parameters
    ----------

    G : a networkx.(Di)Graph
        network to compute PR on

    root : a node from the network
        the node that will be the starting point of all random walks

    alpha : float
        PageRank probability that we will advance to a neighbour of the
        current node in a random walk

    beta : float or int
        Normally, we return to the root node with probability 1 - alpha.
        With this parameter, we can also advance to a random other node in the
        network with probability beta. Thus, we get back to the root node with
        probability 1 - alpha - beta. This is off (0) by default.

    weight : string or None
        The edge attribute that holds the numerical value used for
        the edge weight.  If None then treat as unweighted.

    """
    personalization = dict.fromkeys(G, beta)
    personalization[root] = 1 - beta

    return networkx.pagerank_scipy(G, alpha, personalization, weight=weight)
Example #25
    def candidate_weighting(self,
                            threshold=0.74,
                            method='average',
                            alpha=1.1):
        """ Candidate weight calculation using random walk.

            Args:
                threshold (float): the minimum similarity for clustering,
                    defaults to 0.74.
                method (str): the linkage method, defaults to average.
                alpha (float): hyper-parameter that controls the strength of the
                    weight adjustment, defaults to 1.1.
        """

        # cluster the candidates
        self.topic_clustering(threshold=threshold, method=method)

        # build the topic graph
        self.build_topic_graph()

        if alpha > 0.0:
            self.weight_adjustment(alpha)

        # compute the word scores using random walk
        self.weights = nx.pagerank_scipy(self.graph)
Example #26
    def findBestChilds(self,nodes,k = 4):
        n = len(nodes)
        node_list = dict()
        i = 0
        for node in nodes:
            node_list[i] = node
            i += 1
            
        self.stateGraph = np.zeros(shape=(n, n), dtype=np.byte)
        
        [self.buildSubGraph(i, n, node_list) for i in range(n)]

        try:
            self.logger.debug (len(self.stateGraph))
            h = (nx.pagerank_scipy(nx.Graph(self.stateGraph), max_iter=100, tol=1e-07))

            res = list(sorted(h, key=h.__getitem__, reverse=True))

            important = res[:k]          
        except:
            self.logger.error ('Graph is empty')
            self.logger.error (sys.exc_info())
        
        dereffed_list = set([self.sub(i, node_list) for i in important])
        if len(dereffed_list) > 1:
            dereffed_list.discard(0)
            dereffed_list.discard(1)
        return list(dereffed_list)
Example #27
def create_node_embeddings(graph, neighborhoods):
    """
    Creates node "embeddings" based on the degrees of neighboring vertices and approximate
    centrality measures
    """
    num_nodes = graph.number_of_nodes()
    embeddings = np.zeros(shape=(num_nodes, 2 * (len(neighborhoods) - 1) + 2), dtype=float)

    eigen = nx.eigenvector_centrality_numpy(graph)
    pagerank = nx.pagerank_scipy(graph, alpha=0.85)

    out_neighbors = []
    in_neighbors = []
    for i in range(1, len(neighborhoods)):
        out_neighbors.append(neighborhoods[i].sum(axis=1, dtype=float))
        in_neighbors.append(neighborhoods[i].sum(axis=0, dtype=float))

    for i, node in enumerate(graph.nodes()):
        for j in range(len(out_neighbors)):
            embeddings[i][2*j] = out_neighbors[j][i, 0] / graph.number_of_nodes()
            embeddings[i][2*j+1] = in_neighbors[j][0, i] / graph.number_of_nodes()

        embeddings[i][-2] = eigen[node]
        embeddings[i][-1] = pagerank[node]

    return np.array(embeddings)
Example #28
    def embed(self, network):
        """ Create an embedding of the network

        Args:
            network (scipy sparse matrix): Sparse network adjacency matrix.

        Returns:
            scipy sparse matrix: Symbolic node embedding.
        """
        logging.info("Generating and hashing random walks")
        hashes = self.generate_walk_hashes(network)

        # Rank nodes
        pagerank_scores = nx.pagerank_scipy(
            nx.from_scipy_sparse_matrix(network))
        ranked_features = np.argsort(
            [pagerank_scores[i] for i in range(len(pagerank_scores))])[::-1]

        logging.info("Generating similarity matrix")
        if self.fixed_dimension:
            embedding = self.generate_similarity_fixed(
                hashes,
                ranked_features[:min(self.dimension, network.shape[0])])
        else:
            embedding = self.generate_similarity_matrix(
                hashes, ranked_features).tocsr()

        # Check if embedding size is less than tau
        assert (not sparse.issparse(embedding)) or len(
            embedding.data) <= (self.dimension * network.shape[0])

        logging.info("Embedding done")
        return embedding
Example #29
def documentPagerankPrediction(G, dataset, synsets_dictionary):

    predicted = []

    for d in dataset:
        pre = []

        near = set()
        to_add = {}

        for l in d:
            near.update(synsets_dictionary[l].keys())
            to_add.update({l: synsets_dictionary[l]})

        TG = extendGraph(G, to_add, document_graph=False)
        pr = nx.pagerank_scipy(TG, personalization={n: 1 for n in near})

        for l in d:
            max_prob = 0
            best_syn = 0
            for synsets in synsets_dictionary[l].keys():
                rank = pr[synsets]
                if rank > max_prob:
                    max_prob = rank
                    best_syn = synsets

            if best_syn == 0:
                best_syn = np.random.choice(list(near))

            assert (best_syn != 0)
            pre.append(best_syn)

        predicted.append(pre)

    return predicted
Example #30
def test_pagerank_by_hand():
    graph = Graph('gr', 'gr.xml', 4)
    graph.add_node(Node(0, LabelNodeLetter(0, 0)))
    graph.add_node(Node(1, LabelNodeLetter(0, 0)))
    graph.add_node(Node(2, LabelNodeLetter(0, 0)))
    graph.add_node(Node(3, LabelNodeLetter(0, 0)))

    graph.add_edge(Edge(0, 1, LabelEdge(0)))
    graph.add_edge(Edge(1, 2, LabelEdge(0)))
    graph.add_edge(Edge(2, 3, LabelEdge(0)))

    pagerank = PageRank()
    results = pagerank.calc_centrality_score(graph)
    results = np.asarray(results)

    graph2 = nx.Graph()
    graph2.add_node(1)
    graph2.add_node(2)
    graph2.add_node(3)
    graph2.add_node(4)
    graph2.add_edge(1, 2)
    graph2.add_edge(2, 3)
    graph2.add_edge(3, 4)

    expected = np.array([val for _, val in nx.pagerank_scipy(graph2).items()])
    print(results)

    assert np.linalg.norm(results - expected) < 1e-6
Example #31
    def get_summary(self, docs, topK=5, stopwords=None, with_importance=False, standard_name=True,
                    maxlen=None, avoid_repeat=False):
        """
        Extract the key sentences of a text with the TextRank algorithm.
        :param docs: list of str sentences
        :param topK: number of sentences to select; if maxlen is set, the length limit takes priority
        :param stopwords: stopwords used inside the algorithm
        :param with_importance: whether to also return the sentence importance computed by the algorithm
        :param standard_name: if an entity_mention_list is available, normalize entity names inside the algorithm, which usually improves results
        :param maxlen: maximum total length (in characters) of the summary; selection stops once the limit is reached, even before topK sentences are chosen
        :param avoid_repeat: use the MMR principle to penalize sentences that overlap with the summary extracted so far, avoiding repetition
        :return: list of sentences, or a list of (sentence, score) pairs when with_importance=True
        """
        assert topK > 0
        import networkx as nx
        maxlen = float('inf') if maxlen is None else maxlen
        # With standard_name, similarity can be computed from entity-linking results, which is more accurate
        sents = [self.seg(doc.strip(), standard_name=standard_name, stopwords=stopwords) for doc in docs]
        sents = [sent for sent in sents if len(sent) > 0]
        G = nx.Graph()
        for u, v in combinations(range(len(sents)), 2):
            G.add_edge(u, v, weight=sent_sim_textrank(sents[u], sents[v]))

        pr = nx.pagerank_scipy(G)
        pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True)
        if not avoid_repeat:
            ret = []
            curr_len = 0
            for i, imp in pr_sorted[:topK]:
                curr_len += len(docs[i])
                if curr_len > maxlen: break
                ret.append((docs[i], imp) if with_importance else docs[i])
            return ret
        else:
            assert topK <= len(sents)
            ret = []
            curr_len = 0
            curr_sumy_words = []
            candidate_ids = list(range(len(sents)))
            i, imp = pr_sorted[0]
            curr_len += len(docs[i])
            if curr_len > maxlen:
                return ret
            ret.append((docs[i], imp) if with_importance else docs[i])
            curr_sumy_words.extend(sents[i])
            candidate_ids.remove(i)
            for iter in range(topK-1):
                importance = [pr[i] for i in candidate_ids]
                norm_importance = scipy.special.softmax(importance)
                redundancy = np.array([sent_sim_cos(curr_sumy_words, sents[i]) for i in candidate_ids])
                scores = 0.6*norm_importance - 0.4*redundancy
                id_in_cands = np.argmax(scores)
                i, imp = candidate_ids[id_in_cands], importance[id_in_cands]
                curr_len += len(docs[i])
                if curr_len > maxlen:
                    return ret
                ret.append((docs[i], imp) if with_importance else docs[i])
                curr_sumy_words.extend(sents[i])
                del candidate_ids[id_in_cands]
            return ret
Example #32
def text_summary(doc, sent_count):
    """
    Summarizes given text using word vectors and graph-based ranking.

    Args:
        doc: a spacy.Doc object
        sent_count: number (/ratio) of sentences in the summary
    Returns:
        Text summary
    """
    sents = list(doc.sents)
    sent_graph = networkx.Graph()
    sent_graph.add_nodes_from(idx for idx, sent in enumerate(sents))

    for i, j in it.combinations(sent_graph.nodes_iter(), 2):
        # Calculate cosine similarity of two sentences transformed to the interval [0,1]
        similarity = (sents[i].similarity(sents[j]) + 1) / 2
        if similarity != 0:
            sent_graph.add_edge(i, j, weight=similarity)

    sent_ranks = networkx.pagerank_scipy(sent_graph)

    if 0 < sent_count < 1:
        sent_count = round(sent_count * len(sent_ranks))
    sent_count = int(sent_count)

    top_indices = top_keys(sent_count, sent_ranks)

    # Return the key sentences in chronological order
    top_sents = map(lambda i: sents[i], sorted(top_indices))

    return format_output(doc, list(top_sents))
Example #33
def rooted_pagerank(G, root, alpha=0.85, beta=0, weight='weight'):
    """Return the rooted PageRank of all nodes with respect to node `root`

    Parameters
    ----------

    G : a networkx.(Di)Graph
        network to compute PR on

    root : a node from the network
        the node that will be the starting point of all random walks

    alpha : float
        PageRank probability that we will advance to a neighbour of the
        current node in a random walk

    beta : float or int
        Normally, we return to the root node with probability 1 - alpha.
        With this parameter, we can also advance to a random other node in the
        network with probability beta. Thus, we get back to the root node with
        probability 1 - alpha - beta. This is off (0) by default.

    weight : string or None
        The edge attribute that holds the numerical value used for
        the edge weight.  If None then treat as unweighted.

    """
    personalization = dict.fromkeys(G, beta)
    personalization[root] = 1 - beta

    return networkx.pagerank_scipy(G, alpha, personalization, weight=weight)
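
A minimal usage sketch for rooted_pagerank, on a hypothetical toy graph, assuming an older NetworkX release that still provides pagerank_scipy:

import networkx

# Hypothetical weighted digraph; random walks restart at node 'a'.
G = networkx.DiGraph()
G.add_weighted_edges_from([('a', 'b', 1.0), ('b', 'c', 1.0), ('c', 'a', 2.0)])

scores = rooted_pagerank(G, root='a', alpha=0.85, beta=0.05)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))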
Example #34
    def create_features(self, G_train, edge_bunch):
        i = 0
        X = []
        page_rank = nx.pagerank_scipy(G_train)
        for pair in edge_bunch:
            commmon_neighbors = len(
                list(nx.common_neighbors(G_train, pair[0], pair[1])))
            jaccard_coefficient = nx.jaccard_coefficient(G_train,
                                                         [pair]).next()[2]
            adamic_adar = nx.adamic_adar_index(G_train, [pair]).next()[2]
            degree_0 = nx.degree(G_train, pair[0])
            degree_1 = nx.degree(G_train, pair[1])
            prod = degree_0 * degree_1
            page_rank_0 = page_rank[pair[0]]
            page_rank_1 = page_rank[pair[1]]

            f = [
                degree_0,
                degree_1,
                prod,
                commmon_neighbors,
                jaccard_coefficient,
                adamic_adar,
                page_rank_0,
                page_rank_1,
            ]

            X.append(f)

            i += 1
            if i % 1000000 == 0:
                print(i)

        return np.array(X)
Example #35
    def random_walk_word_scoring(self):
        """Compute a random walk ranking on the words using the power method.

        """
        G = nx.Graph()

        # loop through the sentences to build the graph
        for i, sentence in enumerate(self.sentences):
            nodes = set([])
            for words, offset in sentence.candidates:
                for w in words:
                    nodes.add(w)

            # add the missing nodes to the graph
            for node in nodes:
                if not node in G:
                    G.add_node(node)
            
            # add the edges to the graph
            for n1, n2 in combinations(nodes, 2):
                if not G.has_edge(n1, n2):
                    G.add_edge(n1, n2, weight=0)
                G[n1][n2]['weight'] += 1.0

        # return the random walk scores
        return self.normalize(nx.pagerank_scipy(G))
Example #36
def calculate_weighted_page_rank(
    graph: Union[nx.MultiDiGraph, nx.MultiGraph, nx.Graph, nx.DiGraph],
    weight: str = "weight",
) -> OrderedDict:
    """
    Calculate Page Rank for ARRG.

    Parameters
    ----------
    graph : networkx.DiGraph
        Graph of aspect-aspect relation ARRG.

    weight : str, optional
        Name of edge attribute that consists of weight for an edge. it is
        used to calculate Weighted version of Page Rank.

    Returns
    -------
    page_ranks : OrderedDict
        Page Rank values for ARRG.

    """
    logger.info("Weighted Page Rank calculation starts.")
    page_ranks = nx.pagerank_scipy(graph, weight=weight)
    logger.info("Weighted Page Rank calculation ended.")
    return OrderedDict(
        sorted(page_ranks.items(), key=itemgetter(1), reverse=True))
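
A minimal usage sketch, with a hypothetical aspect graph standing in for a real ARRG, assuming a NetworkX version that still provides pagerank_scipy:

import networkx as nx

# Hypothetical aspect-aspect graph; heavier edges pull more rank towards 'battery'.
arrg = nx.DiGraph()
arrg.add_edge('phone', 'battery', weight=3.0)
arrg.add_edge('phone', 'screen', weight=1.0)
arrg.add_edge('screen', 'battery', weight=2.0)

page_ranks = calculate_weighted_page_rank(arrg, weight='weight')
print(list(page_ranks.items()))  # highest-ranked aspect first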
Example #37
 def test_empty_scipy(self):
     try:
         import scipy
     except ImportError:
         raise SkipTest('scipy not available.')
     G = networkx.Graph()
     assert_equal(networkx.pagerank_scipy(G), {})
Example #38
 def test_empty_scipy(self):
     try:
         import scipy
     except ImportError:
         raise SkipTest("scipy not available.")
     G = networkx.Graph()
     assert_equal(networkx.pagerank_scipy(G), {})
Example #39
    def candidate_weighting(self, doc: Doc) -> List[Tuple[Candidate, float]]:
        """Compute the weighted score of each keyword candidate.

        Args:
            doc (Doc): doc.

        Returns:
            list of tuples, candidate with a score.
        """
        res = []
        G = self.build_graph(doc)
        W = nx.pagerank_scipy(G, alpha=self.cfg["alpha"], tol=self.cfg["tol"])
        for candidate in doc._.kw_candidates:
            chunk_len = len(candidate.lexical_form)
            non_lemma = 0
            rank = 0.0
            for t in candidate.lexical_form:
                if t in W:
                    rank += W[t]
                else:
                    non_lemma += 1
            non_lemma_discount = chunk_len / (chunk_len + (2.0 * non_lemma) + 1.0)
            candidate_w = np.sqrt(rank / (chunk_len + non_lemma)) * non_lemma_discount
            candidate_w += candidate.offsets[0] * 1e-8  # break ties according to position in text
            res.append((candidate, candidate_w))
        res.sort(key=lambda x: x[1], reverse=True)
        return res
Example #40
def generate_global_zscore(full_graph: pd.DataFrame, edgelist: pd.DataFrame, path: str, flag=False):
    """
    Generate a dictionary with the z-scores of all movies. If flag is True, the file is generated;
    otherwise it is only read.
    :param full_graph: full graph of the movie dataset
    :param edgelist: base edge list ('origin', 'destination') that is extended with the movie-object edges
    :param path: path to save the generated DataFrame
    :param flag: True to generate file of DataFrame with global zscores, False to read it
    :return: dictionary with prop and obj keys and count and zscores as columns
    """
    if flag:
        full_slice = full_graph[['prop', 'obj']]
        full_split_dfs = pd.DataFrame()

        copy = full_graph.copy()
        copy['origin'] = ['M' + x for x in copy.index.astype(str)]
        copy['destination'] = copy['obj']
        full_edgelist = pd.concat([edgelist, copy[['origin', 'destination']]])

        # create graph
        G = nx.from_pandas_edgelist(full_edgelist, 'origin', 'destination')
        pr_np = nx.pagerank_scipy(G, max_iter=1000)

        for prop in full_slice['prop'].unique():
            df_prop = full_slice[full_slice['prop'] == prop]
            df_gzscore = df_prop.copy()
            df_gzscore['count'] = df_prop.groupby(by='obj')['obj'].transform('count')
            df_gzscore['global_zscore'] = (df_gzscore['count'] - df_gzscore['count'].mean()) / df_gzscore['count'].std()
            full_split_dfs = pd.concat([full_split_dfs, df_gzscore])

        full_split_dfs['pr'] = full_split_dfs.apply(lambda x: pr_np[x['obj']], axis=1)
        full_split_dfs['pr_zscore'] = (full_split_dfs['pr'] - full_split_dfs['pr'].mean()) / full_split_dfs['pr'].std()

        full_split_dfs.to_csv(path, mode='w', header=True, index=False)

    return pd.read_csv(path, usecols=['prop', 'obj', 'count', 'global_zscore', 'pr', 'pr_zscore']).set_index(['prop', 'obj']).to_dict()
Example #41
	def graph_stats(self, n):
		stats = {}
		stats['Top'] = self.top_nodes(n+1)
		stats['Pagerank'] = nx.pagerank_scipy(self.G)
		stats['Pagerank'] = sorted(stats['Pagerank'].iteritems(), key=itemgetter(1),reverse=True)[0:n+1]
		stats['Articulation Points'] = list(nx.articulation_points(self.G.to_undirected()))
		stats['Histogram'] = self.degree_histogram()[1:26]
		return stats
Example #42
def networkx_algo():
    import networkx as nx
    beta = GlobalPara.beta
    edges = LoadEdges()
    G = nx.DiGraph(edges)
    # print(G.edges())
    pagerank_dict = nx.pagerank_scipy(G, alpha=beta)
    print(pagerank_dict[99])
Example #43
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9):
    """
    compute centrality score of sentences.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and similarity is greater than or
        equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank

    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )
    
    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    """
    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric="cosine")

    if continuous:
        linked_rows, linked_cols = numpy.where(sim_mat > 0)
    else:
        linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        weight = sim_mat[i, j] if continuous else 1.0
        graph.add_edge(i, j, {"weight": weight})

    scores = networkx.pagerank_scipy(graph, alpha=alpha, max_iter=1000)
    return scores, sim_mat
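
A minimal usage sketch for lexrank above, assuming its dependencies are available (scikit-learn, the Japanese word segmenter in tools, and a NetworkX 1.x release, since the snippet uses both pagerank_scipy and the positional attribute-dict form of add_edge):

sentences = [u'こんにちは.', u'私の名前は飯沼です.', u'私はプログラマーです.']
scores, sim_mat = lexrank(sentences, continuous=True, alpha=0.9)

# Sentence indices ordered from most to least central.
print(sorted(scores, key=scores.get, reverse=True))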
Example #44
 def test_scipy_pagerank(self):
     G=self.G
     try:
         p=networkx.pagerank_scipy(G,alpha=0.9,
                                                        tol=1.e-08)
         for (a,b) in zip(p,self.G.pagerank):
             assert_almost_equal(a,b)
     except ImportError:
         print "Skipping pagerank_scipy test"
Example #45
 def test_scipy_pagerank(self):
     G=self.G
     try:
         import scipy
     except ImportError:
         raise SkipTest('scipy not available.')
     p=networkx.pagerank_scipy(G,alpha=0.9,tol=1.e-08)
     for n in G:
         assert_almost_equal(p[n],G.pagerank[n],places=4)    
Example #46
 def test_scipy_pagerank(self):
     G=self.G
     try:
         import scipy
     except ImportError:
         raise SkipTest('scipy not available.')
     p=networkx.pagerank_scipy(G,alpha=0.9,tol=1.e-08)
     for (a,b) in zip(p,self.G.pagerank):
         assert_almost_equal(a,b)
Example #47
    def compute(self, own_public_key):
        """
        Compute the reputation based on the data in the TrustChain database using the Temporal PageRank algorithm.
        """

        nodes = set()
        G = nx.DiGraph()

        for block in self.blocks:
            if block.link_sequence_number == UNKNOWN_SEQ or block.type != 'tx_done' \
                    or 'tx' not in block.transaction:
                continue  # Don't consider half interactions

            pubkey_requester = block.link_public_key
            pubkey_responder = block.public_key

            sequence_number_requester = block.link_sequence_number
            sequence_number_responder = block.sequence_number

            # In our market, we consider the amount of Bitcoin that have been transferred from A -> B.
            # For now, we assume that the value from B -> A is of equal worth.

            value_exchange = block.transaction["tx"]["transferred"]["first"]["amount"]

            G.add_edge((pubkey_requester, sequence_number_requester), (pubkey_requester, sequence_number_requester + 1),
                       contribution=value_exchange)
            G.add_edge((pubkey_requester, sequence_number_requester), (pubkey_responder, sequence_number_responder + 1),
                       contribution=value_exchange)

            G.add_edge((pubkey_responder, sequence_number_responder), (pubkey_responder, sequence_number_responder + 1),
                       contribution=value_exchange)
            G.add_edge((pubkey_responder, sequence_number_responder), (pubkey_requester, sequence_number_requester + 1),
                       contribution=value_exchange)

            nodes.add(pubkey_requester)
            nodes.add(pubkey_responder)

        personal_nodes = [node1 for node1 in G.nodes() if node1[0] == own_public_key]
        number_of_nodes = len(personal_nodes)
        if number_of_nodes == 0:
            return {}
        personalisation = {node_name: 1.0 / number_of_nodes if node_name in personal_nodes else 0
                           for node_name in G.nodes()}

        try:
            result = nx.pagerank_scipy(G, personalization=personalisation, weight='contribution')
        except nx.NetworkXException:
            self._logger.info("Empty Temporal PageRank, returning empty scores")
            return {}

        sums = {}

        for interaction in result.keys():
            sums[interaction[0]] = sums.get(interaction[0], 0) + result[interaction]

        return sums
Example #48
 def test_empty(self):
     try:
         import numpy
     except ImportError:
         raise SkipTest('numpy not available.')
     G=networkx.Graph()
     assert_equal(networkx.pagerank(G),{})
     assert_equal(networkx.pagerank_numpy(G),{})
     assert_equal(networkx.pagerank_scipy(G),{})
     assert_equal(networkx.google_matrix(G).shape,(0,0))
Example #49
    def OrigPagerank(self):
        """ returns a 2d array containing the pagerank of the origin node for all edges

        probas = np.dot(
            np.array(nx.pagerank_scipy(self).values(), dtype=float).reshape(-1, 1),
            np.ones((1, self.number_of_nodes())))
        """
        try:
            return self.Orig(nx.pagerank_scipy(self))
        except:
            return self.Orig(np.ones(self.number_of_nodes(), dtype=float) / self.number_of_nodes())
Example #50
    def TargPagerank(self):
        """ returns a 2d array containing the pagerank of the target node for all edges

        probas = np.dot(
            np.ones((self.number_of_nodes(), 1)),
            np.array(nx.pagerank_scipy(self).values(), dtype=float).reshape(1, -1)
        )
        """
        try:
            return self.Targ(nx.pagerank_scipy(self))
        except:
            return self.Targ(np.ones(self.number_of_nodes(), dtype=float) / self.number_of_nodes())
Example #51
    def save_data(self, filename):
        """Output authors data to a CSV file."""
        logger = logging.getLogger("twitter.compute")
        with timed(logger.info, "computing pageranks"):
            npr = nx.pagerank_scipy(self.g_authors)
            wpr = nx.pagerank_scipy(self.g_weighted_authors)
            rt_npr = nx.pagerank_scipy(self.g_rt_authors)
            rt_wpr = nx.pagerank_scipy(self.g_rt_weighted_authors)

        logger = logging.getLogger("twitter.save")
        conn = self.engine.connect()
        with timed(logger.info, "saving authors to %r", filename):
            with open(filename, "wb") as f:
                writer = csv.writer(f)
                writer.writerow([
                    "screen_name", "followers", "listed", "friends",
                    "total_tweets", "tweets",
                    "rtrank", "rtpercentile",
                    "in_degree", "out_degree",
                    "rt_in_degree", "rt_out_degree",
                    "unweighted_pr", "weighted_pr",
                    "rt_unweighted_pr", "rt_weighted_pr",
                ])
                def write_row(row):
                    writer.writerow([str(x) if x is not None else "" for x in row])

                format_pr = lambda pr: "%f" % pr
                for author_id in sorted(self.g_authors,
                                        key=lambda x: wpr[x], reverse=True):
                    a = self.authors[author_id]
                    write_row([
                        author_id, a.followers, a.listed, a.friends,
                        a.statuses, a.ntweets, a.rtrank, a.rtpercentile,
                        a.ninrefs, a.noutrefs, a.ninrts, a.noutrts,
                        format_pr(npr[author_id]), format_pr(wpr[author_id]),
                        format_pr(rt_npr[author_id]), format_pr(rt_wpr[author_id]),
                    ])
        conn.close()
Example #52
	def personalizedPageRank(self,rootID):
		personalize = dict((n, 0) for n in self.graph)
		personalize[rootID] =1 
		x = nx.pagerank_scipy(self.graph, alpha=0.15, tol=1.e-05, personalization=personalize)
		sorted_x = sorted(x.items(), key=operator.itemgetter(1),reverse=True)
		count = 0
		result = ''
		for key in sorted_x:
			if not self.graph.has_edge(rootID,key[0]) and rootID != key[0]:
				count += 1
				result += rootID+','+key[0]+'\n'
			if count == 5:
				break
		return result
Example #53
def top_k_with_score(k, g, p = None, alpha = 0.85):
    '''
    k: the top-k
    g: networkx instance
    p: personalization dictionary
    '''
    pr = nx.pagerank_scipy(g, alpha = alpha, personalization = p)
    sorted_pr = sorted(pr.items(), key=itemgetter(1), reverse=True)

    if k:
        top_nodes = sorted_pr[:k]
    else:
        top_nodes = sorted_pr

    return top_nodes
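
A minimal usage sketch for top_k_with_score, with a hypothetical graph and a personalization vector biased towards one node, assuming a NetworkX release with pagerank_scipy and that itemgetter is imported in the snippet's module:

import networkx as nx

g = nx.DiGraph([('home', 'docs'), ('docs', 'api'), ('api', 'docs'), ('home', 'blog')])
personalization = {n: (1.0 if n == 'docs' else 0.0) for n in g}

# Three highest-scoring nodes with their PageRank scores.
print(top_k_with_score(3, g, p=personalization, alpha=0.85))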
Example #54
def top_k(k, g, p = None):
    '''
    k: the top-k
    g: networkx instance
    p: personalization dictionary
    '''
    pr = nx.pagerank_scipy(g, personalization = p)
    sorted_pr = sorted(pr.items(), key=itemgetter(1), reverse=True)
    sorted_nodes = map(itemgetter(0), sorted_pr)

    if k:
        top_nodes = sorted_nodes[:k]
    else:
        top_nodes = sorted_nodes

    return top_nodes
Example #55
    def candidate_weighting(self, threshold=0.74, method='average',
                            heuristic=None):
        """ Candidate weight calculation using random walk.

            Args:
                threshold (float): the minimum similarity for clustering,
                    defaults to 0.74.
                method (str): the linkage method, defaults to average.
                heuristic (str): the heuristic for selecting the best candidate
                    for each topic, defaults to first occurring candidate. Other
                    options are 'frequent' (most frequent candidate, position
                    is used for ties).
        """

        # cluster the candidates
        self.topic_clustering(threshold=threshold, method=method)

        # build the topic graph
        self.build_topic_graph()

        # compute the word scores using random walk
        w = nx.pagerank_scipy(self.graph)

        # loop through the topics
        for i, topic in enumerate(self.topics):

            # get the offsets of the topic candidates
            offsets = [self.candidates[t].offsets[0] for t in topic]

            # get first candidate from topic
            if heuristic == 'frequent':

                # get frequencies for each candidate within the topic
                freq = [len(self.candidates[t].surface_forms) for t in topic]

                # get the indexes of the most frequent candidates
                indexes = [j for j, f in enumerate(freq) if f == max(freq)]

                # offsets of the indexes
                indexes_offsets = [offsets[j] for j in indexes]
                most_frequent = offsets.index(min(indexes_offsets))
                self.weights[topic[most_frequent]] = w[i]

            else:
                first = offsets.index(min(offsets))
                self.weights[topic[first]] = w[i]
Example #56
    def pagerank(self):
        """Compute pagerank centrality for words coded by Free Association.

        Returns
        -------
        pagerank : dict
            The association of each word to its pagerank. FA link weights are
            taken into account in the computation. Words with pagerank zero are
            removed from the dict.

        """

        # Assumes a directed weighted graph.
        logger.info('Computing FreeAssociation pagerank')
        pagerank = nx.pagerank_scipy(self._norms_graph, max_iter=10000,
                                     tol=1e-15, weight='weight')
        self._remove_zeros(pagerank)
        logger.info('Done computing FreeAssociation pagerank')
        return pagerank
Example #57
    def report(self):
        """Generate a LaTeX report, return as `str`."""

        metrics = pandas.DataFrame({
            'indegree': self.network.in_degree(),
            'pagerank': networkx.pagerank_scipy(self.network, max_iter=200),
        })
        mean = metrics.mean()
        std = metrics.std()

        for field, series in self.metadata.items():
            successes = 0
            for node in random.sample(series.index.tolist(), 100):
                value = self.classify(node, field)
                try:
                    if value == series.loc[node] or value in series.loc[node]:
                        successes += 1
                except Exception:
                    pass
            print(field, successes)
Example #58
def analyze_pagerank(graph, show_table=False, show_plot=False):
    """Run analysis on pagerank."""
    if not (show_table or show_plot):
        return # expensive computation, skip if unneccessary
    indegrees = pandas.Series(graph.in_degree(), name='indegree')
    pagerank = pandas.Series(networkx.pagerank_scipy(graph, max_iter=200),
            name='pagescore')
    table = (pandas.DataFrame({'indegree': graph.in_degree()})
                   .sort(columns='indegree', ascending=False))
    table['indegree_rank'] = pandas.Series(range(1, len(table)+1),
                                           index=table.index)
    table = table.join(pagerank).sort(columns='pagescore',
            ascending=False)
    table['page_rank'] = pandas.Series(range(1, len(table)+1),
                                       index=table.index)
    slope, intercept, r_val, p_val, stderr = scipy.stats.linregress(
            table['pagescore'], table['indegree'])
    if show_table:
        print('pagescore and indegree have r == {}'.format(r_val))
        print(table.head(10))
Example #59
    def _gen_sim_scores(self, term_matrix, LR_method, pos_seed_vector, neg_seed_vector, pos_weight, neg_weight):
        if LR_method == 'unbiased':
            #Switch from distance to similarity measures here
            weights = -1*(scipy.spatial.distance.pdist(term_matrix.toarray(), 'cosine')-1)
            #check weights here and threshold them
            weights[weights < .2] = 0
            weights[numpy.isnan(weights)] = 0

            graph = networkx.from_numpy_matrix(scipy.spatial.distance.squareform(weights))
            scores = networkx.pagerank_scipy(graph, max_iter=5000, alpha = .85)


        elif LR_method == 'biased':
            weights = -1*(scipy.spatial.distance.pdist(term_matrix.toarray(), 'cosine')-1)
            #check weights here and threshold them
            weights[weights < .2] = 0
            nan2zero(weights)

            graph = networkx.from_numpy_matrix(scipy.spatial.distance.squareform(weights))


            #check if seed is empty and return something with correct format
            if str(pos_seed_vector.nonzero()) == '(array([], dtype=int32), array([], dtype=int32))':
                pos_seed_scores = scipy.zeros_like(neg_seed_vector)
            else:
                pos_seed_scores = baseline_scorer(term_matrix, pos_seed_vector)


            if str(neg_seed_vector.nonzero()) == '(array([], dtype=int32), array([], dtype=int32))':
                neg_seed_scores = scipy.zeros_like(pos_seed_scores)
            else:
                neg_seed_scores = baseline_scorer(term_matrix, neg_seed_vector)

            #add a ballast to act against neg seed scores
            ballast = scipy.zeros_like(neg_seed_scores)
            ballast[neg_seed_scores == 0] = neg_weight

            seed_scores = pos_seed_scores*pos_weight + neg_seed_scores*neg_weight +ballast
            scores = biased_lexrank.b_lexrank(graph, seed_scores, personalization = 'biased', alpha=.85, max_iter = 5000, seed_weight = pos_weight)

        return scores