def author_centrality(titles_to_authors):
    author_graph = digraph()
    author_graph.add_nodes(map(lambda x: u"title_%s" % x, titles_to_authors.keys()))
    author_graph.add_nodes(list(set(
        [u'author_%s' % author[u'user']
         for authors in titles_to_authors.values()
         for author in authors])))

    for title in titles_to_authors:
        log.debug(u"Working on title: %s" % title)
        for author in titles_to_authors[title]:
            try:
                author_graph.add_edge(
                    (u'title_%s' % title, u'author_%s' % author[u'user']))
            except AdditionError:
                pass

    centralities = dict([
        ('_'.join(item[0].split('_')[1:]), item[1])
        for item in pagerank(author_graph).items()
        if item[0].startswith(u'author_')])

    centrality_scaler = MinMaxScaler(centralities.values())

    return dict([(cent_author, centrality_scaler.scale(cent_val))
                 for cent_author, cent_val in centralities.items()])
def test_pagerank(self):
    # Test example from Wikipedia: http://en.wikipedia.org/wiki/File:Linkstruct3.svg
    G = digraph()
    G.add_nodes([1, 2, 3, 4, 5, 6, 7])

    G.add_edge((1, 2))
    G.add_edge((1, 3))
    G.add_edge((1, 4))
    G.add_edge((1, 5))
    G.add_edge((1, 7))
    G.add_edge((2, 1))
    G.add_edge((3, 1))
    G.add_edge((3, 2))
    G.add_edge((4, 2))
    G.add_edge((4, 3))
    G.add_edge((4, 5))
    G.add_edge((5, 1))
    G.add_edge((5, 3))
    G.add_edge((5, 4))
    G.add_edge((5, 6))
    G.add_edge((6, 1))
    G.add_edge((6, 5))
    G.add_edge((7, 5))

    expected_pagerank = {
        1: 0.280,
        2: 0.159,
        3: 0.139,
        4: 0.108,
        5: 0.184,
        6: 0.061,
        7: 0.069,
    }

    pr = pagerank(G)
    for k in pr:
        self.assertAlmostEqual(pr[k], expected_pagerank[k], places=3)
def author_centrality(titles_to_authors): """ Identifies the centrality of an author :param titles_to_authors: a dict keying title strings to the authors associated :type titles_to_authors: dict :return: a dict matching author to centrality :rtype: dict """ author_graph = digraph() author_graph.add_nodes(map(lambda x: u"title_%s" % x, titles_to_authors.keys())) author_graph.add_nodes(list(set([u'author_%s' % author[u'user'] for authors in titles_to_authors.values() for author in authors]))) for title in titles_to_authors: for author in titles_to_authors[title]: try: author_graph.add_edge((u'title_%s' % title, u'author_%s' % author[u'user'])) except AdditionError: pass centralities = dict([('_'.join(item[0].split('_')[1:]), item[1]) for item in pagerank(author_graph).items() if item[0].startswith(u'author_')]) centrality_scaler = MinMaxScaler(centralities.values()) return dict([(cent_author, centrality_scaler.scale(cent_val)) for cent_author, cent_val in centralities.items()])
def compute_textrank(phrase_list, window, lemma, ratio=2):
    """(list, int, int, int) -> dict

    Return a dict of keywords with TextRank scores.

    Arguments:
    - phrase_list: list of candidate phrase lists
    - window: size of the co-occurrence window
    - lemma: 1 if lemmas are used instead of wordforms
    - ratio: ratio of all keywords included in the top-N list
    """
    # Flatten the input phrase list to get back the text for postprocessing.
    # This is not optimal, but keeps the co-occurrence windows simple.
    text = list(itertools.chain.from_iterable(phrase_list))
    numwords = len(text)

    # Set up the graph.
    gr = digraph()
    gr.add_nodes(unique(text))
    if debug:
        write_out(("TEXT:", text))

    # Add edges for words within the window; the window shrinks near the
    # end of the list, so clamp the slice bounds.
    i = 0
    while i < numwords - 1:
        source = text[i]
        firstindex = i + 1
        lastindex = i + window
        if lastindex > numwords:
            lastindex = numwords
        if firstindex > numwords - 1:
            break
        for w in text[firstindex:lastindex]:
            if debug:
                write_out(("EDGE BTW:", source, "and", w))
            try:
                gr.add_edge((source, w))
            except AdditionError:
                sys.stderr.write('Already added: {0}\t{1}\n'.format(source, w))
        i += 1

    # Calculate PageRank and sort nodes by score, highest first.
    prdict = pagerank(gr)
    prlist = [(key, prdict[key])
              for key in sorted(prdict, key=prdict.get, reverse=True)]

    # Keep the top numwords // ratio elements.
    if debug:
        write_out(("TR FULL LIST:", prlist))
    prlist = prlist[:numwords // ratio]
    if debug:
        write_out(("TR SHORT LIST:", prlist))

    # Make a dict from the list to facilitate postprocessing.
    prdict = dict(prlist)

    # Postprocess the initial result.
    textranked = post_textrank(prdict, phrase_list)
    return textranked
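# A hypothetical call to compute_textrank() with toy phrases, assuming the
# module helpers it relies on (unique, write_out, post_textrank and the debug
# flag) are available. Each inner list is one candidate phrase; window=2 links
# every word to its immediate successor, lemma=0 keeps plain wordforms.
phrases = [
    ['graph', 'ranking'],
    ['random', 'walk'],
    ['graph', 'random', 'walk', 'ranking'],
]

keywords = compute_textrank(phrases, window=2, lemma=0, ratio=2)
print(keywords)  # dict of keywords/phrases with their TextRank scores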
def test_pagerank_cycle(self):
    # Test that all nodes in a cycle graph have the same value.
    G = digraph()
    G.add_nodes([1, 2, 3, 4, 5])

    G.add_edge((1, 2))
    G.add_edge((2, 3))
    G.add_edge((3, 4))
    G.add_edge((4, 5))
    G.add_edge((5, 1))

    self.assertEqual(pagerank(G), {1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, 5: 0.2})
def test_pagerank_random(self):
    G = testlib.new_digraph()
    md = 0.00001
    df = 0.85
    pr = pagerank(G, damping_factor=df, min_delta=md)

    min_value = (1.0 - df) / len(G)
    for node in G:
        expected = min_value
        for each in G.incidents(node):
            expected += (df * pr[each] / len(G.neighbors(each)))
        assert abs(pr[node] - expected) < md
def by_pagerank(graph):
    """
    Probabilistic scheduler based on the PageRank of nodes in the graph.
    Just for fun^^
    """
    ranks = pagerank(graph)

    rvar = RandomVariable(list(ranks.keys()))
    scope = [rvar]
    beliefs = scipy.array(list(ranks.values()))
    factor = Factor(scope, beliefs)

    while True:
        node = factor.sample()[0]
        yield node
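# A hypothetical driver for the by_pagerank() scheduler above, assuming it and
# its RandomVariable/Factor/scipy dependencies are importable. It just draws a
# few nodes from the PageRank-weighted sampler.
from pygraph.classes.digraph import digraph

g = digraph()
g.add_nodes(['a', 'b', 'c'])
g.add_edge(('a', 'b'))
g.add_edge(('b', 'c'))
g.add_edge(('c', 'a'))

scheduler = by_pagerank(g)
print([next(scheduler) for _ in range(5)])  # e.g. ['b', 'a', 'a', 'c', 'b']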
def page_rank(filename):
    data = reader.loadJL(filename)

    gr = digraph()
    for site in data:
        if not gr.has_node(site["url"]):
            gr.add_nodes([site["url"]])
        for link in site["links"]:
            if not gr.has_node(link):
                gr.add_nodes([link])
            if not gr.has_edge((site["url"], link)):
                gr.add_edge((site["url"], link))

    pg_values = pagerank.pagerank(gr)
    Persist("page_rank").dump(pg_values)
    print 'page rank finish'
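# The loader above (reader.loadJL) is not shown; judging from the fields
# page_rank() reads, each JSON-lines record presumably carries a page URL and
# its outgoing links. A purely illustrative record shape:
sample_data = [
    {"url": "http://example.com/a", "links": ["http://example.com/b"]},
    {"url": "http://example.com/b",
     "links": ["http://example.com/a", "http://example.com/c"]},
]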
def get_pagerank(args, all_titles):
    pool = multiprocessing.Pool(processes=args.processes)
    r = pool.map_async(links_for_page, all_titles)
    r.wait()
    all_links = r.get()

    all_title_strings = list(set([to_string
                                  for response in all_links
                                  for to_string in response[1]] +
                                 [obj[u'title'] for obj in all_titles]))

    wiki_graph = digraph()
    wiki_graph.add_nodes(all_title_strings)  # to prevent missing node_neighbors table

    for title_object in all_titles:
        for target in links_for_page(title_object)[1]:
            try:
                wiki_graph.add_edge((title_object[u'title'], target))
            except AdditionError:
                pass

    return pagerank(wiki_graph)
def paper_rank():
    print("start page ranking .....")
    dg = digraph()

    conn = sqlite3.connect(PM.db)
    qry = 'select p_citer,p_cited from reference'
    p_id = sql.read_sql_query(qry, conn)
    print(str(p_id.shape) + '<---------p_id')

    citer = p_id.p_citer.unique()
    p_id = p_id.dropna(axis=0)
    cited = p_id.p_cited.unique()
    nd = list(set(citer).union(set(cited)))
    print('node is created .....')

    # add nodes
    nodes = np.array(nd).astype(np.int64)
    dg.add_nodes(nodes)
    print("add nodes finished .... ")

    # add edges
    edges = [x for x in zip(p_id['p_citer'].astype(np.int64),
                            p_id['p_cited'].astype(np.int64))]
    for ed in edges:
        dg.add_edge(ed)
    print('add edges finished ....')

    pg = pagerank(dg, damping_factor=0.85, max_iterations=100, min_delta=1e-06)

    pprk = pd.DataFrame(pd.Series(pg))
    pprk.columns = ['pp_ranking']
    pprk.index.name = 'paper_index'
    pprk.to_csv(PM.paper_rank, sep=u'|', header=1, index=True)
    print(pprk[:2])
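# paper_rank() above assumes a `reference` table with p_citer/p_cited columns
# holding citing and cited paper ids. A hypothetical local setup for a quick
# test, with made-up ids; PM.db would point at this file in the real code.
import sqlite3

conn = sqlite3.connect('papers.db')
conn.execute('create table if not exists reference (p_citer integer, p_cited integer)')
conn.executemany('insert into reference values (?, ?)',
                 [(1, 2), (1, 3), (2, 3), (3, 1)])
conn.commit()
conn.close()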
gr.add_nodes(["a", "b", "c", "d", "e", "f"]) # notar que los vertices o edge, ahora tienen definido un peso gr.add_edge(("a", "b"), 3) gr.add_edge(("a", "f"), 5) gr.add_edge(("a", "d"), 4) gr.add_edge(("b", "f"), 1) gr.add_edge(("b", "c"), 1) gr.add_edge(("f", "d"), 2) gr.add_edge(("c", "d"), 2) gr.add_edge(("d", "b"), 3) gr.add_edge(("e", "f"), 2) gr.add_edge(("e", "d"), 3) #mostrar el grafo print "\nEl grafo: \n" print gr #### ruta mas corta, iniciando en Anchorage st, dist = shortest_path(gr, "a") # mostrar la ruta, indicada en el spanning tree print "\nRuta:\n" print st # mostrar los valores de las rutas: print "\nValores de la ruta mas corta, a cada vertice:\n" print dist print "\nPAGE RANK:\n" r = pagerank(gr, damping_factor=0.85, max_iterations=100, min_delta=1e-05) print r
infile_name = "/tmp/inputfile" with open(infile_name, 'r') as f: text = f.read() f.closed text = nltk.word_tokenize(text) tagged = nltk.pos_tag(text) tagged = filter_for_tags(tagged) tagged = normalize(tagged) unique_word_set = unique_everseen([x[0] for x in tagged]) gr = digraph() gr.add_nodes(list(unique_word_set)) co_occurence_edge(gr) calculated_keyword_rank = pagerank(gr) sorted_keyword_rank = sorted(calculated_keyword_rank.iteritems(), key=itemgetter(1), reverse=True) # first 20 keywords keywordlist = [k[0] for k in sorted_keyword_rank[0:20]] print keywordlist # mark text with keyword candidates marked_text = [(w, isTag(w, keywordlist)) for w in text] print marked_text final_keywords = collaps_keywords(marked_text) print final_keywords
for subdir in os.listdir(view_dir):
    for fn in os.listdir(view_dir + subdir):
        key = fn.split('__')[4]
        key = key.split('?')[0]
        rc = get_otherviewers(view_dir + subdir + '/' + fn)
        try:
            if key not in added_nodes:
                gr.add_node(key)
                added_nodes.add(key)
            for edg in rc:
                if edg not in added_nodes:
                    gr.add_node(edg)
                    added_nodes.add(edg)
                if (key, edg) not in added_edges:
                    gr.add_edge((key, edg))
                    added_edges.add((key, edg))
            cnt += 1
            #if cnt > 100: break
        except Exception, ex:
            print ex
            print fn

pr = pagerank.pagerank(gr)
for k in pr:
    print k, pr[k]
def test_pagerank_empty(self):
    # Test that an empty dict is returned for an empty graph.
    G = digraph()
    self.assertEqual(pagerank(G), {})
    while True:
        window_words = tagged[window_start:window_end]
        if len(window_words) == 2:
            # print window_words[0][0], window_words[1][0]
            try:
                gr.add_edge((window_words[0][0], window_words[1][0]))
            except AdditionError, e:
                print 'already added %s, %s' % (
                    window_words[0][0].encode('utf-8'),
                    window_words[1][0].encode('utf-8'))
        else:
            break
        window_start += 1
        window_end += 1

    print '###KEYWORDS##'
    index = 0
    calculated_page_rank = pagerank(gr)
    di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1), reverse=True)
    for k, g in itertools.groupby(di, key=itemgetter(1)):
        for word in map(itemgetter(0), g):
            #textrank[word] = k
            if word not in stopwords and len(word) > 1:
                print word.encode('utf-8')
                index += 1
                if index == 51:
                    return


if __name__ == '__main__':
    textrank()
import os
import sys
import pickle

from pygraph.classes.digraph import digraph
from pygraph.algorithms import pagerank

f = open('pr.graph', 'rb')
gr = pickle.load(f)
f.close()

pr = pagerank.pagerank(gr, max_iterations=500, min_delta=0.00001)
for k in pr:
    print k, pr[k]
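# The script above expects a pickled digraph in pr.graph; a minimal sketch of
# how such a file could be produced, assuming the graph is built with pygraph
# as in the other snippets. The nodes and edge here are illustrative only.
import pickle
from pygraph.classes.digraph import digraph

gr = digraph()
gr.add_nodes(['a', 'b'])
gr.add_edge(('a', 'b'))

with open('pr.graph', 'wb') as f:
    pickle.dump(gr, f)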
for subdir in os.listdir(view_dir):
    for fn in os.listdir(view_dir + subdir):
        key = fn.split('__')[4]
        key = key.split('?')[0]
        rc = get_otherviewers(view_dir + subdir + '/' + fn)
        try:
            if key not in added_nodes:
                gr.add_node(key)
                added_nodes.add(key)
            for edg in rc:
                if edg not in added_nodes:
                    gr.add_node(edg)
                    added_nodes.add(edg)
                if (key, edg) not in added_edges:
                    gr.add_edge((key, edg))
                    added_edges.add((key, edg))
            cnt += 1
            #if cnt > 100: break
        except Exception, ex:
            print ex
            print fn

# pagerank(graph, damping_factor=0.85, max_iterations=100, min_delta=0.00001)
pr = pagerank.pagerank(gr, max_iterations=500, min_delta=0.00001)
for k in pr:
    print k, pr[k]
def generate_pagerank_from_graph(graph):
    print strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " Computing PageRank..."
    calculated_page_rank = pagerank(graph)
    return calculated_page_rank
tagged = normalize(tagged)

unique_word_set = unique_everseen([x[0] for x in tagged])

gr = digraph()
gr.add_nodes(list(unique_word_set))

window_start = 0
window_end = 2

while 1:
    window_words = tagged[window_start:window_end]
    if len(window_words) == 2:
        print window_words
        try:
            gr.add_edge((window_words[0][0], window_words[1][0]))
        except AdditionError, e:
            print 'already added %s, %s' % (
                (window_words[0][0], window_words[1][0]))
    else:
        break

    window_start += 1
    window_end += 1

calculated_page_rank = pagerank(gr)
di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1))
for k, g in itertools.groupby(di, key=itemgetter(1)):
    print k, map(itemgetter(0), g)
for subdir in os.listdir(view_dir):
    for fn in os.listdir(view_dir + subdir):
        key = fn.split('__')[4]
        key = key.split('?')[0]
        rc = get_otherviewers(view_dir + subdir + '/' + fn)
        try:
            if key not in added_nodes:
                gr.add_node(key)
                added_nodes.add(key)
            for edg in rc:
                if edg not in added_nodes:
                    gr.add_node(edg)
                    added_nodes.add(edg)
                if (key, edg) not in added_edges:
                    gr.add_edge((key, edg))
                    added_edges.add((key, edg))
            cnt += 1
            #if cnt > 100: break
        except Exception, ex:
            print ex
            print fn

pr = pagerank.pagerank(gr)
for k in pr:
    print k, pr[k]
gr.add_nodes(['A', 'B', 'C'])

# Add edges
gr.add_edge(('X', 'Y'))
gr.add_edge(('X', 'Z'))
gr.add_edge(('A', 'B'))
gr.add_edge(('A', 'C'))
gr.add_edge(('Y', 'B'))
gr.add_edge(('X', 'B'))

# Depth first search rooted on node X
st, pre, post = depth_first_search(gr, root='X')

# Print the spanning tree
print st

print gr.incidents('B')

pr = pagerank.pagerank(gr, min_delta=0.0000001)
print pr

l = []
for x in xrange(10000):
    l.append(x)

import sys
import types


def get_refcounts():
    d = {}
    sys.modules
    # collect all classes
    for m in sys.modules.values():
        for sym in dir(m):
gr.add_nodes(['A', 'B', 'C'])

# Add edges
gr.add_edge(('X', 'Y'))
gr.add_edge(('X', 'Z'))
gr.add_edge(('A', 'B'))
gr.add_edge(('A', 'C'))
gr.add_edge(('Y', 'B'))
gr.add_edge(('X', 'B'))

# Depth first search rooted on node X
st, pre, post = depth_first_search(gr, root='X')

# Print the spanning tree
print st

print gr.incidents('B')

pr = pagerank.pagerank(gr, min_delta=0.0000001)
print pr

l = []
for x in xrange(10000):
    l.append(x)

import sys
import types


def get_refcounts():
    d = {}
    sys.modules
    # collect all classes
    for m in sys.modules.values():