Example #1
def author_centrality(titles_to_authors):
    author_graph = digraph()
    author_graph.add_nodes(map(lambda x: u"title_%s" % x,
                               titles_to_authors.keys()))
    author_graph.add_nodes(list(set(
        [u'author_%s' % author[u'user'] for authors in
         titles_to_authors.values() for author in authors])))

    for title in titles_to_authors:
        log.debug(u"Working on title: %s" % title)
        for author in titles_to_authors[title]:
            try:
                author_graph.add_edge(
                    (u'title_%s' % title, u'author_%s' % author[u'user']))
            except AdditionError:
                pass

    centralities = dict([
        ('_'.join(item[0].split('_')[1:]), item[1]) for item in
        pagerank(author_graph).items() if item[0].startswith(u'author_')])

    centrality_scaler = MinMaxScaler(centralities.values())

    return dict([(cent_author, centrality_scaler.scale(cent_val))
                 for cent_author, cent_val in centralities.items()])
Example #2
 def test_pagerank(self):
     #Test example from wikipedia: http://en.wikipedia.org/wiki/File:Linkstruct3.svg
     G = digraph()
     G.add_nodes([1, 2, 3, 4, 5, 6, 7])        
     G.add_edge((1, 2))
     G.add_edge((1, 3))
     G.add_edge((1, 4))
     G.add_edge((1, 5))
     G.add_edge((1, 7))
     G.add_edge((2, 1))
     G.add_edge((3, 1))
     G.add_edge((3, 2))
     G.add_edge((4, 2))
     G.add_edge((4, 3))
     G.add_edge((4, 5))
     G.add_edge((5, 1))
     G.add_edge((5, 3))
     G.add_edge((5, 4))
     G.add_edge((5, 6))
     G.add_edge((6, 1))
     G.add_edge((6, 5))
     G.add_edge((7, 5))
     expected_pagerank = {
         1: 0.280, 
         2: 0.159,
         3: 0.139,
         4: 0.108,
         5: 0.184,
         6: 0.061,
         7: 0.069,
     }
     pr = pagerank(G)
     for k in pr:
         self.assertAlmostEqual(pr[k], expected_pagerank[k], places=3)
Example #3
def author_centrality(titles_to_authors):
    """
    Identifies the centrality of an author

    :param titles_to_authors: a dict mapping each title string to its list of author dicts
    :type titles_to_authors: dict

    :return: a dict matching author to centrality
    :rtype: dict
    """
    author_graph = digraph()
    author_graph.add_nodes(map(lambda x: u"title_%s" % x, titles_to_authors.keys()))
    author_graph.add_nodes(list(set([u'author_%s' % author[u'user']
                                     for authors in titles_to_authors.values()
                                     for author in authors])))

    for title in titles_to_authors:
        for author in titles_to_authors[title]:
            try:
                author_graph.add_edge((u'title_%s' % title, u'author_%s' % author[u'user']))
            except AdditionError:
                pass

    centralities = dict([('_'.join(item[0].split('_')[1:]), item[1])
                         for item in pagerank(author_graph).items() if item[0].startswith(u'author_')])

    centrality_scaler = MinMaxScaler(centralities.values())

    return dict([(cent_author, centrality_scaler.scale(cent_val))
                 for cent_author, cent_val in centralities.items()])
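A minimal call sketch for the function above, assuming the names it needs (digraph, pagerank, MinMaxScaler, AdditionError) are already in scope. The input shape is inferred from the code — each author entry is a dict with a u'user' key — and the titles and user names are invented:

titles_to_authors = {
    u'Main Page': [{u'user': u'alice'}, {u'user': u'bob'}],
    u'Other Page': [{u'user': u'alice'}],
}
scores = author_centrality(titles_to_authors)
# alice is linked from both titles, so her scaled centrality should come out highest
print scores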
Example #4
def compute_textrank(phrase_list, window, lemma, ratio=2):
    """(list, int, int, int) -> dict

    Return a dict of KWs with textrank scores.
    Arguments:
    - phrase_list: list of candidate phrase lists
    - window: size of the co-occurrence window
    - lemma: 1 if lemmas are used instead of wordforms
    - ratio: divisor for the cutoff; the top numwords/ratio keywords are kept
    """

    # flatten out input phrase list to get back text for postprocessing
    # this is not very optimal
    text = list(itertools.chain.from_iterable(phrase_list))
    numwords = len(text)
    # set up graph
    gr = digraph()
    gr.add_nodes(unique(text))
    if debug:
        write_out(("TEXT:", text))
    # add edges for words within window, be careful to add members
    # for the last shrinking windows at the end of list!
    i = 0
    while i < numwords - 1:
        source = text[i]
        firstindex = i + 1
        lastindex = i + window
        if lastindex > numwords:
            lastindex = numwords
        if firstindex > numwords - 1:
            break

        for w in text[firstindex:lastindex]:
            if debug:
                write_out(("EGDE BTW:", source, "and", w))
            try:
                gr.add_edge((source, w))
            except AdditionError:
                sys.stderr.write('Already added: {0}\t{1}\n'.format(source, w))

        i += 1

    # calculate pagerank
    prdict = pagerank(gr)
    prlist = [(key, prdict[key])
              for key in sorted(prdict, key=prdict.get, reverse=True)]
    # get first number of nodes/ratio elements
    if debug:
        write_out(("TR FULL LIST:", prlist))
    prlist = prlist[:numwords / ratio]
    if debug:
        write_out(("TR SHORT LIST:", prlist))
    # make a dict from the list to facilitate postprocessing
    prdict = dict(prlist)
    # postprocess the initial result
    textranked = post_textrank(prdict, phrase_list)
    return textranked
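A call sketch only: compute_textrank leans on module-level helpers (unique, write_out, post_textrank and a debug flag), so the snippet below just illustrates the expected argument shapes, with invented phrases:

phrases = [[u'graph', u'algorithms'], [u'graph', u'library'], [u'pagerank']]
keywords = compute_textrank(phrases, window=2, lemma=0, ratio=2)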
Example #5
 def test_pagerank_cycle(self):
     #Test if all nodes in a cycle graph have the same value
     G = digraph()
     G.add_nodes([1, 2, 3, 4, 5])
     G.add_edge((1, 2))
     G.add_edge((2, 3))
     G.add_edge((3, 4))
     G.add_edge((4, 5))
     G.add_edge((5, 1))
     self.assertEqual(pagerank(G), {1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, 5: 0.2})
Example #6
 def test_pagerank_random(self):
     G = testlib.new_digraph()
     md = 0.00001
     df = 0.85
     pr = pagerank(G, damping_factor=df, min_delta=md)
     # every node gets at least the teleport term (1 - d) / N
     min_value = (1.0-df)/len(G)
     for node in G:
         expected = min_value
         # plus the damped rank flowing in from each predecessor
         for each in G.incidents(node):
             expected += (df * pr[each] / len(G.neighbors(each)))
         assert abs(pr[node] - expected) < md
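Written out, the invariant this test checks is the PageRank fixed-point equation:

    PR(v) = (1 - d)/N + d * sum(PR(u) / outdeg(u) for u in predecessors(v))

where d is the damping factor and N the node count; in pygraph, incidents() yields a node's predecessors and neighbors() its successors.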
Example #7
 def by_pagerank(graph):
     """
     Probabilistic scheduler based on PageRank of nodes in the graph.
     
     Just for fun^^
     """
     ranks = pagerank(graph)
     rvar = RandomVariable(list(ranks.keys()))
     scope = [rvar]
     beliefs = scipy.array(list(ranks.values()))
     factor = Factor(scope, beliefs)
     while True:
         node = factor.sample()[0]
         yield node
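Usage sketch: by_pagerank is a generator, so a consumer just keeps pulling nodes from it (graph is any pygraph digraph; RandomVariable and Factor come from the surrounding project):

scheduler = by_pagerank(graph)
node = next(scheduler)  # nodes are sampled in proportion to their PageRank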
Example #8
def page_rank(filename):
    data = reader.loadJL(filename)
    gr = digraph()
    for site in data:
        if not gr.has_node(site["url"]):
            gr.add_nodes([site["url"]])
        for link in site["links"]:
            if not gr.has_node(link):
                gr.add_nodes([link])
            if not gr.has_edge((site["url"], link)):
                gr.add_edge((site["url"], link))

    pg_values = pagerank.pagerank(gr)
    Persist("page_rank").dump(pg_values)

    print 'page rank finished'
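For reference, the record shape this expects from reader.loadJL, inferred from the loop above (URLs invented):

site = {"url": "http://example.com/", "links": ["http://example.com/a", "http://example.com/b"]}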
Example #9
def get_pagerank(args, all_titles):
    pool = multiprocessing.Pool(processes=args.processes)
    r = pool.map_async(links_for_page, all_titles)
    r.wait()
    all_links = r.get()
    all_title_strings = list(set([to_string for response in all_links for to_string in response[1]]
                                 + [obj[u'title'] for obj in all_titles]))

    wiki_graph = digraph()
    wiki_graph.add_nodes(all_title_strings)  # to prevent missing node_neighbors table
    # reuse the results already fetched by the pool (map_async preserves
    # input order) instead of calling links_for_page again for every title
    for title_object, response in zip(all_titles, all_links):
        for target in response[1]:
            try:
                wiki_graph.add_edge((title_object[u'title'], target))
            except AdditionError:
                pass

    return pagerank(wiki_graph)
Example #10
def paper_rank():
    print("start page ranking .....")

    dg = digraph()

    conn = sqlite3.connect(PM.db)
    qry = 'select p_citer,p_cited from reference'
    p_id = sql.read_sql_query(qry, conn)
    print(str(p_id.shape) + '<---------p_id')

    citer = p_id.p_citer.unique()
    p_id = p_id.dropna(axis=0)
    cited = p_id.p_cited.unique()
    nd = set(citer).union(set(cited))
    nd = list(nd)

    print('node is created .....')
    # add nodes
    nodes = np.array(nd).astype(np.int64)
    dg.add_nodes(nodes)
    print("add nodes finished .... ")
    # add edges

    edges = zip(p_id['p_citer'].astype(np.int64),
                p_id['p_cited'].astype(np.int64))
    for ed in edges:
        try:
            dg.add_edge(ed)
        except AdditionError:
            # the reference table may contain duplicate pairs; skip them
            pass
    print('add edges finished ....')

    pg = pagerank(dg, damping_factor=0.85, max_iterations=100, min_delta=1e-06)
    pprk = pd.DataFrame(pd.Series(pg))
    pprk.columns = ['pp_ranking']
    pprk.index.name = 'paper_index'
    pprk.to_csv(PM.paper_rank, sep=u'|', header=1, index=True)
    print(pprk[:2])
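To load the ranking back later, mirroring the to_csv call above:

ranks = pd.read_csv(PM.paper_rank, sep=u'|', index_col='paper_index')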
Example #11
gr.add_nodes(["a", "b", "c", "d", "e", "f"])

# note that the edges now have a weight defined
gr.add_edge(("a", "b"), 3)
gr.add_edge(("a", "f"), 5)
gr.add_edge(("a", "d"), 4)
gr.add_edge(("b", "f"), 1)
gr.add_edge(("b", "c"), 1)
gr.add_edge(("f", "d"), 2)
gr.add_edge(("c", "d"), 2)
gr.add_edge(("d", "b"), 3)
gr.add_edge(("e", "f"), 2)
gr.add_edge(("e", "d"), 3)

# show the graph
print "\nThe graph: \n"
print gr

#### shortest path, starting at Anchorage
st, dist = shortest_path(gr, "a")
# show the route, given by the spanning tree
print "\nRoute:\n"
print st
# show the shortest-path cost values:
print "\nShortest-path cost to each vertex:\n"
print dist

print "\nPAGE RANK:\n"
r = pagerank(gr, damping_factor=0.85, max_iterations=100, min_delta=1e-05)
print r
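One caveat: as far as I can tell, pygraph's pagerank only looks at the link structure, so the edge weights above affect shortest_path but not the PageRank values.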
Example #12
infile_name = "/tmp/inputfile"

with open(infile_name, 'r') as f:
  text = f.read()

text = nltk.word_tokenize(text)
tagged = nltk.pos_tag(text)

tagged = filter_for_tags(tagged)
tagged = normalize(tagged)
unique_word_set = unique_everseen([x[0] for x in tagged])

gr = digraph()
gr.add_nodes(list(unique_word_set))
co_occurence_edge(gr)

calculated_keyword_rank = pagerank(gr)
sorted_keyword_rank = sorted(calculated_keyword_rank.iteritems(), key=itemgetter(1), reverse=True)

# first 20 keywords
keywordlist = [k[0] for k in sorted_keyword_rank[0:20]]
print keywordlist

# mark text with keyword candidates
marked_text = [(w, isTag(w, keywordlist)) for w in text]
print marked_text

final_keywords = collaps_keywords(marked_text)
print final_keywords
Example #13
for subdir in os.listdir(view_dir):
    for fn in os.listdir(view_dir + subdir):
        #print fn
        key = fn.split('__')[4]
        key = key.split('?')[0]
        #print fn.split('__')
        #print fn,key
        rc = get_otherviewers(view_dir + subdir + '/' + fn)
        try:
            if key not in added_nodes:
                gr.add_node(key)
                added_nodes.add(key)
            for edg in rc:
                if edg not in added_nodes:
                    gr.add_node(edg)
                    added_nodes.add(edg)
                if (key, edg) not in added_edges:
                    gr.add_edge((key, edg))
                    added_edges.add((key, edg))
            cnt += 1
            #if cnt>100:break
        except Exception, ex:
            print ex
            print fn

# compute pagerank once, after the whole graph has been built
pr = pagerank.pagerank(gr)

for k in pr:
    print k, pr[k]
Example #14
 def test_pagerank_empty(self):
     #Test if an empty dict is returned for an empty graph
     G = digraph()
     self.assertEqual(pagerank(G), {})
Example #15
    while True:

        window_words = tagged[window_start:window_end]
        if len(window_words) == 2:
            # print window_words[0][0], window_words[1][0]
            try:
                gr.add_edge((window_words[0][0], window_words[1][0]))
            except AdditionError, e:
                print 'already added %s, %s' % (window_words[0][0].encode('utf-8'), window_words[1][0].encode('utf-8'))
        else:
            break

        window_start += 1
        window_end += 1
    print '###KEYWORDS###'
    index = 0
    calculated_page_rank = pagerank(gr)
    di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1), reverse=True)
    for k, g in itertools.groupby(di, key=itemgetter(1)):
        for word in map(itemgetter(0), g):
            #textrank[word] = k
            if word not in stopwords and len(word) > 1:
                print word.encode('utf-8')
                index += 1
                if index == 51:
                    return

if __name__ == '__main__':
    textrank()
Example #16
import os
import sys
from pygraph.classes.digraph import digraph
from pygraph.algorithms import pagerank
import pickle

with open('pr.graph', 'rb') as f:
    gr = pickle.load(f)
pr = pagerank.pagerank(gr, max_iterations=500, min_delta=0.00001)

for k in pr:
    print k, pr[k]
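For context, a sketch of how a graph like pr.graph could have been pickled in the first place (the nodes are invented):

from pygraph.classes.digraph import digraph
import pickle

gr = digraph()
gr.add_nodes(['a', 'b', 'c'])
gr.add_edge(('a', 'b'))
gr.add_edge(('b', 'c'))
with open('pr.graph', 'wb') as f:
    pickle.dump(gr, f)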
Example #17
for subdir in os.listdir(view_dir):
  for fn in os.listdir(view_dir+subdir):
    #print fn
    key=fn.split('__')[4]
    key=key.split('?')[0]
    #print fn.split('__')
    #print fn,key
    rc=get_otherviewers(view_dir+subdir+'/'+fn)
    try:
      if key not in added_nodes:
        gr.add_node(key)
        added_nodes.add(key)
      for edg in rc:
        if edg not in added_nodes:
          gr.add_node(edg)
          added_nodes.add(edg)
        if (key,edg) not in added_edges:
          gr.add_edge((key,edg))
          added_edges.add((key,edg))
      cnt+=1
      #if cnt>100:break
    except Exception, ex:
      print ex
      print fn

# pagerank(graph, damping_factor=0.85, max_iterations=100, min_delta=0.00001)
# compute pagerank once, after the whole graph has been built
pr = pagerank.pagerank(gr, max_iterations=500, min_delta=0.00001)
    
for k in pr:
  print k,pr[k]
Example #18
def generate_pagerank_from_graph(graph):
    print strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " Computing Pagerank..."
    calculated_page_rank = pagerank(graph)
    return calculated_page_rank
Example #19
tagged = normalize(tagged)

unique_word_set = unique_everseen([x[0] for x in tagged])

gr = digraph()
gr.add_nodes(list(unique_word_set))

window_start = 0
window_end = 2

while True:

    window_words = tagged[window_start:window_end]
    if len(window_words) == 2:
        print window_words
        try:
            gr.add_edge((window_words[0][0], window_words[1][0]))
    except AdditionError, e:
        print 'already added %s, %s' % (
            window_words[0][0], window_words[1][0])
    else:
        break

    window_start += 1
    window_end += 1

calculated_page_rank = pagerank(gr)
di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1))
for k, g in itertools.groupby(di, key=itemgetter(1)):
    print k, map(itemgetter(0), g)
Example #21
from pygraph.classes.digraph import digraph
from pygraph.algorithms.searching import depth_first_search
from pygraph.algorithms import pagerank

gr = digraph()
# X, Y and Z are referenced below, so they have to exist as nodes as well
gr.add_nodes(['X','Y','Z'])
gr.add_nodes(['A','B','C'])
# Add edges
gr.add_edge(('X','Y'))
gr.add_edge(('X','Z'))
gr.add_edge(('A','B'))
gr.add_edge(('A','C'))
gr.add_edge(('Y','B'))
gr.add_edge(('X','B'))
# Depth first search rooted on node X
st, pre, post = depth_first_search(gr, root='X')
# Print the spanning tree
print st

print gr.incidents('B')

pr=pagerank.pagerank(gr,min_delta=0.0000001)
print pr

l=[]
for x in xrange(10000):
  l.append(x)

import sys
import types

def get_refcounts():
    d = {}
    sys.modules
    # collect all classes
    for m in sys.modules.values():
        for sym in dir(m):