Ejemplo n.º 1
0
 def calc_page_ranks(self, d=0.85, iterations=1000):
     """Compute a PageRank score for every page and store it on the page.

     Builds a link adjacency matrix in ``self.adj`` (row = citing page,
     column = cited page), divides every row by its sum, runs a fixed
     number of damped power iterations and finally copies each score into
     the corresponding page's ``rank`` attribute.

     Side effects: sets ``self.adj`` and ``self.ranks``, writes the
     normalized matrix to ``adj.txt`` and the final scores to
     ``page_ranks.txt``, and draws one progress bar per phase.

     d          -- damping factor; ``1 - d`` is added to every score on
                   each iteration.
     iterations -- number of power iterations to run.
     """
     pages = self.pages_with_ids
     n = len(pages)
     self.adj = numpy.zeros((n, n))

     # Resolving a link re-reads and re-hashes the target document, so the
     # hash is remembered per URL instead of being recomputed for every
     # link that points at the same URL.
     # NOTE(review): assumes helpers.get_html returns the same content for
     # the same argument for the duration of this call -- confirm.
     target_ids = {}
     pbar = ProgressBar(widgets=['Processing links: ', SimpleProgress()], maxval=n).start()
     for progress, page in enumerate(pages.values(), 1):
         pbar.update(progress)
         for a in page.a:
             url = page.normalize_url(a.get('href'))
             if url not in self.S:
                 continue
             if url not in target_ids:
                 soup = BeautifulSoup(helpers.get_html(self.urls_with_nums[url]).encode('utf-8', 'ignore'), 'lxml')
                 target_ids[url] = helpers.page_hash(soup.prettify())
             target_id = target_ids[url]
             if target_id in pages:
                 # page (row) cites the target page (column)
                 self.adj[page.index][pages[target_id].index] = 1.0
     pbar.finish()

     # Turn every row into a distribution over the pages it links to.
     pbar = ProgressBar(widgets=['Normalizing adjacencies: ', SimpleProgress()], maxval=n).start()
     row_sums = numpy.sum(self.adj, axis=1)
     for progress, page in enumerate(pages.values(), 1):
         pbar.update(progress)
         total = row_sums[page.index]
         if total != 0:
             self.adj[page.index] /= total
         else:
             # page without outgoing links: keep its row at zero
             self.adj[page.index] = 0.0
     pbar.finish()
     numpy.savetxt("adj.txt", self.adj)

     # Damped power iteration.  adj[i][j] is the share of page i's vote
     # that goes to page j, so the score of page j is collected down
     # column j -- hence the product with the transpose.
     self.ranks = numpy.ones(n)
     teleport = (1.0 - d) * numpy.ones(n)
     incoming = self.adj.T
     pbar = ProgressBar(widgets=['Running PageRank: ', SimpleProgress()], maxval=iterations).start()
     for m in range(iterations):
         pbar.update(m)
         self.ranks = d * numpy.dot(incoming, self.ranks) + teleport
     pbar.finish()

     # Copy the final scores back onto the page objects.
     pbar = ProgressBar(widgets=['Updating pages with new ranks: ', SimpleProgress()], maxval=n).start()
     for progress, page in enumerate(pages.values(), 1):
         pbar.update(progress)
         page.rank = self.ranks[page.index]
     pbar.finish()
     numpy.savetxt("page_ranks.txt", self.ranks)
Ejemplo n.º 2
0
Archivo: page.py Proyecto: parkr/gargle
 def __init__(self, title, num, html, url, text):
     """Create a page record from one document.

     The ID is ``helpers.page_hash(html)``, ``url`` becomes the first
     entry of ``urls`` and the snippet is built from the space-separated
     pieces of ``text`` at positions 100 through 109 (empty when ``text``
     has fewer pieces).  Link data, rank and matrix index start out
     empty / zero.
     """
     self.ID = helpers.page_hash(html)
     self.num, self.title = num, title
     self.urls = [url]
     # also contains alt text of <img>'s nested within <a></a>
     self.anchor_texts = []
     self.inlinks = 0.0
     self.rank = 0.0
     pieces = text.split(' ')
     self.snippet = ' '.join(pieces[100:110])
     self.a = []
     self.index = 0