from bs4 import BeautifulSoup
from xml.dom.minidom import Document
from page import Page
from progressbar import ProgressBar, SimpleProgress
import helpers, numpy, datetime
class Brain:
# Public: Initializes a new instance of Brain and sets up instance variables
#
# Returns nothing
    def __init__(self):
        self.S = []                   # the known URLs, used to test whether a link stays in the crawl set
        self.urls = []                # (num, url) tuples read from the input file
        self.pages = []
        self.pages_with_ids = {}      # page content hash -> Page
        self.urls_with_nums = {}      # url -> num
        self.indices_with_pages = {}  # adjacency-matrix index -> Page
        self.adj = None               # link adjacency matrix
        self.ranks = None             # PageRank vector
# Public: reads in the URLs from the file
#
# filename - the filename of the file with the newline-delimited
# (num,url) comma-separated values
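    #
    # Examples
    #
    #   brain.parse_urls('./test/test3.txt')
    #   # where each line of the file is illustrative input like: 3,http://example.com/index.html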
#
# Returns nothing
    def parse_urls(self, filename='./test/test3.txt'):
        # read in the (num, url) pairs
        with open(filename, 'r') as f:
            for line in f.readlines():
                els = line.strip().split(',')
                self.urls.append( (els[0], els[1]) )
                self.S.append(els[1])
# Public: Grabs the HTML and creates the instances of Page for each URL
#
# Returns nothing
def process_pages(self):
skipped = []
pbar = ProgressBar(widgets=['Processing pages: ', SimpleProgress()], maxval=len(self.urls)).start()
i = 0
for (num, url) in self.urls:
pbar.update(int(num))
if (num and url):
html = helpers.get_html(num, url)
if html is not None:
self.urls_with_nums[url] = num
soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
page = Page(title=soup.title.string, num=num, html=soup.prettify(), url=url, text=soup.body.get_text())
page.index = i
self.indices_with_pages[i] = page
                    if page.ID not in self.pages_with_ids:
self.pages_with_ids[page.ID] = page
else:
raise RuntimeError('COLLISION: %s collides with %s with hash %s.' % (page.num, self.pages_with_ids[page.ID].num, page.ID))
for link in soup.find_all('a'):
                        if link.get('href') and not link.get('href').strip().startswith('mailto:'):
page.a.append(link)
self.pages.append(page)
i += 1
else:
skipped.append(num)
else:
skipped.append(num)
pbar.finish()
print "Skipped page(s) %s because of an error." % (', '.join(skipped))
# Public: Calculates the PageRank value for all the pages
#
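    # d - the damping factor used in the PageRank update (default: 0.85)
    #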
# Returns nothing
def calc_page_ranks(self, d=0.85):
self.adj = numpy.zeros( (len(self.pages_with_ids),len(self.pages_with_ids)) )
        pbar = ProgressBar(widgets=['Processing links: ', SimpleProgress()], maxval=len(self.pages_with_ids)).start()
progress = 1
for (ID, page) in self.pages_with_ids.iteritems():
pbar.update(progress)
            # build the link adjacency matrix: adj[i][j] = 1 when page i cites page j
for a in page.a:
href = a.get('href')
# normalize URLS
url = page.normalize_url(href)
if url in self.S:
                    html = helpers.get_html(self.urls_with_nums[url], url)
                    soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
                    link_ID = helpers.page_hash(soup.prettify())
                    if link_ID in self.pages_with_ids:
                        self.adj[page.index][self.pages_with_ids[link_ID].index] = 1.0
progress += 1
        pbar.finish()
        # Row-normalize the adjacency matrix: divide each row by the page's
        # out-degree so that every non-empty row sums to 1
        pbar = ProgressBar(widgets=['Normalizing adjacencies: ', SimpleProgress()], maxval=len(self.pages_with_ids)).start()
        progress = 1
        row_sums = numpy.sum(self.adj, axis=1)
        for (ID, page) in self.pages_with_ids.iteritems():
            pbar.update(progress)
            for k in xrange(len(self.adj[page.index])):
                if row_sums[page.index] != 0:
                    self.adj[page.index][k] = self.adj[page.index][k] / row_sums[page.index]
                else:
                    self.adj[page.index][k] = 0.0
            progress += 1
        pbar.finish()
numpy.savetxt("adj.txt", self.adj)
# Run PageRank and converge to principal eigenvector of adj matrix
        self.ranks = numpy.ones(len(self.pages_with_ids))
        z = numpy.ones(len(self.pages_with_ids))
b = 1.0 - d
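        # Power iteration (a fixed 1000 rounds rather than an explicit
        # convergence test): ranks <- d * adj^T * ranks + (1 - d) * 1,
        # so rank flows from each page to the pages it cites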
pbar = ProgressBar(widgets=['Running PageRank: ', SimpleProgress()], maxval=1000).start()
        for m in xrange(1000):
            pbar.update(m)
            u = numpy.dot(self.adj.T, self.ranks)
            self.ranks = d*u + b*z
pbar.finish()
# Updating ranks of the pages
        pbar = ProgressBar(widgets=['Updating pages with new ranks: ', SimpleProgress()], maxval=len(self.pages_with_ids)).start()
progress = 1
for (ID, page) in self.pages_with_ids.iteritems():
pbar.update(progress)
page.rank = self.ranks[page.index]
progress += 1
pbar.finish()
numpy.savetxt("page_ranks.txt", self.ranks)
# Public: Writes the page metadata to metadata.xml
#
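    # Examples
    #
    #   brain.write_metadata()
    #   # metadata.xml then contains an <index> root with a <date> child
    #   # and a <pages> child holding one entry per crawled page
    #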
# Returns nothing
def write_metadata(self):
# write metadata.xml
doc = Document()
index = doc.createElement("index")
        doc.appendChild(index)
# input the date
datex = doc.createElement("date")
date = doc.createTextNode(datetime.datetime.now().isoformat())
datex.appendChild(date)
index.appendChild(datex)
# add pages
pagesx = doc.createElement("pages")
index.appendChild(pagesx)
for (ID, p) in self.pages_with_ids.iteritems():
pagesx.appendChild(p.metadata(doc))
with open('metadata.xml', 'w') as f:
f.write(doc.toprettyxml(indent=' '))
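
# A minimal usage sketch of the full pipeline, assuming the page and helpers
# modules above are importable and the default test fixture exists; the
# filename is illustrative, and any newline-delimited "num,url" file works.
if __name__ == '__main__':
    brain = Brain()
    brain.parse_urls('./test/test3.txt')   # load the (num, url) pairs
    brain.process_pages()                  # fetch HTML and build Page objects
    brain.calc_page_ranks()                # build the link matrix and run PageRank
    brain.write_metadata()                 # dump per-page metadata to metadata.xml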