#!/usr/bin/env python
# Accepts the graph generated by the web crawler and generates an index
# that represents the content on the set of the pages and the URLs at which each content element
# was found.
import page_rank
import urllib2
import ssl
import re
# required to gain access to https urls on the school server from home
ssl._create_default_https_context = ssl._create_unverified_context
index = {}
def get_page_text(url, index, word_count=0):
    """Fetch *url*, extract its visible body text, and add every kept word
    to *index* (keyword -> {url: [word positions]}) via add_to_index.

    Words are lowercased, stripped of punctuation, and skipped when they
    appear in ignoreList.txt or do not start with an alphanumeric character.
    word_count is the starting word-position counter (defaults to 0).
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # always release the connection, even if read() raises
        response.close()
    page_text, page_words = "", []
    # keep only the contents of the <body> element
    html = html[html.find("<body") + 5:html.find("</body>")]
    # strip out every <script>...</script> block
    open_script_tag = html.find("<script")
    while open_script_tag > -1:
        end_script_tag = html.find("</script>")
        if end_script_tag == -1:
            # unclosed script tag: drop the remainder rather than
            # mis-slicing with find()'s -1 sentinel
            html = html[:open_script_tag]
            break
        html = html[:open_script_tag] + html[end_script_tag + 9:]
        open_script_tag = html.find("<script")
    # load the stop-word list, one word per line
    ignore_words = []
    with open("ignoreList.txt", "r") as fin:
        for word in fin:
            ignore_words.append(word.strip())
    # walk the remaining markup, collecting the text between tags
    finished = False
    while not finished:
        next_close_tag = html.find(">")
        next_open_tag = html.find("<")
        if next_open_tag > -1:
            content = " ".join(html[next_close_tag + 1:next_open_tag].strip().split())
            page_text = page_text + " " + content
            html = html[next_open_tag + 1:]
        else:
            finished = True
    for word in page_text.split():
        word = word.lower()
        word = re.sub(r'[^\w\s]', '', word)  # remove punctuation
        # guard against words reduced to "" by the punctuation strip
        # (word[0] on an empty string raises IndexError)
        if word and word[0].isalnum() and word not in ignore_words:
            page_words.append(word)
    for word in page_words:
        word_count += 1
        add_to_index(index, word, url, word_count)
def add_to_index(index, keyword, url, word_count):
    """Record that *keyword* occurs at position *word_count* on *url*.

    *index* maps keyword -> {url: [word positions]}; missing levels are
    created on demand and positions accumulate in insertion order.
    """
    positions_by_url = index.setdefault(keyword, {})
    positions_by_url.setdefault(url, []).append(word_count)
def scrape_page(crawled_graph):
    """Index the text of every distinct URL in *crawled_graph*, then hand
    the graph to page_rank.compute_ranks.

    Results accumulate in the module-level *index* dict.
    """
    visited = set()
    for url in crawled_graph:
        if url in visited:
            continue
        get_page_text(url, index)
        visited.add(url)
    # calculate the page ranks using the crawled graph
    page_rank.compute_ranks(crawled_graph)
def get_index():
    # Accessor for the module-level index built by scrape_page:
    # maps keyword -> {url: [word positions]}.
    return index