# web_crawler.py
# Retrieves a URL from the user to act as a seed, crawls outwards from that page to find the URLs
# linked from it, and builds a dictionary (graph) mapping each crawled page to the unique URLs found on it.
import web_scraper
import poodle
import urllib2
import ssl
import sys
# required to gain access to https urls on the school server from home
ssl._create_default_https_context = ssl._create_unverified_context
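
# Illustrative shape of the graph this module builds (hypothetical URLs, not taken
# from the assignment): each crawled page maps to the list of unique URLs found on it, e.g.
#   {"http://example.com/index.html": ["http://example.com/a.html", "http://example.com/b.html"]}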
crawled_graph = {}

def get_url():
    # ask for a URL to crawl, e.g. https://dunluce.infc.ulst.ac.uk/d12wo/Web/B3/test_index.html
    while True:
        url_to_crawl = raw_input(" Enter a URL to crawl (-end to exit to menu) \n")
        url_to_crawl = url_to_crawl.strip()  # remove leading/trailing whitespace, if any
        url_to_crawl = url_to_crawl.lower()  # convert to lowercase
        if url_to_crawl[:7] == "http://" or url_to_crawl[:8] == "https://":  # check the URL is http or https
            if url_to_crawl[-1] == "/":  # drop a trailing slash so URLs compare consistently
                url_to_crawl = url_to_crawl[:-1]
            break
        elif url_to_crawl == "-end":
            poodle.init_menu_options()
            return  # back to the menu without crawling
        else:
            print " URL must begin with http:// or https://\n"
    crawl_url(url_to_crawl)

def get_depth():
    while True:
        try:
            max_depth = int(raw_input(" Enter the depth {1-10} you want to crawl this URL: "))
        except ValueError:
            print " Not a number -- Try Again."
        else:
            if 1 <= max_depth < 11:
                break
            else:
                print " Out of range {1-10} -- Try Again."
    return max_depth

def crawl_url(seed):
    # MAX_DEPTH caps the number of pages crawled so the search stays controllable;
    # a value of 5 is enough to reach interesting.html.
    MAX_DEPTH = get_depth()
    to_crawl = [seed]
    crawled = []
    while to_crawl and len(crawled) < MAX_DEPTH:
        unique_urls = []
        url = to_crawl.pop()
        crawled.append(url)
        # extract all undiscovered links in the source HTML of the current page
        # and merge those links into the to_crawl list
        new_links = get_all_new_links_on_page(url, crawled, unique_urls)
        to_crawl = list(set(to_crawl) | set(new_links))
        # record the unique urls found on this page in the graph
        crawled_graph[url] = unique_urls
    # print the crawled urls once the crawl has finished
    for url in crawled_graph:
        print url
    # pass the graph on to be scraped
    web_scraper.scrape_page(crawled_graph)

def get_all_new_links_on_page(page, prev_links, unique_urls):
    # fetch the page and return the absolute http/https links that have not been crawled yet;
    # every unique url found on the page is also recorded in unique_urls for the graph
    response = urllib2.urlopen(page)
    html = response.read()
    links, pos, all_found = [], 0, False
    while not all_found:
        atag = html.find("<a href=", pos)
        if atag > -1:
            href = html.find('"', atag + 1)
            end_href = html.find('"', href + 1)
            url = html[href + 1:end_href]
            if url[:8] == "https://" or url[:7] == "http://":
                if url[-1] == "/":  # strip a trailing slash so duplicates are detected
                    url = url[:-1]
                if url not in links and url not in prev_links:
                    links.append(url)
                if url not in unique_urls:
                    unique_urls.append(url)
            close_tag = html.find("</a>", atag)
            # move past this anchor; fall back to the end of the href if the closing tag is missing
            pos = close_tag + 1 if close_tag > -1 else end_href + 1
        else:
            all_found = True
    return links
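
# Sketch of what the string-based parser above extracts, using a hypothetical snippet
# (only double-quoted absolute http/https hrefs are picked up, and a trailing slash is stripped):
#   parsing '<a href="http://example.com/page/">x</a>' yields the link "http://example.com/page"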

def get_graph():
    return crawled_graph
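
# A minimal manual entry point (an assumption -- in the assignment this module is presumably
# driven from the poodle menu): lets the crawler be run directly for testing, provided the
# poodle and web_scraper modules are importable.
if __name__ == "__main__":
    get_url()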