/
crawler.py
71 lines (58 loc) · 2.37 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
A crawler is a program that starts with a URL on the web
(e.g. http://python.org), fetches the web page at that URL,
and parses all the links on that page into a repository of links. Next,
it fetches the contents of one of the URLs from the repository just created,
parses the links from this new content into the repository, and continues
this process for the links in the repository until it is stopped or until a
given number of links have been fetched.
"""
import pdb
import sys
import urllib2
from urlparse import urljoin, urlparse

from bs4 import BeautifulSoup  # pip install BeautifulSoup4
# Crawl state shared by every crawler call: visited_url holds the URLs that
# were fetched, in order, and total_collected_url holds, at the same index,
# the list of links found on each of those pages.
visited_url, total_collected_url = [], []
class crawler(object):
    """Callable crawler that keeps following the first unvisited link.

    Each successfully fetched page is appended to the module-level
    ``visited_url`` list, and the links found on it are appended, at the
    same index, to ``total_collected_url``.
    """

    def __call__(self, url, count_of_crawler):
        """Crawl starting at *url* and follow links one page at a time.

        Args:
            url: Absolute URL of the first page to fetch.
            count_of_crawler: How many further links may be followed after
                *url* itself has been fetched. Zero or a negative value
                fetches only *url*.

        Returns:
            The part of *count_of_crawler* that was left unused when the
            crawl stopped (0 when the whole budget was consumed).
        """
        # Iterate instead of recursing so that a large budget cannot hit the
        # interpreter's recursion limit.
        while True:
            try:
                links = self._fetch_links(url)
            except Exception:
                # Best effort: any fetch/parse failure ends the crawl, but
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                print("It seems there is some issue in URL " + url)
                return count_of_crawler

            # Keep both lists in lockstep; callers zip them together.
            visited_url.append(url)
            total_collected_url.append(links)

            next_url_to_visit = self._next_unvisited(links)
            if count_of_crawler <= 0 or next_url_to_visit is None:
                return count_of_crawler

            url = next_url_to_visit
            count_of_crawler -= 1

    @staticmethod
    def _fetch_links(url):
        """Return every anchor target on the page at *url* as an absolute URL."""
        page = urllib2.urlopen(url)
        try:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(page.read(), "html.parser")
        finally:
            page.close()

        links = []
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            # Anchors without an href (or with an empty one) carry no link.
            if not href or not href.strip():
                continue
            # urljoin leaves absolute links untouched and resolves
            # root-relative and page-relative ones against the page URL.
            links.append(urljoin(url, href.strip()))
        return links

    @staticmethod
    def _next_unvisited(links):
        """Return the first link worth fetching next, or None if there is none."""
        for link in links:
            # Links with a fragment are skipped, as are pages already fetched.
            if "#" in link or link in visited_url:
                continue
            # mailto:, javascript: and similar targets cannot be fetched.
            if urlparse(link).scheme in ("http", "https"):
                return link
        return None
if __name__ == '__main__':
    # Usage: crawler.py <start-url> <count>
    # Validate the command line on its own so that only genuine argument
    # problems are reported as argument problems.
    try:
        start_url = sys.argv[1]
        link_budget = int(sys.argv[2])
    except IndexError:
        print("Please the enter the arguements correctly first arguement "
              "should be the URL and Second should be the count of URL's you want to fetch")
    except ValueError:
        print("You have to enter a whole digit number over there")
    else:
        try:
            crawl = crawler()
            crawl(start_url, link_budget)
            # Map every fetched page to the list of links found on it.
            zipped = dict(zip(visited_url, total_collected_url))
            print("Number of Links that crawler visited " + str(len(visited_url)))
            print(zipped)
        except Exception:
            # Last-resort report for unexpected failures; Ctrl-C and
            # sys.exit() are deliberately not intercepted here.
            print("Oops !!! Duh ,It seems some big issue is there crawler is not working."
                  "you need to check it out")