SiteHandler.py
'''
Consults a site's robots.txt on behalf of the crawler.

Maintains a module-level cache of parsed robots.txt rules keyed by base URL.
The cache is flushed outright once it reaches cache_MAX_SIZE entries, and an
expired robots.txt can be re-fetched via keep_Fresh. A usage sketch appears
at the bottom of the file.
'''
import RobotExclusionParser
import utils
from urlparse import urljoin

cache = {}
cache_MAX_SIZE = 500
agent = 'PolyCrawler'  # alternative UA: 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
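# cache maps base URL -> parsed RobotExclusionRulesParser instance, so each
# site's robots.txt is fetched at most once until the cache is flushed.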

def add_To_Cache(baseURL):
    '''Fetch and parse robots.txt for baseURL, storing the parser in the cache.'''
    # Crude eviction policy: flush the whole cache once it is full.
    if len(cache) >= cache_MAX_SIZE:
        cache.clear()
    parser = RobotExclusionParser.RobotExclusionRulesParser()
    # '/robots.txt' (with the leading slash) resolves against the site root
    # even if baseURL carries a path component.
    parser.fetch(urljoin(baseURL, '/robots.txt'))
    cache[baseURL] = parser

def keep_Fresh(baseURL):
    '''Re-fetch robots.txt for baseURL if the cached copy has expired.'''
    rerp = cache.get(baseURL)
    if rerp is not None and rerp.is_expired:
        cache.pop(baseURL)
        add_To_Cache(baseURL)
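# NOTE: is_Valid currently leaves its keep_Fresh call commented out, so an
# expired robots.txt persists until the whole cache is flushed.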

def is_Valid(url):
    '''Return True if the configured agent may crawl url, False otherwise.'''
    try:
        baseURL = utils.getBaseUrl(url)
        if baseURL not in cache:
            add_To_Cache(baseURL)
        # keep_Fresh(baseURL)
        return cache[baseURL].is_allowed(agent, url)
    except Exception:
        # Err on the side of not crawling if anything goes wrong
        # (network failure, malformed URL, etc.).
        return False

def get_Crawl_Delay(url):
    '''Return the crawl-delay robots.txt sets for url's site, or 0 if none.'''
    baseURL = utils.getBaseUrl(url)
    if baseURL not in cache:
        add_To_Cache(baseURL)
    delay = cache[baseURL].get_crawl_delay(agent)
    return delay if delay is not None else 0
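
# --- Usage sketch (illustrative, not part of the original module) ---
# Assumes utils.getBaseUrl returns the scheme-and-host portion of a URL
# (e.g. 'http://example.com/') and that the example site is reachable;
# the URL below is a hypothetical placeholder.
if __name__ == '__main__':
    test_url = 'http://example.com/some/page.html'
    if is_Valid(test_url):
        print 'allowed; crawl-delay = %s seconds' % get_Crawl_Delay(test_url)
    else:
        print 'robots.txt (or a fetch error) disallows crawling %s' % test_url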