Exemple #1
0
class RobotHandler(object):
    """Decide whether a URL may be crawled according to its site's robots.txt.

    One rules parser is kept per home-site URL in ``_hostname_pool`` so the
    robots.txt of a site is only fetched the first time that site is seen.
    """

    # Seconds passed to the parser's ``fetch`` and installed as the default
    # socket timeout each time ``is_allowed`` runs.
    _ROBOTS_TIMEOUT = 2

    def __init__(self):
        # Maps html_task._homesiteurl -> rules parser for that site.
        self._hostname_pool = SafeDictionary()
        # User-agent string handed to the parser's ``is_allowed`` check.
        self._user_agent = "Mozilla/5.0"

    def check_hostname(self, html_task):
        """Return True if rules for ``html_task._homesiteurl`` are cached."""
        return bool(self._hostname_pool.has_key(html_task._homesiteurl))

    def is_allowed(self, html_task):
        """Return whether ``html_task._url`` may be fetched.

        On a cache miss a new parser fetches ``<homesite>/robots.txt`` and is
        stored in the pool; on a hit the stored parser is reused and no new
        parser is created.

        Args:
            html_task: object exposing ``_homesiteurl`` and ``_url``.

        Returns:
            The parser's verdict for ``self._user_agent`` and the task URL.
            If anything raises along the way, the URL is treated as allowed
            (fail-open) and True is returned.
        """
        try:
            # NOTE: this changes the default timeout for all new sockets in
            # the process, not just the robots.txt request.
            socket.setdefaulttimeout(self._ROBOTS_TIMEOUT)

            home_site = html_task._homesiteurl
            if self.check_hostname(html_task):
                rules = self._hostname_pool.valueofkey(home_site)
            else:
                # Only build a parser when one actually has to be fetched.
                rules = RobotExclusionRulesParser()
                rules.fetch(home_site + "/robots.txt", self._ROBOTS_TIMEOUT)
                # Cached only after a successful fetch; a failed fetch leaves
                # the pool untouched so the next call tries again.
                self._hostname_pool.addorupdate(home_site, rules)
            return rules.is_allowed(self._user_agent, html_task._url)
        except Exception:
            # Deliberate best-effort: an unreachable or broken robots.txt
            # must not block crawling.
            return True
Exemple #2
0
class EarlyVisitHandler(object):
    """Keep track of pages that have already been recorded."""

    def __init__(self):
        """Create the empty table (original note: normalized url md5 -> html)."""
        self._visited_dic = SafeDictionary()

    def check_visited(self, html_task):
        """Return True when ``html_task._md5`` is already a key in the table."""
        already_seen = self._visited_dic.has_key(html_task._md5)
        return True if already_seen else False

    def add_entry(self, key, value):
        """Insert ``value`` under ``key``, replacing any existing entry."""
        table = self._visited_dic
        table.addorupdate(key, value)
class SameIpHandler(object):
    """Answer whether a task's IP address is already present in the table."""

    def __init__(self):
        """Create the empty table (original note: key ip, value hostname)."""
        self._ip_dic = SafeDictionary()

    def check_same_ip(self, html_task):
        """Return True when ``html_task._ip`` is already a key in the table."""
        known = self._ip_dic.has_key(html_task._ip)
        return True if known else False
 def __init__(self):
     """Create the empty table (original note: key ip, value hostname)."""
     self._ip_dic = SafeDictionary()
Exemple #5
0
 def __init__(self):
     """Create the empty table (original note: normalized url md5 -> html)."""
     self._visited_dic = SafeDictionary()
Exemple #6
0
 def __init__(self):
     """Create an empty hostname pool and set the fixed user-agent string."""
     self._user_agent = "Mozilla/5.0"
     self._hostname_pool = SafeDictionary()