import re
import threading
import urlparse
import robotparser

import mechanize

# The following names are assumed to come from project-internal modules:
# log, URLUtil, RobotsTextRules, USER_AGENT_ZUMBOT, MAX_ROBOTS_CACHE_SIZE,
# URL_TYPE_TP, URL_TYPE_DR

class RobotsValidator:
    """
    An object that handles checking if we should fetch and crawl a
    specific URL. This is based on the type of the URL (only crawl
    http URLs) and robot rules. Maintains a cache of robot rules
    already fetched.
    """

    def __init__(self, user_agents=[USER_AGENT_ZUMBOT], match_url=".+"):
        """
        Init RobotsValidator class.

        If no arguments are given, initialize with
        user_agents=[USER_AGENT_ZUMBOT] and match_url=".+" (a regular
        expression). user_agents must be a list or a tuple.
        """
        assert type(user_agents) in (list, tuple)
        self.robots = {}  # Dict of robots.txt URLs to robot rules
        self.match_url = re.compile(match_url)
        self.user_agents = user_agents
        self.lock = threading.Lock()
        self.url_util = URLUtil()
        self.blocked = False
        self.delay = 3

    def __repr__(self):
        """ Print user_agents and internal robots cache size """
        return "input user_agents: %s\ninternal robots cache size: %d" % (
            self.user_agents, len(self.robots))

    def __str__(self):
        """ Print every cached robots.txt rule set """
        ret_str = ""
        for key in self.robots.keys():
            ret_str += "%s:\n\t%s\n" % (
                key, "\n\t".join(str(self.robots[key]).split("\n")))
        return ret_str

    def flushInternalCache(self):
        """ Flush internal cache """
        self.robots.clear()

    def flushAllCache(self):
        """ Flush all caches """
        self.flushInternalCache()

    flush = flushAllCache

    def isDisallowSite(self, url, verbose=False):
        """
        Returns (True, delay) if the site's robots.txt contains:

            User-agent: * or zumbot
            Disallow: /

        otherwise returns (False, delay).
        """
        logger = log.getLogger()
        self.delay = 3
        # Site-level robots.txt URL
        robots_site_path = urlparse.urljoin(url, "/robots.txt")
        # Download and parse robots.txt; _parsingRobotsFile sets
        # self.blocked and self.delay as side effects.
        self.blocked = False
        rules = self._parsingRobotsFile(robots_site_path)
        if self.blocked:
            return True, self.delay
        else:
            return False, self.delay

    def isCrawlable(self, url, verbose=False, detail=False):
        """
        Returns True if it is OK to crawl the absolute URL provided.
""" logger = log.getLogger() # daum.net일 경우 무조건 True (scheme, netloc, path, param, query, fragment) = urlparse.urlparse(url) try: self.lock.acquire() ret = None if not url.startswith("http") or not self.match_url.match(url): if verbose: logger.info("not match with 'http' or match_url: %s" % self.match_url.match(url)) ret = False if not ret: # clear internal cache if exceed MAX_ROBOTS_CACHE_SIZE if len(self.robots) > MAX_ROBOTS_CACHE_SIZE: self.robots.clear() #logger.debug("++ isCrawlable: A") robot_rules = self._getRules(url, verbose) #logger.debug("++ isCrawlable: B") if verbose: try: logger.info("URL: %s" % url) logger.info("ROBOT_RULES: \n%s" % robot_rules) #logger.info(" RULE: %s" % robot_rules.robots_file) except Exception, msg: logger.info("Exception: %s" % msg) parser = robotparser.RobotFileParser() parser.parse(robot_rules.robots_file) #logger.debug("++ isCrawlable: C") if robot_rules.is_useragent_configured: is_crawlable = True for user_agent in self.user_agents: is_crawlable = is_crawlable and parser.can_fetch(user_agent, url) if verbose: logger.info("maching: %s" % user_agent) logger.info("is_crawlable: %s" % is_crawlable) if verbose: logger.info("return %s" % is_crawlable) #return is_crawlable ret = is_crawlable else: #logger.debug("is_crawlable2: %s" % is_crawlable) is_crawlable = parser.can_fetch("*", url) if verbose: logger.info("is_crawlable3: %s" % is_crawlable) logger.info("is_crawlable4: %s" % parser.can_fetch("*", url)) logger.info("return2 %s" % is_crawlable) ret = is_crawlable except: self.lock.release() if verbose: logger.debug("++ robots_lock release") else: self.lock.release() if verbose: logger.debug("++ robots_lock release") if not ret: ret = False # URLType이 TOP/Directory일 경우에는 무조건 방문한다(08/07/08) # TP가 False에서 True로 바꼈는지 여부를 판단하기 위해서 위치 수정(08/11/26) try: forced = False url_type = self.url_util.getURLType(url) if url_type in (URL_TYPE_TP, URL_TYPE_DR) and ret == False: logger.debug("**VISIT forced, URLType: %s", url_type) ret = True forced = True except: pass #logger.debug("++ is_crawlable: FINISH") if detail: ret = (ret, forced) return ret def _getRules(self, url, verbose=False): """ Returns the RobotTextRules object for url(site-level or dir-level) First: use internal cache Second: use memcache Third: download robots.txt and parsing """ logger = log.getLogger() # 1. use stored robots dictionary cache robots_site_path = urlparse.urljoin(url, "/robots.txt") # Then the site-level if robots_site_path in self.robots: if verbose: logger.info("robotstxt in local memory: %s", robots_site_path) return self.robots[robots_site_path] # 2. use memcache rules = None try: # 3. download robots text rules = self._parsingRobotsFile(robots_site_path) # First try site-level if verbose: logger.info("robotstxt downloaded: %s: %s", rules.return_code, robots_site_path) self.robots[robots_site_path] = rules except: pass return rules def _parsingRobotsFile(self, url): """ Setup internal state after downloading robots.txt If urlopen.code in (401, 403), all user-agent is disallowed. -> 삭제 If urlopen.code >= 400, all user-agent is allowd. 
""" domain_name = urlparse.urlparse(url)[1] rules = RobotsTextRules() #getLogger().debug("++Trying to download: %s", url) opener = mechanize.build_opener(mechanize.HTTPRefreshProcessor,) rq = mechanize.Request(url) rq.add_header("User-agent", "Mozilla/5.0 (compatible; Windows NT 6.1?; ZumBot/1.0; http://help.zum.com/inquiry)") #shttp.setRequestHeader(user_agent = USER_AGENT) rs = None try: rs = opener.open(rq) header = rs.info() rules.return_code = rs.code except Exception, msg: try: if not url.startswith("http://www."): t_url = url.replace("http://", "http://www.") rq = mechanize.Request(t_url ) rq.add_header("User-agent", "Mozilla/5.0 (compatible; Windows NT 6.1?; ZumBot/1.0; http://help.zum.com/inquiry)") rs = opener.open(rq) header = rs.info() rules.return_code = rs.code except Exception, msg: return rules