예제 #1
0
 def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
     """Initialize the crawler.

     Args:
         starturl: URL the crawl starts from; also used as the initial
             referer and as the base for resolving /robots.txt.
         callback: called for each successfully crawled page.
         callpre: optional hook invoked before a URL is fetched.
         callfail: optional hook invoked when a fetch fails.
         concount: maximum number of concurrent connections.
         depth: maximum crawl depth from the start URL.
         accept_url_patterns: optional regex pattern list; off-site URLs
             must match one of these to be crawled.
         reject_url_patterns: optional regex pattern list; matching URLs
             are skipped.
     """
     self.concount = concount
     self.callback = callback
     self.callpre = callpre
     self.callfail = callfail
     self.depth = depth
     self.starturl = starturl
     self.baseurl = URL.baseurl(starturl)
     self.urls = []
     self.crawled = {}
     self.link_title_db = LinkTitleDB()
     self.accept_url_patterns = accept_url_patterns
     self.reject_url_patterns = reject_url_patterns
     self.robotstxt = RobotFileParser()
     self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
     self.referer = starturl
     # Fetching robots.txt is best-effort: a failure is logged and the
     # crawl proceeds.  Catch Exception (not bare except:) so that
     # KeyboardInterrupt / SystemExit still propagate.
     try:
         self.robotstxt.read()
     except Exception:
         logger.debug(Traceback())
예제 #2
0
 def reject_url(self, url):
     """Return True if *url* should NOT be crawled.

     Rules (same-base URLs are always crawled, as in the original):
       - a URL on the same base as the start URL is never rejected;
       - an off-site URL is rejected unless it matches one of
         ``accept_url_patterns``;
       - an accepted off-site URL is still rejected when it matches one
         of ``reject_url_patterns``.

     Fixes two defects in the previous one-line expression: it raised
     TypeError ('|'.join(None)) when an accept pattern matched but
     ``reject_url_patterns`` was None, and it rejected every accepted
     URL whenever a reject list merely existed, without matching it.
     """
     if self.baseurl == URL.baseurl(url):
         return False
     # Off-site: must be explicitly whitelisted by an accept pattern.
     if not (self.accept_url_patterns and re.match('|'.join(self.accept_url_patterns), url)):
         return True
     # Whitelisted off-site URL: still honor the blacklist, if any.
     return bool(self.reject_url_patterns and re.match('|'.join(self.reject_url_patterns), url))