def __init__(self, starturl, callback, callpre=None, callfail=None,
             concount=MAXCONCOUNT, depth=2, accept_url_patterns=None,
             reject_url_patterns=None):
    """Initialize the crawler state for a crawl rooted at *starturl*.

    Args:
        starturl: seed URL; its base URL constrains which links are followed.
        callback: hook invoked for each successfully crawled page.
        callpre: optional hook invoked before fetching a URL.
        callfail: optional hook invoked when fetching a URL fails.
        concount: maximum number of concurrent connections.
        depth: maximum crawl depth from the seed URL.
        accept_url_patterns: regex patterns a URL must match to be crawled
            (presumably joined with '|'; see reject_url — TODO confirm).
        reject_url_patterns: regex patterns that exclude a URL from crawling.
    """
    self.concount = concount
    self.callback = callback
    self.callpre = callpre
    self.callfail = callfail
    self.depth = depth
    self.starturl = starturl
    self.baseurl = URL.baseurl(starturl)
    self.urls = []                      # frontier of URLs still to fetch
    self.crawled = {}                   # URLs already visited
    self.link_title_db = LinkTitleDB()
    self.accept_url_patterns = accept_url_patterns
    self.reject_url_patterns = reject_url_patterns
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    self.referer = starturl
    try:
        # Best-effort: a missing or unreadable robots.txt must not abort
        # construction of the crawler.
        self.robotstxt.read()
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; the failure is logged and otherwise ignored.
        logger.debug(Traceback())
def reject_url(self, url):
    """Return True if *url* should not be crawled.

    A URL is rejected when any of the following holds:
      * it is off-site (its base URL differs from the seed's base URL);
      * accept patterns are configured and the URL matches none of them;
      * reject patterns are configured and the URL matches one of them.

    Fixes the original single-expression version, whose `and`/`or`
    short-circuiting (a) never pattern-filtered same-domain URLs,
    (b) rejected every URL as soon as reject_url_patterns was non-empty
    without ever running its regex, and (c) raised TypeError on
    `'|'.join(None)` when reject_url_patterns was None.
    """
    # Off-site links are never followed.
    if self.baseurl != URL.baseurl(url):
        return True
    # Whitelist: if accept patterns exist, the URL must match one.
    if self.accept_url_patterns and not re.match('|'.join(self.accept_url_patterns), url):
        return True
    # Blacklist: if reject patterns exist, a match excludes the URL.
    if self.reject_url_patterns and re.match('|'.join(self.reject_url_patterns), url):
        return True
    return False