def __init__(self, log,
             roots, exclude=None, strict=True,  # What to crawl.
             max_redirect=10, max_tries=4,  # Per-url limits.
             max_tasks=10, max_pool=10,  # Global limits.
             ):
    """Store the crawl settings, derive the root domains and queue the roots.

    Args:
        log: Stored as ``self.log`` and handed to the ConnectionPool.
        roots: Iterable of root URLs.  It is iterated twice: once to
            collect the root domains and once to queue each URL.
        exclude: Stored as ``self.exclude``; not interpreted here.
        strict: When true, every root host is accepted both with and
            without a leading ``www.``.  When false, a root host is
            reduced to its last two dot-separated labels.
        max_redirect: Stored as ``self.max_redirect``.
        max_tries: Stored as ``self.max_tries``.
        max_tasks: Passed to the ConnectionPool and used as the initial
            value of the ``governor`` semaphore.
        max_pool: Passed to the ConnectionPool.
    """
    self.log = log
    self.roots = roots
    self.exclude = exclude
    self.strict = strict
    self.max_redirect = max_redirect
    self.max_tries = max_tries
    self.max_tasks = max_tasks
    self.max_pool = max_pool

    # URL bookkeeping dicts; add_url() is what fills self.todo.
    self.todo = {}
    self.busy = {}
    self.done = {}
    self.pool = ConnectionPool(self.log, max_pool, max_tasks)

    # Work out which hosts count as "root" hosts.
    self.root_domains = set()
    for root_url in roots:
        netloc = urlparse.urlparse(root_url).netloc
        hostname, _port = urllib_splitport(netloc)
        if not hostname:
            continue

        # Hosts made up solely of digits and dots are kept verbatim.
        if re.match(r'\A[\d\.]*\Z', hostname):
            self.root_domains.add(hostname)
            continue

        hostname = hostname.lower()
        if not self.strict:
            # Lenient mode: keep only the last two labels of the host.
            labels = hostname.split('.')
            if len(labels) > 2:
                hostname = '.'.join(labels[-2:])
            self.root_domains.add(hostname)
            continue

        # Strict mode: accept the host plus its www./non-www. twin.
        self.root_domains.add(hostname)
        if hostname.startswith('www.'):
            twin = hostname[4:]
        else:
            twin = 'www.' + hostname
        self.root_domains.add(twin)

    # Queue the roots only after every root domain has been collected.
    for root_url in roots:
        self.add_url(root_url)

    self.governor = asyncio.locks.Semaphore(max_tasks)
    self.termination = asyncio.locks.Condition()
    self.t0 = time.time()  # Wall-clock time at construction.
    self.t1 = None
def __init__(self, log,
             roots, exclude=None, strict=True,  # What to crawl.
             max_redirect=10, max_tries=4,  # Per-url limits.
             max_tasks=10, max_pool=10,  # Global limits.
             ):
    """Record the crawl configuration, compute root domains, queue roots.

    Args:
        log: Kept as ``self.log`` and also passed to the ConnectionPool.
        roots: Iterable of root URLs; walked once to build
            ``self.root_domains`` and a second time to call ``add_url``.
        exclude: Kept as ``self.exclude``; not interpreted here.
        strict: If true, each root host is registered together with its
            ``www.``-prefixed or ``www.``-stripped counterpart.  If
            false, only the last two dot-separated labels of each root
            host are registered.
        max_redirect: Kept as ``self.max_redirect``.
        max_tries: Kept as ``self.max_tries``.
        max_tasks: Given to the ConnectionPool and used as the starting
            value of the ``governor`` semaphore.
        max_pool: Given to the ConnectionPool.
    """
    self.log = log
    self.roots = roots
    self.exclude = exclude
    self.strict = strict
    self.max_redirect = max_redirect
    self.max_tries = max_tries
    self.max_tasks = max_tasks
    self.max_pool = max_pool

    # Per-URL state dicts; self.todo is populated through add_url().
    self.todo = {}
    self.busy = {}
    self.done = {}
    self.pool = ConnectionPool(self.log, max_pool, max_tasks)

    # Collect the set of hosts treated as roots.
    self.root_domains = set()
    for root_url in roots:
        netloc = urlparse.urlparse(root_url).netloc
        hostname, _port = urllib_splitport(netloc)
        if not hostname:
            continue

        # A host consisting only of digits and dots is stored unchanged.
        if re.match(r'\A[\d\.]*\Z', hostname):
            self.root_domains.add(hostname)
            continue

        hostname = hostname.lower()
        if not self.strict:
            # Non-strict: trim the host down to its final two labels.
            labels = hostname.split('.')
            if len(labels) > 2:
                hostname = '.'.join(labels[-2:])
            self.root_domains.add(hostname)
            continue

        # Strict: register the host and its www./non-www. counterpart.
        self.root_domains.add(hostname)
        if hostname.startswith('www.'):
            counterpart = hostname[4:]
        else:
            counterpart = 'www.' + hostname
        self.root_domains.add(counterpart)

    # Roots are queued only once all root domains are known.
    for root_url in roots:
        self.add_url(root_url)

    self.governor = asyncio.locks.Semaphore(max_tasks)
    self.termination = asyncio.locks.Condition()
    self.t0 = time.time()  # Wall-clock time at construction.
    self.t1 = None
def add_url(self, url, max_redirect=None):
    """Queue *url* in the todo dict unless it is filtered out or known.

    A URL is rejected when it matches ``self.exclude``, when its scheme
    is neither ``http`` nor ``https``, when ``self.host_okay`` refuses
    its host, or when it is already present in ``self.todo``,
    ``self.busy`` or ``self.done``.

    Args:
        url: The URL to queue.
        max_redirect: Value stored for the URL in ``self.todo``;
            defaults to ``self.max_redirect`` when None.

    Returns:
        True if the URL was added to ``self.todo``, False otherwise.
    """
    if self.exclude and re.search(self.exclude, url):
        return False

    parsed = urlparse.urlparse(url)
    if parsed.scheme not in ('http', 'https'):
        self.log(2, 'skipping non-http scheme in', url)
        return False

    hostname, _port = urllib_splitport(parsed.netloc)
    if not self.host_okay(hostname):
        self.log(2, 'skipping non-root host in', url)
        return False

    redirects_left = (
        self.max_redirect if max_redirect is None else max_redirect
    )

    # Checked in the order todo, busy, done; stops at the first hit.
    known = any(
        url in bucket for bucket in (self.todo, self.busy, self.done)
    )
    if known:
        return False

    self.log(1, 'adding', url, redirects_left)
    self.todo[url] = redirects_left
    return True