Example #1
0
 def __init__(
         self,
         log,
         roots,
         exclude=None,
         strict=True,  # What to crawl.
         max_redirect=10,
         max_tries=4,  # Per-url limits.
         max_tasks=10,
         max_pool=10,  # Global limits.
 ):
     """Store the crawl settings, derive the accepted hosts, queue the roots.

     Args:
         log: Stored as ``self.log`` and handed to ``ConnectionPool``.
         roots: Iterable of root URLs. It is walked twice: once to collect
             host names into ``self.root_domains`` and once to queue every
             URL through ``add_url``.
         exclude: Stored as ``self.exclude``.
         strict: When true, every named host is accepted together with its
             ``www.`` counterpart; otherwise only the last two
             dot-separated labels of the host are kept.
         max_redirect: Stored as ``self.max_redirect``.
         max_tries: Stored as ``self.max_tries``.
         max_tasks: Size of the ``governor`` semaphore; also passed to
             ``ConnectionPool``.
         max_pool: Passed to ``ConnectionPool``.
     """
     # Plain configuration, kept verbatim on the instance.
     self.log = log
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.max_pool = max_pool

     # URL bookkeeping tables, all initially empty.
     self.todo = {}
     self.busy = {}
     self.done = {}
     self.pool = ConnectionPool(self.log, max_pool, max_tasks)

     # Collect every host name that counts as "inside" the crawl.
     self.root_domains = set()
     for root_url in roots:
         netloc = urlparse.urlparse(root_url).netloc
         host, _port = urllib_splitport(netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):
             # Hosts made only of digits and dots are kept exactly as given.
             self.root_domains.add(host)
             continue
         host = host.lower()
         if not self.strict:
             # Lenient mode: keep just the last two labels of the name.
             labels = host.split('.')
             if len(labels) > 2:
                 host = '.'.join(labels[-2:])
             self.root_domains.add(host)
             continue
         # Strict mode: the host itself plus its with/without-"www." twin.
         if host.startswith('www.'):
             twin = host[4:]
         else:
             twin = 'www.' + host
         self.root_domains.update((host, twin))

     # Queue the roots only after all root domains are known.
     for root_url in roots:
         self.add_url(root_url)

     self.governor = asyncio.locks.Semaphore(max_tasks)
     self.termination = asyncio.locks.Condition()
     self.t0 = time.time()  # Timestamp taken at construction.
     self.t1 = None
Example #2
0
 def __init__(self, log,
              roots, exclude=None, strict=True,  # What to crawl.
              max_redirect=10, max_tries=4,  # Per-url limits.
              max_tasks=10, max_pool=10,  # Global limits.
              ):
     """Record the settings, compute ``root_domains`` and queue the roots.

     ``roots`` is iterated twice: first to work out which host names are
     accepted, then to pass each root URL to ``add_url``.  With ``strict``
     true a named host is accepted with and without a leading ``www.``;
     with ``strict`` false only its last two dot-separated labels are
     kept.  Hosts consisting solely of digits and dots are stored as-is.
     ``max_tasks`` sizes the ``governor`` semaphore and, together with
     ``log`` and ``max_pool``, is forwarded to ``ConnectionPool``.  The
     remaining arguments are simply stored on the instance.
     """
     self.log = log
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.max_pool = max_pool
     self.todo, self.busy, self.done = {}, {}, {}
     self.pool = ConnectionPool(self.log, max_pool, max_tasks)

     accepted = self.root_domains = set()
     for seed in roots:
         hostname, _ = urllib_splitport(urlparse.urlparse(seed).netloc)
         if not hostname:
             continue
         if re.match(r'\A[\d\.]*\Z', hostname):
             # Digits-and-dots host: no case folding, no aliases.
             names = [hostname]
         else:
             hostname = hostname.lower()
             if self.strict:
                 if hostname.startswith('www.'):
                     names = [hostname, hostname[4:]]
                 else:
                     names = [hostname, 'www.' + hostname]
             else:
                 # Slicing the last two labels leaves names that already
                 # have two labels or fewer unchanged.
                 names = ['.'.join(hostname.split('.')[-2:])]
         accepted.update(names)

     # add_url runs only after the full set of root domains is in place.
     for seed in roots:
         self.add_url(seed)

     self.governor = asyncio.locks.Semaphore(max_tasks)
     self.termination = asyncio.locks.Condition()
     self.t0 = time.time()
     self.t1 = None
Example #3
0
 def add_url(self, url, max_redirect=None):
     """Queue ``url`` in ``self.todo`` unless it is rejected or already known.

     A URL is rejected when it matches ``self.exclude``, when its scheme
     is neither ``http`` nor ``https``, when ``self.host_okay`` refuses
     its host, or when it is already present in ``todo``, ``busy`` or
     ``done``.

     Args:
         url: The URL to consider.
         max_redirect: Value stored for the URL in ``self.todo``; falls
             back to ``self.max_redirect`` when ``None``.

     Returns:
         ``True`` if the URL was added to ``self.todo``, else ``False``.
     """
     is_excluded = self.exclude and re.search(self.exclude, url)
     if is_excluded:
         return False

     parsed = urlparse.urlparse(url)
     if parsed.scheme not in ('http', 'https'):
         self.log(2, 'skipping non-http scheme in', url)
         return False

     # Only the host part matters here; the port is ignored.
     host, _port = urllib_splitport(parsed.netloc)
     if not self.host_okay(host):
         self.log(2, 'skipping non-root host in', url)
         return False

     budget = self.max_redirect if max_redirect is None else max_redirect

     already_known = (
         url in self.todo or url in self.busy or url in self.done
     )
     if already_known:
         return False

     self.log(1, 'adding', url, budget)
     self.todo[url] = budget
     return True
Example #4
0
 def add_url(self, url, max_redirect=None):
     """Add ``url`` to the todo table if it is acceptable and not yet seen.

     Returns ``True`` when the URL was newly recorded in ``self.todo``
     (mapped to ``max_redirect``, or to ``self.max_redirect`` if that
     argument is ``None``).  Returns ``False`` when the URL matches the
     ``self.exclude`` pattern, uses a scheme other than http/https, has a
     host that ``self.host_okay`` rejects, or is already in ``todo``,
     ``busy`` or ``done``.
     """
     if self.exclude:
         if re.search(self.exclude, url):
             return False

     pieces = urlparse.urlparse(url)
     if pieces.scheme not in ('http', 'https'):
         self.log(2, 'skipping non-http scheme in', url)
         return False

     hostname = urllib_splitport(pieces.netloc)[0]
     if not self.host_okay(hostname):
         self.log(2, 'skipping non-root host in', url)
         return False

     if max_redirect is None:
         max_redirect = self.max_redirect

     # Record the URL only when none of the three tables knows it yet.
     if (url not in self.todo
             and url not in self.busy
             and url not in self.done):
         self.log(1, 'adding', url, max_redirect)
         self.todo[url] = max_redirect
         return True
     return False