def __init__(self, urls, normalizer, result_saver=print, qsize=None, result_qsize=None, num_workers=1, max_tries=4, timeout=3, max_connections=30, *, loop=None): """Async URLCleaner. :param normalizer: callable that takes url and returns normalized url or False when url is invalid or None, when url can't be validated. """ self.urls = urls self.normalizer = normalizer self.result_saver = result_saver self.loop = loop or asyncio.get_event_loop() self.q = Queue(maxsize=qsize or num_workers * 10, loop=self.loop) self.result_q = Queue(maxsize=result_qsize or num_workers * 10, loop=self.loop) self.num_workers = num_workers self.max_tries = max_tries self.timeout = timeout proxy = os.environ.get('http_proxy') if proxy: self.connector = aiohttp.ProxyConnector(proxy=proxy, limit=max_connections, loop=self.loop) else: self.connector = aiohttp.TCPConnector(limit=max_connections, loop=self.loop) self.t0 = time.time() self.t1 = None self.clean_task = None
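# The :param normalizer: contract above (URL in; normalized URL, False, or None out)
# is easiest to see with a concrete callable. This is only an illustrative sketch,
# not part of the original module.
import urllib.parse

def example_normalizer(url):
    url = url.strip()
    if not url:
        return False  # clearly invalid
    parts = urllib.parse.urlsplit(url)
    if not parts.scheme:
        return None  # cannot be validated locally; leave it for the remote check
    if parts.scheme not in ('http', 'https') or not parts.netloc:
        return False  # unsupported scheme or missing host
    # lowercase the host, default the path, drop the fragment
    return urllib.parse.urlunsplit(
        (parts.scheme, parts.netloc.lower(), parts.path or '/', parts.query, ''))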
class Fetcher: def __init__(self, loop): self.num_worker = 10 self.loop = loop self.q = Queue() self.seen_urls = set(['/']) async def manager(self): workers = [ self.loop.create_task(self.worker()) for _ in range(self.num_worker) ] # with async/await there is no need for `yield from` here await self.q.put('/') # wait until q is empty await self.q.join() for w in workers: w.cancel() async def worker(self): while True: url = await self.q.get() sock = socket.socket(socket.AF_INET) sock.setblocking(False) try: await self.loop.sock_connect(sock, ('dilbert.com', 80)) except BlockingIOError: pass request = 'GET {} HTTP/1.1\r\nHost: dilbert.com\r\nConnection: close\r\n\r\n'.format( url) await self.loop.sock_sendall(sock, request.encode('ascii')) response = b'' chunk = await self.loop.sock_recv(sock, 4096) while chunk: response += chunk chunk = await self.loop.sock_recv(sock, 4096) links = await self.parse_link(response) for link in links.difference(self.seen_urls): await self.q.put(link) self.seen_urls.update(links) self.q.task_done() sock.close() async def parse_link(self, response): links = set([]) d = pq(response) anchors = d("a") for anchor in anchors: href = anchor.get("href") if href and href[:5] == "http:" and href[7:14] == "dilbert": links.add(href[len('http://dilbert.com'):]) # keep only the path portion of absolute dilbert.com links return links
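# The Fetcher above assumes a few module-level imports (asyncio's Queue, socket, and
# pyquery's pq used by parse_link) plus a small driver. A minimal harness, under
# those assumptions, might look like this:
import asyncio
import socket
from asyncio import Queue
from pyquery import PyQuery as pq  # parse_link() feeds the raw response bytes to pq

loop = asyncio.get_event_loop()
fetcher = Fetcher(loop)
loop.run_until_complete(fetcher.manager())
loop.close()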
def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): # The lone * indicates that all following arguments are keyword-only arguments self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): # \A and \Z are similar to ^ and $, \d represents the digital.(0.0.0.0) self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None
def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) # urlparse splits the URL string into a 6-element tuple: scheme://netloc/path;parameters?query#fragment; the result also exposes username, password, hostname and port attributes. host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) # bare IP addresses such as 192.168.3.4 else: host = host.lower() # domain names are case-insensitive if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) # presumably meant to keep only the registrable domain; note that lenient_host joins the last two labels without a dot for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None
def __init__(self, root, max_tasks=1000, loop=None, file=None): LOGGER.info('Starting Crawler ...\n') self.loop = loop or asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.visited_urls = set() self.max_tasks = max_tasks self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() self.root_domains.add(host) print('Hosts : {}'.format(','.join(self.root_domains))) self.add_url(root) self.t0 = time.time() self.t1 = None filename = '{}.csv'.format(file) self.f = open(filename, 'w') self.csv = csv.writer(self.f) self.csv.writerow(CSV_HEADER)
def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None
def __init__(self, url, max_requests, loop, max_coroutines=100): self.url = url self.max_requests = max_requests self.links_visited = set() self.max_coroutines = max_coroutines self.queue = Queue() self.loop = loop
def __init__(self, targets=None, loop=None, configuration_reads=True, bus_timeout=2, iface=False, nat_mode=False): self.loop = loop or asyncio.get_event_loop() # q contains all KNXnet/IP gateways self.q = Queue(loop=self.loop) # bus_protocols is a list of all bus protocol instances for proper connection shutdown self.bus_protocols = [] # knx_gateways is a list of KnxTargetReport objects, one for each found KNXnet/IP gateway self.knx_gateways = [] self.t0 = time.time() self.t1 = None self.desc_timeout = None self.desc_retries = None self.knx_source = None self.configuration_reads = configuration_reads self.bus_timeout = bus_timeout self.iface = iface self.nat_mode = nat_mode if targets: self.set_targets(targets) else: self.targets = set()
class Crawler: def __init__(self, root_url, max_redirect): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() self.session = aiohttp.ClientSession(loop=loop) self.q.put_nowait((root_url, self.max_redirect)) @asyncio.coroutine def crawl(self): workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)]
def __init__( self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.count = 0 self.seen_urls = set() self.good_urls = set() # if os.path.exists('seenurls'): # with open('seenurls', 'r') as f: # for line in f: # self.seen_urls.add(json.loads(line)) self.done = [] # if os.path.exists('done'): # with open('done', 'r') as f: # for line in f: # data = json.loads(line) # self.record_statistic(FetchStatistic(url=data[0], # next_url=data[1], # status=data[2], # exception=data[3], # size=data[4], # content_type=data[5], # encoding=data[6], # num_urls=data[7], # num_new_urls=data[8])) self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None self.saving = True
def __init__(self, loop=None, max_tasks=None, retries=None, secure=None, hosts=None, params=None, config=None, config_file=None, access=None, secret=None, debug=None): # Set default values for kwargs. loop = asyncio.get_event_loop() if not loop else loop max_tasks = 100 if not max_tasks else max_tasks max_retries = 10 if not retries else retries protocol = 'http://' if not secure else 'https://' config = get_config(config, config_file) access = config.get('s3', {}).get('access', access) secret = config.get('s3', {}).get('secret', secret) debug = True if debug else False self.max_tasks = max_tasks self.max_retries = max_retries self.protocol = protocol self.hosts = hosts self.config = config self.access = access self.debug = debug self.cookies = config.get('cookies', {}) # Asyncio/Aiohttp settings. self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop) self.connector.update_cookies(self.cookies) self.loop = loop self.q = Queue(1000, loop=self.loop) # Require valid access key! self.assert_s3_keys_valid(access, secret) # Rate limiting. self._max_per_second = self.get_global_rate_limit() self._min_interval = 1.0 / float(self._max_per_second) self._last_time_called = 0.0
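# __init__ above only sets up the throttling state (_max_per_second, _min_interval,
# _last_time_called); the consuming code is not shown. A minimal sketch of how such
# a throttle is commonly implemented (standalone class, names are illustrative):
import asyncio
import time

class RateLimiter:
    def __init__(self, max_per_second):
        self._min_interval = 1.0 / float(max_per_second)
        self._last_time_called = 0.0

    async def wait(self):
        # Sleep just long enough to keep calls at least _min_interval apart.
        elapsed = time.monotonic() - self._last_time_called
        if elapsed < self._min_interval:
            await asyncio.sleep(self._min_interval - elapsed)
        self._last_time_called = time.monotonic()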
class Crawler: def __init__(self, root_url, max_redirect): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() self.session = aiohttp.ClientSession(loop=loop) self.q.put_nowait((root_url, self.max_redirect)) @asyncio.coroutine def crawl(self): workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)] yield from self.q.join() for w in workers: w.cancel() @asyncio.coroutine def work(self): while True: url, max_redirect = yield from self.q.get() yield from self.fetch(url, max_redirect) self.q.task_done() # on threading see: https://segmentfault.com/q/1010000009765115 @asyncio.coroutine def fetch(self, url, max_redirect): response = yield from self.session.get(url, allow_redirects=False) try: if is_redirect(response): if max_redirect > 0: next_url = response.headers['location'] if next_url in self.seen_urls: return self.seen_urls.add(next_url) self.q.put_nowait((next_url, max_redirect - 1)) else: links = yield from self.parse_links(response) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release()
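# fetch() above relies on an is_redirect() helper that is not shown in the snippet.
# A minimal definition consistent with its use alongside allow_redirects=False:
def is_redirect(response):
    # 3xx statuses whose Location header the crawler follows by hand.
    return response.status in (300, 301, 302, 303, 307)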
def __init__(self, root_url, max_redirect): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() self.session = aiohttp.ClientSession(loop=loop) self.q.put_nowait((root_url, self.max_redirect))
def __init__(self, max_tries=30, max_tasks=10, timeout=5, rootDir=os.getcwd()): self.max_tries = max_tries self.max_tasks = max_tasks self.loop = asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.session = aiohttp.ClientSession(loop=self.loop) self.timeout = timeout self.rootDir = rootDir
def __init__(self, max_tasks=20, max_redirect=10): self.max_tasks = max_tasks self.max_redirect = max_redirect self.q = Queue() # aiohttp's ClientSession does connection pooling and # HTTP keep-alives for us. loop = asyncio.get_event_loop() self.session = aiohttp.ClientSession(loop=loop) loop.run_until_complete(self.fetch())
def __init__( self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=15, scrape_nonhtml=False, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.request_timeout = 15 self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.ua = fake_useragent.UserAgent() self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self.scrape_nonhtml = scrape_nonhtml self.dbdsn = 'dbname=osint user=postgres host=127.0.0.1' self.dbinsertquery = 'INSERT INTO public.rawhtml' \ '(hostreversed, port, path, query, ctype, cdata, ctimestamp) ' \ 'VALUES (%(hostreversed)s, %(port)s, %(path)s, %(query)s, ' \ '%(ctype)s, %(cdata)s, %(ctimestamp)s)' self.dbroots = dict() self.dnsroots = dict() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) self.dnsroots[host] = host self.dbroots[ host] = host # TODO: get human readable form from DNS server else: host = host.lower() if self.strict: self.root_domains.add(host) self.dbroots[host] = '.'.join(reversed(host.split('.'))) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.dbpool = None self.t0 = time.time() self.t1 = None
def __init__(self, root_url: str, max_redirect: int): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() # aiohttp's ClientSession does connection pooling and HTTP keep-alive for us self.session = aiohttp.ClientSession(loop=loop) # Put (URL, max_redirect) in the queue self.q.put_nowait((root_url, self.max_redirect))
def worker(get, queue: asyncio.Queue, output): while True: item = yield from queue.get() # This is horrible and I feel bad for writing it, believe me try: if item is None: return chunks, id = item for i in range(id, id + chunks): try: data = yield from get("item/{}".format(i)) output(data) except Exception: pass except Exception: pass finally: queue.task_done()
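# The worker above expects (chunks, start_id) tuples and one None sentinel per worker
# to shut down. A hypothetical producer for that queue (names are illustrative):
import asyncio

async def produce(queue, total_items, chunk_size, num_workers):
    for start in range(0, total_items, chunk_size):
        await queue.put((chunk_size, start))   # matches `chunks, id = item` above
    for _ in range(num_workers):
        await queue.put(None)                  # tell each worker to exit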
def __init__(self, root_url, max_redirect): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() # aiohttp's ClientSession does connection pooling and # HTTP keep-alives for us. self.session = aiohttp.ClientSession(loop=loop) # Put (URL, max_redirect) in the queue. self.q.put_nowait((root_url, self.max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all work is done.""" workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)] # When all work is done, exit. yield from self.q.join() for w in workers: w.cancel() @asyncio.coroutine def work(self): while True: url, max_redirect = yield from self.q.get() # Download page and add new links to self.q. yield from self.fetch(url, max_redirect) self.q.task_done() # Begin fetching http://xkcd.com/353/ fetcher = Fetcher('/353/') Task(fetcher.fetch()) loop = asyncio.get_event_loop() crawler = crawling.Crawler('http://xkcd.com', max_redirect=10) loop.run_until_complete(crawler.crawl())
class WebCrawler(object): def __init__(self, url, max_requests, loop, max_coroutines=100): self.url = url self.max_requests = max_requests self.links_visited = set() self.max_coroutines = max_coroutines self.queue = Queue() self.loop = loop @asyncio.coroutine def work(self): while True: url = yield from self.queue.get() fetcher = Fetcher(url, self) yield from fetcher.connect() self.queue.task_done() @asyncio.coroutine def web_crawler(self): self.queue.put_nowait(self.url) self.session = aiohttp.ClientSession(loop=self.loop) workers = [ asyncio.Task(self.work()) for _ in range(self.max_coroutines) ] yield from self.queue.join() for worker in workers: worker.cancel() yield from self.session.close()
def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None): get_domain(roots) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) with open(path, 'w') as temp_file: print('writing') temp_file.write('Domain name:') temp_file.write(roots) temp_file.write('\n \n') temp_file.close() self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.connector = aiohttp.TCPConnector(loop=self.loop) self.root_domains = set() # for root in roots: # parts = urllib.parse.urlparse(root) # host, port = urllib.parse.splitport(parts.netloc) # if not host: # continue # if re.match(r'\A[\d\.]*\Z', host): # self.root_domains.add(host) # else: ## host = host.lower() # if self.strict: # self.root_domains.add(host) # else: # self.root_domains.add(lenient_host(host)) # for root in roots: # print("true root") # print(root) # self.add_url(root) self.add_url(roots) self.t0 = time.time() self.t1 = None
def __init__(self, task): self.seen_url = set() self.max_tasks = 50 self.max_retry = 10 self.loop = asyncio.get_event_loop() self.session = aiohttp.ClientSession(loop=self.loop) self.q = Queue(loop = self.loop) self.manager = Manager() self.q.put_nowait(task) """ for debug """ self.item_cnt = 0 self.page_cnt = 0 self.f = open("D:\\Acer", 'w') print('initialization finished.')
def __init__(self, loop, pre_url=None, pre_payload=None, target_url=None, login=None, payload_model=None, wordlist=None, error_string=None, tasks=64, tor=None, tor_address=None, debug=None): self.max_tasks = tasks self.queue = Queue() self.pre_url = pre_url self.pre_payload = self._generate_payload_type(pre_payload) self.attack_url = target_url self.login = login self.error_string = [x.strip() for x in error_string.split(',')] self.payload = self._generate_payload_type(payload_model) self.wordlist = wordlist self.found = Event() self.tor_use = tor #self.session = self._generate_new_session(loop) self.debug = debug self.runned_passwords = set() self.old_passwds = set() self.restore_files = [] self.progress_bar = None self.ua = self._prepare_user_agents() self.start_time = time.time() self.last_report_time = time.time() # Statuses set of settings self.loaded_passwords = 0 self.tried_passwords = 0 self.error_passwords = 0 self.max_passwords = 0 # Tor set of settings if self.tor_use is not None and tor_address is not None: ip, port = parse_proxy_address(tor_address) self.tor_address = "http://{0}:{1}".format(ip, port) self.tor_address_string = tor_address # Session set of settings self.session_name = self._generate_session_name() restore_files = self._search_open_sesssion() if restore_files > 0: for file in self.restore_files: if self._load_old_session(file) is True: break else: pass
def __init__(self, start_date, end_date, max_tasks=10, max_tries=10, loop=None): self.start_date = start_date self.end_date = end_date self.max_tasks = max_tasks self.max_tries = max_tries self.loop = loop or asyncio.get_event_loop() self.session = aiohttp.ClientSession(loop=self.loop) self.q = Queue(loop=self.loop) self.t0 = time.time() self.t1 = None self.make_url_queue()
def __init__(self, loop, root_urls, url_filter, max_tries=4, max_redirects=10, sleep_interval=0): self._loop = loop self._max_tries = max_tries self._max_redirects = max_redirects self._sleep_interval = sleep_interval self._session = None self._url_filter = url_filter # get queue ready self._url_queue = Queue(loop=loop) # add root URLs to URL queue for url in root_urls: self.add_a_task(url, 0, 0)
def __init__(self, roots, # What to crawl. exclude=None, include=None, output=None, strict=True, count=None, proxy=None, max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None, no_parse_links=False): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.include = include self.output = output self.count = int(count) if count else None self.strict = strict self.proxy = proxy self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.task_exit_counter = 0 self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self.no_parse_links = no_parse_links for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None self.output_file = self.get_file()
def __init__( self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) # work queue; URLs are put here for the workers to fetch self.seen_urls = set() self.done = [] # list of completed fetches; each element is a FetchStatistic namedtuple self.session = aiohttp.ClientSession(loop=self.loop) # one shared session for all workers (single-threaded I/O) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse( root) # returns 6 parts, including netloc (host+port) host, port = urllib.parse.splitport( parts.netloc) # www.baidu.com, 80 if not host: continue if re.match(r'\A[\d\.]*\Z', host): # host is all digits and dots (an IP address) self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: # drop the leading www. self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) # add url to seen_urls set self.t0 = time.time() # start time self.t1 = None # end time
def __init__(self, targets=None, max_workers=100, loop=None): self.loop = loop or asyncio.get_event_loop() # The number of concurrent workers for discovering KNXnet/IP gateways self.max_workers = max_workers # q contains all KNXnet/IP gateways self.q = Queue(loop=self.loop) # bus_queues is a dict containing a bus queue for each KNXnet/IP gateway self.bus_queues = dict() # bus_protocols is a list of all bus protocol instances for proper connection shutdown self.bus_protocols = list() # knx_gateways is a list of KnxTargetReport objects, one for each found KNXnet/IP gateway self.knx_gateways = list() # bus_devices is a list of KnxBusTargetReport objects, one for each found bus device self.bus_devices = set() self.bus_info = False self.t0 = time.time() self.t1 = None if targets: self.set_targets(targets) else: self.targets = set()
def __init__( self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None ): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.connector = aiohttp.TCPConnector(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r"\A[\d\.]*\Z", host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None
def __init__(self, roots, exclude = None, strict = True, max_redirect = 10, max_tries = 4, max_tasks = 10, * ,loop = None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop = self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() print ('{}'.format(roots)) for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) print ('host: %s, port %s'%(host,port)) if not host: continue if re.match(r'\A[\d\.]*\Z', host): #match IP address self.root_domains.add(host) else: host = host.lower() if self.strict: print ('no handled!') self.root_domains.add(host) else: print ('handled!') self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None
class Fetcher(object): """ Async page fetcher that pulls seed URLs from redis and crawls them until the queue is empty. """ def __init__(self, max_tasks=20, max_redirect=10): self.max_tasks = max_tasks self.max_redirect = max_redirect self.q = Queue() # aiohttp's ClientSession does connection pooling and # HTTP keep-alives for us. loop = asyncio.get_event_loop() self.session = aiohttp.ClientSession(loop=loop) loop.run_until_complete(self.fetch()) @asyncio.coroutine def fetch(self): """ Run the fetcher until all work is done. """ # Create workers that fetch pages workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks // 2)] # Create seeders that take URLs from redis and add them to our own queue seeders = [asyncio.Task(self.get_seeds()) for _ in range(self.max_tasks // 2)] # When all work is done, exit. yield from self.q.join() for s in seeders: s.cancel() for w in workers: w.cancel() @asyncio.coroutine def work(self): while True: # Get URLs from own queue url = yield from self.q.get() # Download page yield from self.fetch_url(url) self.q.task_done() @asyncio.coroutine def fetch_url(self, url): # Handle redirects ourselves. response = yield from self.session.get( url, allow_redirects=False) try: # Handle the response pass finally: # Return connection to pool. yield from response.release() @asyncio.coroutine def get_seeds(self): while True: pass
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): # The lone * indicates that all following arguments are keyword-only arguments self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): # \A and \Z are similar to ^ and $, \d represents the digital.(0.0.0.0) self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. 
""" return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) 
self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
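# The Crawler above references a module-level lenient_host() helper and a
# FetchStatistic record that are defined elsewhere; in the aiohttp example crawler
# they look roughly like this (reproduced here as a sketch):
from collections import namedtuple

def lenient_host(host):
    # 'www.example.com' -> 'examplecom': compare only the last two labels.
    parts = host.split('.')[-2:]
    return ''.join(parts)

FetchStatistic = namedtuple('FetchStatistic',
                            ['url', 'next_url', 'status', 'exception',
                             'size', 'content_type', 'encoding',
                             'num_urls', 'num_new_urls'])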
class URLCleaner: """Preprocess and clean urls.""" def __init__(self, urls, normalizer, result_saver=print, qsize=None, result_qsize=None, num_workers=1, max_tries=4, timeout=3, max_connections=30, *, loop=None): """Async URLCleaner. :param normalizer: callable that takes url and returns normalized url or False when url is invalid or None, when url can't be validated. """ self.urls = urls self.normalizer = normalizer self.result_saver = result_saver self.loop = loop or asyncio.get_event_loop() self.q = Queue(maxsize=qsize or num_workers * 10, loop=self.loop) self.result_q = Queue(maxsize=result_qsize or num_workers * 10, loop=self.loop) self.num_workers = num_workers self.max_tries = max_tries self.timeout = timeout proxy = os.environ.get('http_proxy') if proxy: self.connector = aiohttp.ProxyConnector(proxy=proxy, limit=max_connections, loop=self.loop) else: self.connector = aiohttp.TCPConnector(limit=max_connections, loop=self.loop) self.t0 = time.time() self.t1 = None self.clean_task = None def local_clean(self, url): local_clean_url = self.normalizer(url) if local_clean_url: status = 'LOCAL_OK' elif local_clean_url is False: status = 'LOCAL_INVALID' local_clean_url = None else: status = 'UNCLEANED' return URLStat(url=url, local_clean_url=local_clean_url, remote_clean_url=None, status=status, http_code=None, exception=None) @asyncio.coroutine def remote_clean(self, urlstat): """Check URL by HEAD probing it.""" tries = 0 exception = None url = urlstat.local_clean_url headers = { 'Accept-Encoding': 'identity', } while tries < self.max_tries: try: response = yield from asyncio.wait_for( aiohttp.request('head', url, allow_redirects=True, headers=headers, connector=self.connector, loop=self.loop), self.timeout, loop=self.loop) response.close() if tries > 1: logger.info('Try %r for %r success', tries, url) break except ValueError as error: # do not need to retry for these errors logger.info('For %r raised %s', url, error) tries = self.max_tries exception = error except aiohttp.HttpProcessingError as e: logger.error('Got http error for %r, exception %s', url, e) urlstat.http_code = e.code urlstat.status = 'REMOTE_ERROR' urlstat.exception = e return urlstat except (aiohttp.ClientError, asyncio.TimeoutError) as error: logger.info('Try %r for %r raised %s, %s', tries, url, type(error), error) exception = error tries += 1 yield from asyncio.sleep(0.1) else: # all tries failed logger.error('all tries for %r failed, exception %s', url, exception) urlstat.status = 'REMOTE_ERROR' urlstat.exception = exception return urlstat urlstat.http_code = response.status if response.status == 200: remote_clean_url = self.normalizer(response.url) if remote_clean_url: urlstat.status = 'REMOTE_OK' urlstat.remote_clean_url = remote_clean_url elif remote_clean_url is False: urlstat.status = 'REMOTE_INVALID' else: # url requires authorization, can't clean urlstat.status = 'UNCLEANED' else: urlstat.status = 'REMOTE_INVALID' return urlstat @asyncio.coroutine def process_url(self, url): urlstat = self.local_clean(url) if urlstat.status == 'LOCAL_OK': urlstat = yield from self.remote_clean(urlstat) return urlstat def close(self): """Close resources.""" self.connector.close() @asyncio.coroutine def save_results(self): """Save cleaned URLStat.""" while True: urlstat = yield from self.result_q.get() try: self.result_saver(urlstat) except StopIteration: self.cancel() except Exception as e: # noqa logger.exception(e) self.result_q.task_done() @asyncio.coroutine def work(self): """Process queue items forever.""" while True: url = 
yield from self.q.get() urlstat = yield from self.process_url(url) self.q.task_done() yield from self.result_q.put(urlstat) @asyncio.coroutine def _clean(self): try: self.consumer = asyncio.Task(self.save_results(), loop=self.loop) self.workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.num_workers)] self.t0 = time.time() for url in self.urls: yield from self.q.put(url) yield from self.q.join() yield from self.result_q.join() self.t1 = time.time() logger.debug('Cleaning time %.2f seconds', self.t1 - self.t0) self.cancel() finally: self.close() def clean(self): """Run the cleaner until all finished.""" self.clean_task = asyncio.ensure_future(self._clean(), loop=self.loop) return self.clean_task def cancel(self): self.consumer.cancel() for w in self.workers: w.cancel() self.clean_task.cancel()
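# URLCleaner builds and then mutates URLStat records (remote_clean() assigns to
# status, http_code and exception), so URLStat must be a mutable object rather than
# a namedtuple. The real definition is not in this snippet; a plausible sketch:
class URLStat:
    def __init__(self, url, local_clean_url, remote_clean_url,
                 status, http_code, exception):
        self.url = url
        self.local_clean_url = local_clean_url
        self.remote_clean_url = remote_clean_url
        self.status = status
        self.http_code = http_code
        self.exception = exception

    def __repr__(self):
        return 'URLStat(url={!r}, status={!r})'.format(self.url, self.status)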
class Spider: def __init__(self, max_tries=30, max_tasks=10, timeout=5, rootDir=os.getcwd()): self.max_tries = max_tries self.max_tasks = max_tasks self.loop = asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.session = aiohttp.ClientSession(loop=self.loop) self.timeout = timeout self.rootDir = rootDir def close(self): self.session.close() def append_request(self, request): self.q.put_nowait(request) @asyncio.coroutine def _get_request(self): r = yield from self.q.get() return r @asyncio.coroutine def fetch(self, request_type, url, params, data): """Fetch one URL""" tries = 0 exception = None while tries < self.max_tries: try: print("try %s---->%d times" % (url, tries)) with aiohttp.Timeout(self.timeout): response = yield from self.session.get(url, params=params) if response.status == 200: content_type = response.headers.get('content-type') if content_type in CONTENT_TYPE_TEXT: with aiohttp.Timeout(self.timeout): content = yield from response.text( encoding='GBK') else: with aiohttp.Timeout(self.timeout): content = yield from response.read() break except asyncio.TimeoutError: print("timeout") except aiohttp.ClientError as client_error: print("client error") except Exception: print("unknown error") tries += 1 else: print("try %s---->more than %d times, quit" % (url, tries)) return None response.release() return content @asyncio.coroutine def _work(self): """Process queue items forever.""" try: while True: r = yield from self._get_request() content = yield from self.fetch(r.request_type, r.url, r.params, r.data) if (content): r.handle_func(content) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def work(self): yield from self._work() @asyncio.coroutine def spider(self): """run the spider until all finished""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] yield from self.q.join() for w in workers: w.cancel()
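# Spider._work() reads request_type, url, params, data and a handle_func callback
# from each queued item. A hypothetical request record that fits that usage:
import collections

Request = collections.namedtuple(
    'Request', ['request_type', 'url', 'params', 'data', 'handle_func'])

# Usage sketch:
# spider = Spider()
# spider.append_request(Request('GET', 'http://example.com', None, None, print))
# spider.loop.run_until_complete(spider.spider())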
def __init__(self, loop): self.num_worker = 10 self.loop = loop self.q = Queue() self.seen_urls = set(['/'])
class Crawler: def __init__(self, roots, exclude=None, strict=True, max_redirect=10, max_tries=4, max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): self.session.close() def host_okay(self, host): host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() urls = set(re.findall(r"""(?i)href=["']?([^\s"'<>]+)""", text)) if urls: logger.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stats = FetchStatistic(url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stats, links @asyncio.coroutine def fetch(self, url, max_redirect): tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) if tries > 1: logger.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: logger.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: logger.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic( FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: logger.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: logger.error('redirect limit reached for %r from
%r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: pass @asyncio.coroutine def work(self): try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): logger.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): logger.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): if max_redirect is None: max_redirect = self.max_redirect logger.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) def crawl(self): workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, # What to crawl. exclude=None, include=None, output=None, strict=True, count=None, proxy=None, max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None, no_parse_links=False): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.include = include self.output = output self.count = int(count) if count else None self.strict = strict self.proxy = proxy self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.task_exit_counter = 0 self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self.no_parse_links = no_parse_links for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None self.output_file = self.get_file() @asyncio.coroutine def close(self): """Close resources.""" yield from self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) def parse_text(self, url, text): ''' call callback func on route ''' route, args = router.match(url) if route: route.call(text, **args) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text(errors='ignore') # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: logger.debug('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(str(response.url), url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) # parse text self.parse_text(str(response.url), text) # do outing self.handle_output(str(response.url), text) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links def handle_output(self, url, text): if self.output: d = self.parse_output(url, text) logger.info(f'write item: {url}') outputing.do_write(self.output, d, self.output_file) def parse_output(self, url, text): html = HTML(html=text) title_ele = html.find('title', first=True) d = OrderedDict() d['title'] = title_ele.text d['url'] = url d['datetime'] = now_time() d['text'] = text return d def get_file(self): ''' generate a file name for output ''' domains = list(self.root_domains) dt = datetime.datetime.now() dt_str = dt.strftime('%Y-%m-%d %H:%M:%S') f_name = f'{domains[0]}-{dt_str}' if self.output: if self.output == 'stream': return None f_name += f'.{self.output}' return f_name @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False, proxy=self.proxy) if tries > 1: logger.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: logger.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. 
logger.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: logger.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: logger.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) # disable parse links if not self.no_parse_links: for link in links.difference(self.seen_urls): # use router to verify links if self.verify_url(link) or router.verify_url(link, url): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) except Exception as ex: logger.error(f'parse error: {url}') logger.exception(ex) finally: yield from asyncio.sleep(1) yield from response.release() @asyncio.coroutine def exit_on_empty_queue(self): if self.count and len(self.done) >= self.count: logger.warning(f'reach count: {self.count}, now quit') router.stop() if self.q.qsize() == 0: logger.warning('empty queue, now quit') yield from self.q.join() router.stop() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while router.is_running(): url, max_redirect = yield from self.q.get() logger.debug(f'work on url {url}') assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() yield from self.exit_on_empty_queue() except asyncio.CancelledError: logger.warning('canceling the worker') def url_allowed(self, url): parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): # logger.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): # logger.debug('skipping non-root host in %r', url) return False return True def verify_url(self, url): if self.include: for pattern in self.include: if re.search(pattern, url): logger.debug( f'{url} match include pattern: {pattern}, allowed') return True if self.exclude and re.search(self.exclude, url): logger.debug( f'{url} match exclude pattern: {self.exclude}, rejected') return False # default False return False def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect logger.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" try: workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() # yield from asyncio.gather(*workers, loop=self.loop, return_exceptions=True) yield from router.quit_event.wait() for w in workers: w.cancel() self.t1 = time.time() except asyncio.CancelledError: logger.warning('canceling the crawler') finally: logger.warning('closing the crawler') yield from self.close()
class Crawl: def __init__(self, roots, exclude=None, strict=True, base_url=None, max_redirect=2, max_tries=2, max_tasks=1, proxy=None, *, loop=None): self.base_url = base_url self.t0 = time.time() self.t1 = None self.strict = strict self.exclude = exclude self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.loop = loop or asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.proxy = proxy self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self.seen_urls = set() self.done = [] for root in roots: parts = urlparse(root) host, port = splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) def add_url(self, url, max_redirect=None): if max_redirect is None: max_redirect = self.max_redirect self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) async def parse_links(self, response): links = set() content_type = None encoding = None price_link = [] if response.status == 200: content_type = response.headers.get('content-type') response_url = str(response.url) if content_type in ('text/html', 'application/xml', 'text/html;charset=UTF-8'): pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = await response.text() # print(text) # generic fallback pattern: (?i)href=["']([^\s"'<>]+) urls = set( re.findall( '<li style="margin-left: [-\d]+px">.*?<a href="(/s/ref=lp_\d+_nr_n_[\d+].*?)">.*?<span class="refinementLink">(.*?)</span>.*?</a>.*?</li>', text, re.S | re.M)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) else: for price_g in range(1, 100, 2): low_price = price_g high_price = price_g + 1 price_link.append( "{}&low-price={}&high-price={}".format( response_url, low_price, high_price)) if len(price_link) > 0: redis_server.lpush("price_link_tmp", *price_link) for url in urls: u, t = url k = u.replace('&amp;', '&') normalized = urljoin(str(response.url), k) defragmented, frag = urldefrag(normalized) if self.url_allowed(defragmented): print(defragmented, t) ''' Children's Books(儿童图书) and General (科学通俗读物): these two categories get stuck in a redirect loop.
INFO:__main__:redirect to 'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?rh=n%3A2084813051&ie=UTF8' from 'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?fst=as%3Aoff&rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A2045366051%2Cn%3A2078652051%2Cn%3A2084813051%2Cn%3A2084839051&bbn=2084813051&ie=UTF8&qid=1511710241&rnid=2084813051' ''' LOGGER.debug( 'previous url: %s, next url: %s, title: %s', str(response.url), defragmented, t) if t == "General (科学通俗读物)": LOGGER.error("unwanted category: %r", t) else: links.add(defragmented) stat = FetchStatistic(url=response.url) return stat, links async def fetch(self, url, max_redirect): tries = 0 exception = None while tries < self.max_tries: try: headers = { 'User-Agent': FakeChromeUA.get_ua(), 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Connection': 'keep-alive' } response = await self.session.get(url, allow_redirects=False, headers=headers ) #proxy=self.p, if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: LOGGER.error('%r failed after %r tries', url, self.max_tries) return try: if is_redirect(response): location = response.headers['location'] next_url = urljoin(url, location) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: # LOGGER.info('ubuntu ing') stat, links = await self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: await response.release() async def work(self): try: while True: url, max_redirect = await self.q.get() assert url in self.seen_urls await self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host.
""" return lenient_host(host) in self.root_domains def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True async def crawl(self): """Run the crawler until all finished.""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] self.t0 = time.time() await self.q.join() self.t1 = time.time() for w in workers: w.cancel() def close(self): self.session.close() def check_result(self): print(self.root_domains) def __call__(self): print("__call__ function") print(self.root_domains)
class KnxMap: """The main scanner instance that takes care of scheduling workers for the targets.""" def __init__(self, targets=None, max_workers=100, loop=None): self.loop = loop or asyncio.get_event_loop() # The number of concurrent workers for discovering KNXnet/IP gateways self.max_workers = max_workers # q contains all KNXnet/IP gateways self.q = Queue(loop=self.loop) # bus_queues is a dict containing a bus queue for each KNXnet/IP gateway self.bus_queues = dict() # bus_protocols is a list of all bus protocol instances for proper connection shutdown self.bus_protocols = list() # knx_gateways is a list of KnxTargetReport objects, one for each found KNXnet/IP gateway self.knx_gateways = list() # bus_devices is a list of KnxBusTargetReport objects, one for each found bus device self.bus_devices = set() self.bus_info = False self.t0 = time.time() self.t1 = None if targets: self.set_targets(targets) else: self.targets = set() def set_targets(self, targets): self.targets = targets for target in self.targets: self.add_target(target) def add_target(self, target): self.q.put_nowait(target) def add_bus_queue(self, gateway, bus_targets): self.bus_queues[gateway] = Queue(loop=self.loop) for target in bus_targets: self.bus_queues[gateway].put_nowait(target) return self.bus_queues[gateway] @asyncio.coroutine def bruteforce_auth_key(self, knx_gateway, target, full_key_space=False): if isinstance(target, set): target = list(target)[0] future = asyncio.Future() transport, protocol = yield from self.loop.create_datagram_endpoint( functools.partial(KnxTunnelConnection, future), remote_addr=(knx_gateway[0], knx_gateway[1])) self.bus_protocols.append(protocol) # Make sure the tunnel has been established connected = yield from future alive = yield from protocol.tpci_connect(target) if full_key_space: key_space = range(0, 0xffffffff) else: key_space = [0x11223344, 0x12345678, 0x00000000, 0x87654321, 0x11111111, 0xffffffff] # Bruteforce the key via A_Authorize_Request messages for key in key_space: access_level = yield from protocol.apci_authenticate(target, key) if access_level == 0: LOGGER.info("GOT THE KEY: {}".format(format(key, '08x'))) break @asyncio.coroutine def knx_bus_worker(self, transport, protocol, queue): """A worker for communicating with devices on the bus.""" try: while True: target = queue.get_nowait() LOGGER.info('BUS: target: {}'.format(target)) if not protocol.tunnel_established: LOGGER.error('KNX tunnel is not open!') return alive = yield from protocol.tpci_connect(target) if alive: properties = collections.OrderedDict() serial = None # DeviceDescriptorRead descriptor = yield from protocol.apci_device_descriptor_read(target) if not descriptor: tunnel_request = protocol.make_tunnel_request(target) tunnel_request.tpci_unnumbered_control_data('DISCONNECT') protocol.send_data(tunnel_request.get_message(), target) queue.task_done() continue if not self.bus_info: t = KnxBusTargetReport(address=target) self.bus_devices.add(t) tunnel_request = protocol.make_tunnel_request(target) tunnel_request.tpci_unnumbered_control_data('DISCONNECT') protocol.send_data(tunnel_request.get_message(), target) queue.task_done() continue dev_desc = struct.unpack('!H', descriptor)[0] desc_medium, desc_type, desc_version = KnxMessage.parse_device_descriptor(dev_desc) if desc_type > 1: # Read System 2 and System 7 manufacturer ID object manufacturer = yield from protocol.apci_property_value_read( target, property_id=DEVICE_OBJECTS.get('PID_MANUFACTURER_ID')) if isinstance(manufacturer, (str, bytes)): manufacturer = 
int.from_bytes(manufacturer, 'big') manufacturer = get_manufacturer_by_id(manufacturer) # Read the device state device_state = yield from protocol.apci_memory_read( target, memory_address=0x0060) if device_state: properties['DEVICE_STATE'] = KnxMessage.unpack_cemi_runstate( int.from_bytes(device_state, 'big')) # Read the serial number object on System 2 and System 7 devices serial = yield from protocol.apci_property_value_read( target, property_id=DEVICE_OBJECTS.get('PID_SERIAL_NUMBER')) if isinstance(serial, (str, bytes)): serial = codecs.encode(serial, 'hex').decode().upper() for object_index, props in OBJECTS.items(): x = collections.OrderedDict() for k, v in props.items(): ret = yield from protocol.apci_property_value_read( target, property_id=v, object_index=object_index) if ret: x[k.replace('PID_', '')] = codecs.encode(ret, 'hex') if x: properties[OBJECT_TYPES.get(object_index)] = x else: # Try to MemoryRead the manufacturer ID on System 1 devices. # Note: System 1 devices do not support access controls, so # an authorization request is not needed. manufacturer = yield from protocol.apci_memory_read( target, memory_address=0x0104, read_count=1) if isinstance(manufacturer, (str, bytes)): manufacturer = int.from_bytes(manufacturer, 'big') manufacturer = get_manufacturer_by_id(manufacturer) device_state = yield from protocol.apci_memory_read( target, memory_address=0x0060) if device_state: properties['DEVICE_STATE'] = codecs.encode(device_state, 'hex') ret = yield from protocol.apci_memory_read( target, memory_address=0x0105, read_count=2) if ret: properties['DevTyp'] = codecs.encode(ret, 'hex') ret = yield from protocol.apci_memory_read( target, memory_address=0x0101, read_count=3) if ret: properties['ManData'] = codecs.encode(ret, 'hex') ret = yield from protocol.apci_memory_read( target, memory_address=0x0108, read_count=1) if ret: properties['CheckLim'] = codecs.encode(ret, 'hex') ret = yield from protocol.apci_memory_read( target, memory_address=0x01FE, read_count=1) if ret: properties['UsrPrg'] = codecs.encode(ret, 'hex') ret = yield from protocol.apci_memory_read( target, memory_address=0x0116, read_count=4) if ret: properties['AdrTab'] = codecs.encode(ret, 'hex') start_addr = 0x0100 properties['EEPROM_DUMP'] = b'' for i in range(51): ret = yield from protocol.apci_memory_read( target, memory_address=start_addr, read_count=5) if ret: properties['EEPROM_DUMP'] += codecs.encode(ret, 'hex') start_addr += 5 if descriptor: t = KnxBusTargetReport( address=target, medium=desc_medium, type=desc_type, version=desc_version, device_serial=serial, manufacturer=manufacturer, properties=properties) self.bus_devices.add(t) # Properly close the TPCI layer yield from protocol.tpci_disconnect(target) queue.task_done() except asyncio.CancelledError: pass except asyncio.QueueEmpty: pass @asyncio.coroutine def bus_scan(self, knx_gateway, bus_targets): queue = self.add_bus_queue(knx_gateway.host, bus_targets) LOGGER.info('Scanning {} bus device(s) on {}'.format(queue.qsize(), knx_gateway.host)) future = asyncio.Future() transport, bus_protocol = yield from self.loop.create_datagram_endpoint( functools.partial( KnxTunnelConnection, future, ndp_defer_time=self.bus_timeout, knx_source=self.knx_source), remote_addr=(knx_gateway.host, knx_gateway.port)) self.bus_protocols.append(bus_protocol) # Make sure the tunnel has been established connected = yield from future if connected: workers = [asyncio.Task(self.knx_bus_worker(transport, bus_protocol, queue), loop=self.loop)] self.t0 = time.time() yield from 
queue.join() self.t1 = time.time() for w in workers: w.cancel() bus_protocol.knx_tunnel_disconnect() for i in self.bus_devices: knx_gateway.bus_devices.append(i) LOGGER.info('Bus scan took {} seconds'.format(self.t1 - self.t0)) @asyncio.coroutine def knx_search_worker(self): """Send a KnxSearch request to see if target is a KNX device.""" try: sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.setblocking(0) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, struct.pack('256s', str.encode(self.iface))) protocol = KnxGatewaySearch() waiter = asyncio.Future(loop=self.loop) transport = self.loop._make_datagram_transport( sock, protocol, ('224.0.23.12', 3671), waiter) try: # Wait until connection_made() has been called on the transport yield from waiter except: LOGGER.error('Creating multicast transport failed!') transport.close() return # Wait SEARCH_TIMEOUT seconds for responses to our multicast packets yield from asyncio.sleep(self.search_timeout) if protocol.responses: # If protocol received SEARCH_RESPONSE packets, print them for response in protocol.responses: peer = response[0] response = response[1] t = KnxTargetReport( host=peer[0], port=peer[1], mac_address=response.body.get('dib_dev_info').get('knx_mac_address'), knx_address=response.body.get('dib_dev_info').get('knx_address'), device_serial=response.body.get('dib_dev_info').get('knx_device_serial'), friendly_name=response.body.get('dib_dev_info').get('device_friendly_name'), device_status=response.body.get('dib_dev_info').get('device_status'), knx_medium=response.body.get('dib_dev_info').get('knx_medium'), project_install_identifier=response.body.get('dib_dev_info').get('project_install_identifier'), supported_services=[ KNX_SERVICES[k] for k, v in response.body.get('dib_supp_sv_families').get('families').items()], bus_devices=[]) self.knx_gateways.append(t) except asyncio.CancelledError: pass @asyncio.coroutine def search_gateways(self): self.t0 = time.time() yield from asyncio.ensure_future(asyncio.Task(self.knx_search_worker(), loop=self.loop)) self.t1 = time.time() LOGGER.info('Scan took {} seconds'.format(self.t1 - self.t0)) @asyncio.coroutine def knx_description_worker(self): """Send a KnxDescription request to see if target is a KNX device.""" try: while True: target = self.q.get_nowait() LOGGER.debug('Scanning {}'.format(target)) for _try in range(self.desc_retries): LOGGER.debug('Sending {}. 
KnxDescriptionRequest to {}'.format(_try, target)) future = asyncio.Future() yield from self.loop.create_datagram_endpoint( functools.partial(KnxGatewayDescription, future, timeout=self.desc_timeout), remote_addr=target) response = yield from future if response: break if response and isinstance(response, KnxDescriptionResponse): t = KnxTargetReport( host=target[0], port=target[1], mac_address=response.body.get('dib_dev_info').get('knx_mac_address'), knx_address=response.body.get('dib_dev_info').get('knx_address'), device_serial=response.body.get('dib_dev_info').get('knx_device_serial'), friendly_name=response.body.get('dib_dev_info').get('device_friendly_name'), device_status=response.body.get('dib_dev_info').get('device_status'), knx_medium=response.body.get('dib_dev_info').get('knx_medium'), project_install_identifier=response.body.get('dib_dev_info').get('project_install_identifier'), supported_services=[ KNX_SERVICES[k] for k, v in response.body.get('dib_supp_sv_families').get('families').items()], bus_devices=[]) self.knx_gateways.append(t) self.q.task_done() except (asyncio.CancelledError, asyncio.QueueEmpty): pass @asyncio.coroutine def monitor(self, targets=None, group_monitor_mode=False): if targets: self.set_targets(targets) if group_monitor_mode: LOGGER.debug('Starting group monitor') else: LOGGER.debug('Starting bus monitor') future = asyncio.Future() transport, protocol = yield from self.loop.create_datagram_endpoint( functools.partial(KnxBusMonitor, future, group_monitor=group_monitor_mode), remote_addr=list(self.targets)[0]) self.bus_protocols.append(protocol) yield from future if group_monitor_mode: LOGGER.debug('Starting group monitor') else: LOGGER.debug('Starting bus monitor') @asyncio.coroutine def search(self, search_timeout=5, iface=None): self.iface = iface self.search_timeout = search_timeout LOGGER.info('Make sure there are no filtering rules that drop UDP multicast packets!') yield from self.search_gateways() for t in self.knx_gateways: print_knx_target(t) LOGGER.info('Searching done') @asyncio.coroutine def brute(self, targets=None, bus_target=None, full_key_space=False): if targets: self.set_targets(targets) tasks = [asyncio.Task(self.bruteforce_auth_key(t, bus_target, full_key_space), loop=self.loop) for t in self.targets] yield from asyncio.wait(tasks) @asyncio.coroutine def scan(self, targets=None, desc_timeout=2, desc_retries=2, bus_timeout=2, bus_targets=None, bus_info=False, knx_source=None, auth_key=0xffffffff): """The function that will be called by run_until_complete(). 
This is the main coroutine.""" self.auth_key = auth_key if targets: self.set_targets(targets) self.desc_timeout = desc_timeout self.desc_retries = desc_retries self.bus_timeout = bus_timeout self.knx_source = knx_source workers = [asyncio.Task(self.knx_description_worker(), loop=self.loop) for _ in range(self.max_workers if len(self.targets) > self.max_workers else len(self.targets))] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel() if bus_targets and self.knx_gateways: self.bus_info = bus_info bus_scanners = [asyncio.Task(self.bus_scan(g, bus_targets), loop=self.loop) for g in self.knx_gateways] yield from asyncio.wait(bus_scanners) else: LOGGER.info('Scan took {} seconds'.format(self.t1 - self.t0)) for t in self.knx_gateways: print_knx_target(t) @asyncio.coroutine def group_writer(self, target, value=0, routing=False, desc_timeout=2, desc_retries=2, iface=False): self.desc_timeout = desc_timeout self.desc_retries = desc_retries self.iface = iface workers = [asyncio.Task(self.knx_description_worker(), loop=self.loop) for _ in range(self.max_workers if len(self.targets) > self.max_workers else len(self.targets))] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel() if self.knx_gateways: # TODO: make sure only a single gateway is supplied knx_gateway = self.knx_gateways[0] else: LOGGER.error('No valid KNX gateway found') return if routing: # Use KNX Routing to write group values if 'KNXnet/IP Routing' not in knx_gateway.supported_services: LOGGER.error('KNX gateway {gateway} does not support Routing'.format( gateway=knx_gateway.host)) sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.setblocking(0) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, struct.pack('256s', str.encode(self.iface))) # TODO: what if we have devices that access more advanced payloads? if isinstance(value, str): value = int(value) protocol = KnxRoutingConnection(target=target, value=value) waiter = asyncio.Future(loop=self.loop) transport = self.loop._make_datagram_transport( sock, protocol, ('224.0.23.12', 3671), waiter) try: # Wait until connection_made() has been called on the transport yield from waiter except: LOGGER.error('Creating multicast transport failed!') transport.close() return else: # Use KNX Tunnelling to write group values if 'KNXnet/IP Tunnelling' not in knx_gateway.supported_services: LOGGER.error('KNX gateway {gateway} does not support Routing'.format( gateway=knx_gateway.host)) future = asyncio.Future() transport, protocol = yield from self.loop.create_datagram_endpoint( functools.partial(KnxTunnelConnection, future), remote_addr=(knx_gateway.host, knx_gateway.port)) self.bus_protocols.append(protocol) # Make sure the tunnel has been established connected = yield from future if connected: # TODO: what if we have devices that access more advanced payloads? 
if isinstance(value, str): value = int(value) yield from protocol.apci_group_value_write(target, value=value) protocol.knx_tunnel_disconnect() @asyncio.coroutine def apci(self, target, desc_timeout=2, desc_retries=2, iface=False, args=None): self.desc_timeout = desc_timeout self.desc_retries = desc_retries self.iface = iface self.knx_source = args.knx_source workers = [asyncio.Task(self.knx_description_worker(), loop=self.loop) for _ in range(self.max_workers if len(self.targets) > self.max_workers else len(self.targets))] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel() if self.knx_gateways: # TODO: make sure only a single gateway is supplied knx_gateway = self.knx_gateways[0] else: LOGGER.error('No valid KNX gateway found') return # Use KNX Tunnelling to write group values if 'KNXnet/IP Tunnelling' not in knx_gateway.supported_services: LOGGER.error('KNX gateway {gateway} does not support Routing'.format( gateway=knx_gateway.host)) future = asyncio.Future() transport, protocol = yield from self.loop.create_datagram_endpoint( functools.partial(KnxTunnelConnection, future, knx_source=self.knx_source), remote_addr=(knx_gateway.host, knx_gateway.port)) self.bus_protocols.append(protocol) # Make sure the tunnel has been established connected = yield from future if connected: if args.apci_type == 'Memory_Read': alive = yield from protocol.tpci_connect(target) if alive: dev_type = yield from protocol.get_device_type(target) if not dev_type: protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return if dev_type > 1 and not args.ignore_auth: auth_key = args.auth_key if not isinstance(auth_key, int): try: auth_key = int(auth_key, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return auth_level = yield from protocol.apci_authenticate( target, key=auth_key) if auth_level > 0: LOGGER.error('Invalid authentication key') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return memory_address = args.memory_address if not isinstance(memory_address, int): try: memory_address = int(memory_address, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return data = yield from protocol.apci_memory_read( target, memory_address=memory_address, read_count=args.read_count) protocol.tpci_disconnect(target) if not data: LOGGER.debug('No data received') else: LOGGER.info(codecs.encode(data, 'hex')) elif args.apci_type == 'Memory_Write': alive = yield from protocol.tpci_connect(target) if alive: dev_type = yield from protocol.get_device_type(target) if not dev_type: protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return if dev_type > 1: auth_key = args.auth_key if not isinstance(auth_key, int): try: auth_key = int(auth_key, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return auth_level = yield from protocol.apci_authenticate( target, key=auth_key) if auth_level > 0: LOGGER.error('Invalid authentication key') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return memory_address = args.memory_address memory_data = args.memory_data if not isinstance(memory_address, int) or \ not isinstance(memory_data, bytes): try: memory_address = int(memory_address, 16) memory_data = codecs.decode(memory_data, 'hex') except ValueError: LOGGER.error('Invalid property ID or write data') 
protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return data = yield from protocol.apci_memory_write( target, memory_address=memory_address, write_count=args.read_count, data=memory_data) protocol.tpci_disconnect(target) if not data: LOGGER.debug('No data received') else: LOGGER.info(codecs.encode(data, 'hex')) elif args.apci_type == 'Key_Write': alive = yield from protocol.tpci_connect(target) if alive: dev_type = yield from protocol.get_device_type(target) if not dev_type: protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return if dev_type > 1: auth_key = args.auth_key if not isinstance(auth_key, int): try: auth_key = int(auth_key, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return auth_level = yield from protocol.apci_authenticate( target, key=auth_key) if auth_level > 0: LOGGER.error('Invalid authentication key') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return new_auth_key = args.new_auth_key if not isinstance(new_auth_key, int): try: new_auth_key = int(new_auth_key, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return data = yield from protocol.apci_key_write( target, level=args.auth_level, key=new_auth_key) protocol.tpci_disconnect(target) if not data: LOGGER.debug('No data received') else: LOGGER.info('Authorization level: {}'.format(data)) elif args.apci_type == 'PropertyValue_Read': property_id = args.property_id if not isinstance(property_id, int): try: property_id = int(property_id, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return alive = yield from protocol.tpci_connect(target) if alive: data = yield from protocol.apci_property_value_read( target, object_index=args.object_index, property_id=property_id, num_elements=args.num_elements, start_index=args.start_index) protocol.tpci_disconnect(target) if not data: LOGGER.debug('No data received') else: LOGGER.info(codecs.encode(data, 'hex')) elif args.apci_type == 'DeviceDescriptor_Read': alive = yield from protocol.tpci_connect(target) if alive: data = yield from protocol.apci_device_descriptor_read(target) protocol.tpci_disconnect(target) if not data: LOGGER.debug('No data received') else: LOGGER.info(codecs.encode(data, 'hex')) elif args.apci_type == 'Authorize': auth_key = args.auth_key if not isinstance(auth_key, int): try: auth_key = int(auth_key, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return alive = yield from protocol.tpci_connect(target) if alive: data = yield from protocol.apci_authenticate( target, key=auth_key) protocol.tpci_disconnect(target) if isinstance(data, (type(None), type(False))): LOGGER.debug('No data received') else: LOGGER.info('Authorization level: {}'.format(data)) elif args.apci_type == 'IndividualAddress_Read': alive = yield from protocol.tpci_connect(target) if alive: data = yield from protocol.apci_individual_address_read(target) protocol.tpci_disconnect(target) if isinstance(data, (type(None), type(False))): LOGGER.debug('No data received') else: LOGGER.info('Individual address: {}'.format(data)) elif args.apci_type == 'UserManufacturerInfo_Read': alive = yield from protocol.tpci_connect(target) if alive: data = yield from protocol.apci_user_manufacturer_info_read(target) protocol.tpci_disconnect(target) if 
isinstance(data, (type(None), type(False))): LOGGER.debug('No data received') else: LOGGER.info(codecs.encode(data, 'hex')) elif args.apci_type == 'Restart': alive = yield from protocol.tpci_connect(target) if alive: yield from protocol.apci_restart(target) protocol.tpci_disconnect(target) elif args.apci_type == 'Progmode': alive = yield from protocol.tpci_connect(target) if alive: dev_type = yield from protocol.get_device_type(target) if not dev_type: protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return if dev_type > 1: auth_key = args.auth_key if not isinstance(auth_key, int): try: auth_key = int(auth_key, 16) except ValueError: LOGGER.error('Invalid property ID') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return auth_level = yield from protocol.apci_authenticate( target, key=auth_key) if auth_level > 0: LOGGER.error('Invalid authentication key') protocol.knx_tunnel_disconnect() protocol.tpci_disconnect(target) return data = yield from protocol.apci_memory_read( target, memory_address=0x0060, read_count=args.read_count) if not data: LOGGER.debug('No data received') else: data = int.from_bytes(data, 'big') run_state = KnxMessage.unpack_cemi_runstate(data) if args.toggle: if run_state.get('PROG_MODE'): run_state = KnxMessage.pack_cemi_runstate( prog_mode=False, link_layer_active=run_state.get('LINK_LAYER'), transport_layer_active=run_state.get('TRANSPORT_LAYER'), app_layer_active=run_state.get('APP_LAYER'), serial_interface_active=run_state.get('SERIAL_INTERFACE'), user_app_run=run_state.get('USER_APP'), bcu_download_mode=run_state.get('BC_DM')) else: run_state = KnxMessage.pack_cemi_runstate( prog_mode=True, link_layer_active=run_state.get('LINK_LAYER'), transport_layer_active=run_state.get('TRANSPORT_LAYER'), app_layer_active=run_state.get('APP_LAYER'), serial_interface_active=run_state.get('SERIAL_INTERFACE'), user_app_run=run_state.get('USER_APP'), bcu_download_mode=run_state.get('BC_DM')) data = yield from protocol.apci_memory_write( target, memory_address=0x0060, data=struct.pack('!B', run_state)) if not data: LOGGER.debug('No data received') else: LOGGER.info(codecs.encode(data, 'hex')) else: if run_state.get('PROG_MODE'): LOGGER.info('Programming mode ENABLED') else: LOGGER.info('Programming mode disabled') protocol.tpci_disconnect(target) elif args.apci_type == 'GroupValue_Write': if not hasattr(args, 'value') or args.value is None: LOGGER.error('Invalid parameters') protocol.knx_tunnel_disconnect() return if isinstance(args.value, str): value = int(args.value) yield from protocol.apci_group_value_write(target, value=value) protocol.knx_tunnel_disconnect()
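For orientation, the worker coroutines above pull (host, port) tuples from self.q, and scan() is the coroutine meant to be handed to run_until_complete(). A minimal driver sketch with a made-up target address (3671 being the KNXnet/IP port this class also uses for multicast):

import asyncio

loop = asyncio.get_event_loop()
scanner = KnxMap(targets={('192.168.1.10', 3671)}, max_workers=10, loop=loop)
try:
    loop.run_until_complete(scanner.scan(desc_timeout=2, desc_retries=2))
finally:
    loop.close()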
class Crawler: def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): self.session.close() def host_okay(self, host): host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. 
LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): # Process queue items forever. try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): # Add a URL to the queue if not seen before. if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): # Run the crawler until all finished. workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
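Stripped of the HTTP details, crawl() and work() are the standard asyncio producer/consumer pattern: a fixed pool of worker tasks drains a Queue, q.join() returns once task_done() has been called for every queued item, and the workers, which would otherwise loop forever, are then cancelled. A self-contained sketch of just that skeleton:

import asyncio

async def work(q):
    try:
        while True:
            item = await q.get()
            await asyncio.sleep(0.01)   # stand-in for fetch()
            q.task_done()               # lets q.join() complete once every item is done
    except asyncio.CancelledError:
        pass                            # raised when the driver cancels the worker

async def crawl(num_workers=3):
    q = asyncio.Queue()
    for item in range(20):
        q.put_nowait(item)
    workers = [asyncio.ensure_future(work(q)) for _ in range(num_workers)]
    await q.join()                      # blocks until the queue is fully processed
    for w in workers:
        w.cancel()

asyncio.get_event_loop().run_until_complete(crawl())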
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. 这里面有两个队列.seen_urls 和 done """ # TODO xpath support # TODO uvloop support def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
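Both Crawler listings rely on a few module-level helpers that are not included in the excerpt. Roughly, and only as a reconstruction from the way they are called rather than the verbatim originals: FetchStatistic is a namedtuple of the fields passed above, is_redirect() recognizes the 3xx statuses that are handled by hand because requests use allow_redirects=False, and lenient_host() keeps just the last two host components:

import collections

FetchStatistic = collections.namedtuple(
    'FetchStatistic',
    ['url', 'next_url', 'status', 'exception', 'size',
     'content_type', 'encoding', 'num_urls', 'num_new_urls'])

def is_redirect(response):
    # Redirects are followed manually, so only detect them here.
    return response.status in (300, 301, 302, 303, 307)

def lenient_host(host):
    # 'blog.example.com' -> 'examplecom'; compared against root_domains by _host_okay_lenient()
    parts = host.split('.')[-2:]
    return ''.join(parts)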
def __init__(self, roots, exclude=None, strict=True, max_redirect=10, proxy=None, max_tries=4, user_agents=None, max_tasks=10, time_out=15, allowed_paths=None, item_paths=None, *, loop=None): if not loop: asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) self.loop = asyncio.get_event_loop() else: self.loop = loop self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect # self.proxy = proxy self.max_tries = max_tries self.max_tasks = max_tasks self.time_out = time_out self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self._session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self._allowed_paths = None if allowed_paths: self._allowed_paths = allowed_paths self._item_paths = None if item_paths: self._item_paths = item_paths for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.user_agents = self.USER_AGENTS if user_agents: self._user_agents = user_agents self.t0 = time.time() self.t1 = None
class BaseCrawler(object): USER_AGENTS = [ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36' ] ALLOW_CONTENT_TYPE = ('text/html', 'application/xml') ALLOWED_PATHS = None ITEM_PATHS = None def __init__(self, roots, exclude=None, strict=True, max_redirect=10, proxy=None, max_tries=4, user_agents=None, max_tasks=10, time_out=15, allowed_paths=None, item_paths=None, *, loop=None): if not loop: asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) self.loop = asyncio.get_event_loop() else: self.loop = loop self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect # self.proxy = proxy self.max_tries = max_tries self.max_tasks = max_tasks self.time_out = time_out self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self._session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self._allowed_paths = None if allowed_paths: self._allowed_paths = allowed_paths self._item_paths = None if item_paths: self._item_paths = item_paths for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.user_agents = self.USER_AGENTS if user_agents: self._user_agents = user_agents self.t0 = time.time() self.t1 = None @property def session(self): if not self._session: self._session = aiohttp.ClientSession(loop=self.loop) return self._session @property def proxy(self): proxy = 'http://{}'.format( requests.get("http://127.0.0.1:5010/get/").text) logger.info(proxy) return proxy @property def allowed_paths(self): if self._allowed_paths is None: self._allowed_paths = self.ALLOWED_PATHS return self._allowed_paths @property def item_paths(self): if self._item_paths is None: self._item_paths = self.ITEM_PATHS return self._item_paths def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. 
""" return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) def get_random_user_agent(self): if len(self._user_agents) == 1: return self._user_agents return random.choice(self._user_agents) def close(self): self.session.close() def add_url(self, url, max_redirect=None, meta=None): if meta is None: meta = {} if max_redirect is None: max_redirect = self.max_redirect logger.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect, meta)) async def parse_item(self, url, data, *args, **kwargs): allowed, parse_function = self.parse_item_allowed(url) if allowed: await parse_function(url, data, *args, **kwargs) def parse_item_allowed(self, url): if self.item_paths: for key, rule in self.item_paths.items(): if not re.search(rule, url): continue return True, self.get_parse_function(key) return False, None def get_parse_function(self, name): parse_function_name = 'parse_{}'.format(name) if hasattr(self, parse_function_name): return getattr(self, parse_function_name) logger.error('Not Implemented method: %r', parse_function_name) raise NotImplementedError def path_allowed(self, url): if self.allowed_paths: logger.debug(self.allowed_paths) for rule in self.allowed_paths: if not re.search(rule, url): continue return True return False async def parse(self, url, response, **kwargs): links = set() content_type = None encoding = None body = await response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in self.ALLOW_CONTENT_TYPE: data = await response.text() links = await self._parse_links(response.url, data) await self.parse_item(url, data, **kwargs) stat = FetchStatistic(url=response.url.human_repr(), next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links async def _parse_links(self, base_url, text): links = set() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: logger.info('got %r distinct urls from %r', len(urls), base_url) for url in urls: try: normalized = urllib.parse.urljoin(base_url.human_repr(), url) # normalized = base_url.join(url) defragmented, frag = urllib.parse.urldefrag(normalized) except TypeError as type_error: logger.error('join error happen on base_url: %r, url: %r', base_url, url) continue if self.url_allowed(defragmented): links.add(defragmented) return links def headers(self, **kwargs): headers = {'User-Agent': self.get_random_user_agent()} headers.update(**kwargs) return headers async def fetch(self, url, max_redirect, meta=None): tries = 0 exception = None while tries < self.max_tries: try: with async_timeout.timeout(self.time_out): headers = self.headers() response = await self.session.get(url, headers=headers, proxy=self.proxy, allow_redirects=False) if tries > 1: logger.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: logger.info('try %r for %r raised %r', tries, url, client_error) exception = client_error except asyncio.TimeoutError as timeout_error: logger.info('try %r for %r raised %r', tries, url, timeout_error) exception = timeout_error except Exception as e: logger.info('try %r for %r raised %r', tries, url, e) exception = e tries += 1 else: # We never broke out of the loop: all tries failed. logger.error('%r failed after %r tries', url, self.max_tries) self.record_statistic( FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic( FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: if self.url_allowed(next_url): logger.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: logger.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = await self.parse(url, response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.add_url(link, meta=meta) self.seen_urls.update(links) finally: await response.release() async def work(self): try: while True: url, max_redirect, meta = await self.q.get() assert url in self.seen_urls await self.fetch(url, max_redirect, meta) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): logger.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): logger.debug('skipping non-root host in %r', url) return False return self.path_allowed(url) async def crawl(self): workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] self.t0 = time.time() await self.q.join() self.t1 = time.time() for w in workers: w.cancel()
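BaseCrawler is clearly meant to be subclassed: ALLOWED_PATHS gates which links path_allowed() will follow, and each key in ITEM_PATHS is resolved by get_parse_function() to a parse_<key>() coroutine that receives the page text. A hypothetical subclass, with the class name, paths and regexes invented purely for illustration (note the proxy property also assumes a local proxy-pool service on 127.0.0.1:5010):

class PostCrawler(BaseCrawler):
    ALLOWED_PATHS = [r'/posts/']             # only URLs matching these rules are followed
    ITEM_PATHS = {'post': r'/posts/\d+$'}    # matching URLs are handed to parse_post()

    async def parse_post(self, url, data, *args, **kwargs):
        # `data` is the decoded HTML passed in by parse() via parse_item()
        logger.info('item page %r: %d characters', url, len(data))

crawler = PostCrawler(['https://example.com/posts/'], max_tasks=5)
try:
    crawler.loop.run_until_complete(crawler.crawl())
finally:
    crawler.close()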
class Spider: def __init__(self, max_tries=30, max_tasks=10, timeout=5, rootDir=os.getcwd()): self.max_tries = max_tries self.max_tasks = max_tasks self.loop = asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.session = aiohttp.ClientSession(loop=self.loop) self.timeout = timeout self.rootDir = rootDir def close(self): self.session.close() def append_request(self, request): self.q.put_nowait(request) @asyncio.coroutine def _get_request(self): r = yield from self.q.get() return r @asyncio.coroutine def fetch(self, request_type, url, params, data): """Fetch one URL.""" tries = 0 exception = None content = None while tries < self.max_tries: try: print("try %s---->%d times" % (url, tries)) with aiohttp.Timeout(self.timeout): response = yield from self.session.get(url, params=params) if response.status == 200: content_type = response.headers.get('content-type') if content_type in CONTENT_TYPE_TEXT: with aiohttp.Timeout(self.timeout): content = yield from response.text(encoding='GBK') else: with aiohttp.Timeout(self.timeout): content = yield from response.read() break except asyncio.TimeoutError: print("timeout") except aiohttp.ClientError as client_error: print("client error") except Exception: print("unknown error") tries += 1 else: print("try %s---->more than %d times, quit" % (url, tries)) return None response.release() return content @asyncio.coroutine def _work(self): """Process queue items forever.""" try: while True: r = yield from self._get_request() content = yield from self.fetch(r.request_type, r.url, r.params, r.data) if content: r.handle_func(content) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def work(self): yield from self._work() @asyncio.coroutine def spider(self): """Run the spider until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] yield from self.q.join() for w in workers: w.cancel()
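_work() expects every queued object to expose request_type, url, params, data and a handle_func callback, but the Spider itself defines no such type, so the caller has to supply one. A minimal hypothetical driver (the Request namedtuple and URL are illustrative only, and CONTENT_TYPE_TEXT is assumed to be defined in the same module):

import collections

Request = collections.namedtuple('Request', 'request_type url params data handle_func')

spider = Spider(max_tasks=5)
spider.append_request(Request('GET', 'http://example.com/', None, None, print))
try:
    spider.loop.run_until_complete(spider.spider())
finally:
    spider.close()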
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None): get_domain(roots) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) with open(path, 'w') as temp_file: print('writing') temp_file.write('Domain name:') temp_file.write(roots) temp_file.write('\n \n') temp_file.close() self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.connector = aiohttp.TCPConnector(loop=self.loop) self.root_domains = set() # for root in roots: # parts = urllib.parse.urlparse(root) # host, port = urllib.parse.splitport(parts.netloc) # if not host: # continue # if re.match(r'\A[\d\.]*\Z', host): # self.root_domains.add(host) # else: ## host = host.lower() # if self.strict: # self.root_domains.add(host) # else: # self.root_domains.add(lenient_host(host)) # for root in roots: # print("true root") # print(root) # self.add_url(root) self.add_url(roots) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.connector.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() #Mick - raw HTML page #print(text) # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: #if(url.find("/ibm/console/logon.jsp?action=OK"): # print("There is a login page") normalized = urllib.parse.urljoin(response.url, url) # path = get_domain(str(normalized)) with open(path, 'a') as temp_file: temp_file.write(str(normalized) + ',\n') temp_file.close() defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from aiohttp.request( 'get', url, connector=self.connector, allow_redirects=False, loop=self.loop) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) @asyncio.coroutine def work(self): """Process queue items forever.""" while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) #TODO Mick - getting a new URL #print("new url: ") #print(url) # path = get_domain(url) # with open(path, 'w') as temp_file: # print('writing') # temp_file.write('Domain name:') # temp_file.write(url) # temp_file.write('\n \n') # temp_file.close() self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): print("crawling...") """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in 
range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() assert self.seen_urls == set(stat.url for stat in self.done) self.t1 = time.time() for w in workers: w.cancel()
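This last Crawler variant also depends on a get_domain() helper and a module-level path that the excerpt does not show; __init__ and parse_links() both write discovered URLs to that file. Purely as a hypothetical sketch of what such a helper could look like, so the listing reads end to end (the real implementation may differ):

import os
import urllib.parse

path = None   # module-level output file used by Crawler.__init__ and parse_links()

def get_domain(url):
    global path
    host = urllib.parse.urlparse(url).netloc or url
    path = os.path.join(os.getcwd(), 'crawl_output', host + '.txt')
    return host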
class BaseAsyncCrawler(metaclass=CrawlerMetaClass): def __init__(self, task): self.seen_url = set() self.max_tasks = 50 self.max_retry = 10 self.loop = asyncio.get_event_loop() self.session = aiohttp.ClientSession(loop=self.loop) self.q = Queue(loop = self.loop) self.manager = Manager() self.q.put_nowait(task) """ for debug """ self.item_cnt = 0 self.page_cnt = 0 self.f = open("D:\\Acer", 'w') print('initialization finished.') def close(self): self.session.close() self.f.close() async def crawl(self): workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] try: await self.q.join() for worker in workers: worker.cancel() except Exception as e: LOGGER.error('[in crawl] unexpected error with message: ', e) raise e async def fetch(self, queuingTask): #print('fetching') """ unpack task tuple """ url, page_type, info_item = queuingTask #print(url) """ try to establish connection """ tries = 0 while tries < self.max_retry: try: response = await self.session.get(url, headers = self.REQUEST_HEADERS) break except aiohttp.ClientError: pass except Exception as e: print(e) tries += 1 else: LOGGER.warning("fail to connect to "+url) return #print('connection established.') try: text = await response.text() except Exception as e: print('when get page content: ', e) print("fail to get page content from " + url) else: """ parse response """ try: todo = self.manager.list() done = self.manager.list() #print(self.__dict__) #print(hasattr(self, '__mapping__')) if hasattr(self, '__mapping__'): parser = self.__mapping__[page_type] #print('parsing '+page_type) if parser.parser_type == ParserType.GENERATOR: p = Process(target=parser, args=(todo, done, self, text,)) elif parser.parser_type == ParserType.APPENDER: p = Process(target=parser, args=(todo, done, self, text, info_item,)) else: raise Exception('fatal: unrecognized parser type') p.start() p.join() for task in todo: self.q.put_nowait(task) for data_item in done: self.item_cnt += 1 self.f.write(str(self.item_cnt) + "#" + str(data_item) + "\n") self.f.flush() print("NO." + str(self.item_cnt) + " item") else: raise Exception('fatal: uninitialized crawler.') except Exception as e: print(e) finally: await response.release() async def work(self): try: while True: queuingTask = await self.q.get() await self.fetch(queuingTask) self.q.task_done() except asyncio.CancelledError: pass
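A final note on the multiprocessing detail in BaseAsyncCrawler.fetch(): Process(...).start() immediately followed by p.join() blocks the event loop for the whole parse, so the max_tasks worker coroutines cannot actually overlap during parsing. A hedged sketch of the more usual approach, handing a CPU-bound parser to a process pool through run_in_executor (this assumes the parser can be expressed as a plain picklable function of the page text):

import asyncio
from concurrent.futures import ProcessPoolExecutor

_pool = ProcessPoolExecutor()

async def parse_in_executor(parser, text):
    # Run parser(text) in a worker process; the event loop keeps serving other tasks.
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(_pool, parser, text)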