def test_task_queue():
    """Smoke-test the TaskQueue/TaskExecutor wiring with a randomized workload."""
    queue = TaskQueue(config={'tasks': task_scripts})

    # Build a random sequence of 30 task names drawn from the known scripts.
    workload = generate_random_queue(task_scripts.keys(), 30)
    print("QUEUE:", workload)

    for item in workload:
        queue.add_task(item, None)

    # Executor is constructed against the populated queue; guards and the
    # thread cap come straight from the module-level test configuration.
    executor_config = {'task_guards': task_guards, 'max_threads': 40}
    executor = TaskExecutor(queue, config=executor_config)
async def task_consumer(consumer_index: int, task_queue: TaskQueue):
    """Crawl worker: pull tasks, fetch pages, and enqueue discovered links.

    Runs forever. Each iteration takes one task from *task_queue*, fetches
    its URL, extracts ``<a href>`` links, normalizes them (query string and
    fragment stripped), enqueues unseen URLs back onto the queue, and
    records the discovered (from, to) edges in the ``links`` table.

    Args:
        consumer_index: Identifier used only for log messages.
        task_queue: Shared queue the worker both consumes from and feeds.
    """
    # Stdlib RFC 3986 resolution for relative hrefs (see bugfix note below).
    from urllib.parse import urljoin

    logger.info("Consumer %s is in play!", consumer_index)
    while True:
        async with get_task(task_queue) as task:
            logger.info("%s got %s", consumer_index, task.url)

            # Skip URLs this crawl has already seen; mark before fetching so
            # concurrent consumers don't duplicate the work.
            if task.url in task.visited:
                continue
            task.visited.add(task.url)

            async with task.http_client.get(task.url) as resp:
                if resp.status != http.HTTPStatus.OK:
                    continue
                try:
                    html = await resp.text()
                except Exception:
                    # Best-effort: undecodable or truncated bodies are skipped.
                    continue

            # Parse only <a> tags; SoupStrainer keeps the tree small.
            soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer('a'))
            urls = set(a['href'] for a in soup.find_all('a', href=True))
            new_urls = []

            for url in urls:
                # Drop query string and fragment so equivalent pages dedupe.
                parsed = furl.furl(url).remove(
                    query_params=True, fragment_args=True, fragment_path=True)
                cleaned_url = str(parsed)
                is_absolute = bool(parsed.host)
                new_url_to_crawl = None
                if is_absolute and mask_match(cleaned_url, task.mask):
                    new_url_to_crawl = cleaned_url
                elif not is_absolute:
                    # BUGFIX: resolve relative links against the page URL per
                    # RFC 3986. The previous code stripped the base's path and
                    # re-joined, which mis-resolved directory-relative hrefs
                    # such as "page.html" (the current directory was lost).
                    new_url_to_crawl = urljoin(task.url, cleaned_url)
                # NOTE(review): relative URLs are enqueued without a
                # mask_match check, mirroring the original behavior —
                # presumably same-host links are always in scope; confirm.
                if (new_url_to_crawl
                        and new_url_to_crawl not in task.visited
                        and len(new_url_to_crawl) < URL_MAX_LENGTH):
                    new_urls.append(new_url_to_crawl)

            # Fan out the newly discovered URLs as fresh tasks.
            await asyncio.gather(*[
                task_queue.add_task(task.clone_with_url(url))
                for url in new_urls
            ])

            # Persist the link graph; duplicates are ignored by the
            # ON CONFLICT clause.
            async with task.connection_pool.acquire() as conn:
                await conn.executemany(
                    '''
                    INSERT INTO links(url_from, url_to, "count")
                    VALUES($1, $2, 1)
                    ON CONFLICT DO NOTHING
                    ''',
                    [(task.url, url) for url in new_urls])