Example #1
def test_task_queue():
    # Build a task queue from the project's task_scripts mapping.
    tq_config = {'tasks': task_scripts}
    tq = TaskQueue(config=tq_config)

    # Enqueue 30 randomly chosen task names from the available scripts.
    random_queue = generate_random_queue(task_scripts.keys(), 30)
    print("QUEUE:", random_queue)
    for elem in random_queue:
        tq.add_task(elem, None)

    # Run the queued tasks with a limit of 40 threads, subject to task_guards.
    te_config = {'task_guards': task_guards, 'max_threads': 40}
    te = TaskExecutor(tq, config=te_config)
Example #2
import asyncio
import http
import logging

import furl
from bs4 import BeautifulSoup, SoupStrainer

# TaskQueue, get_task, mask_match and URL_MAX_LENGTH come from the surrounding
# project; the module-level logger is assumed to be a standard logging logger.
logger = logging.getLogger(__name__)


async def task_consumer(consumer_index: int, task_queue: TaskQueue):
    logger.info("Consumer %s is in play!", consumer_index)
    while True:
        async with get_task(task_queue) as task:
            logger.info("%s got %s", consumer_index, task.url)
            parsed_url = furl.furl(task.url)
            # Skip URLs that have already been visited; otherwise mark this one as seen.
            if task.url in task.visited:
                continue
            task.visited.add(task.url)
            # logger.info("%s got task with %s", consumer_index, task.url)
            async with task.http_client.get(task.url) as resp:
                if resp.status != http.HTTPStatus.OK:
                    continue
                try:
                    html = await resp.text()
                except Exception:
                    continue
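                # Parse only <a> tags (via SoupStrainer) and collect their href values.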
                soup = BeautifulSoup(html,
                                     "lxml",
                                     parse_only=SoupStrainer('a'))
                urls = set(a['href'] for a in soup.find_all('a', href=True))
                new_urls = []
                # logger.info("Got %s urls", len(urls))
                for url in urls:
                    parsed = furl.furl(url).remove(query_params=True,
                                                   fragment_args=True,
                                                   fragment_path=True)
                    cleaned_url = str(parsed)
                    is_absolute = bool(parsed.host)
                    new_url_to_crawl = None
                    # logger.info("%s: Url: %s", consumer_index, url)
                    if is_absolute and mask_match(cleaned_url, task.mask):
                        new_url_to_crawl = cleaned_url
                        # logger.info("%s: Absolute: %s", consumer_index, new_url_to_crawl)
                    elif not is_absolute:
                        new_url_to_crawl = str(
                            parsed_url.copy().remove(path=True) / cleaned_url)
                        # logger.info("%s: Constructed new %s ...", consumer_index, new_url_to_crawl)
                    if (new_url_to_crawl
                            and new_url_to_crawl not in task.visited
                            and len(new_url_to_crawl) < URL_MAX_LENGTH):
                        # logger.info("%s: Pushing %s ...", consumer_index, new_url_to_crawl)
                        new_urls.append(new_url_to_crawl)
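                # Enqueue every accepted URL as a fresh crawl task.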
                await asyncio.gather(*[
                    task_queue.add_task(task.clone_with_url(url))
                    for url in new_urls
                ])
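                # Record each (source, target) link edge; ON CONFLICT DO NOTHING skips duplicates.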
                async with task.connection_pool.acquire() as conn:
                    await conn.executemany(
                        '''
                        INSERT INTO links(url_from, url_to, "count") VALUES($1, $2, 1)
                        ON CONFLICT DO NOTHING
                        ''', [(task.url, url) for url in new_urls])