def extract_async(url):
    data = yield from fetch_async(url)
    found_urls = set()
    for match in URL_EXPR.finditer(data):
        found = canonicalize(match.group('url'))
        if same_domain(url, found):
            found_urls.add(urljoin(url, found))
    return url, data, sorted(found_urls)
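extract_async relies on a few helpers that are defined elsewhere. A minimal sketch of what they might look like, assuming a regex-based link matcher and urllib.parse for normalization (URL_EXPR, canonicalize, and same_domain below are illustrative stand-ins, not the original definitions):

import re
from urllib.parse import urldefrag, urlparse

# Crude href matcher; the real pattern may differ.
URL_EXPR = re.compile(r'href="(?P<url>[^"]+)"')

def canonicalize(url):
    # Strip the fragment so '/page#a' and '/page#b' dedupe to one URL.
    return urldefrag(url)[0]

def same_domain(base, found):
    # Relative links have no netloc; treat them as same-domain.
    found_netloc = urlparse(found).netloc
    return not found_netloc or found_netloc == urlparse(base).netloc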
def crawl_parallel(start_url, max_depth):
    fetch_queue = Queue()  # (crawl_depth, url)
    fetch_queue.put((0, canonicalize(start_url)))
    seen_urls, result = set(), []
    func = lambda: consumer(fetch_queue, max_depth, seen_urls, result)
    for _ in range(3):
        Thread(target=func, daemon=True).start()
    fetch_queue.join()
    return result
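crawl_parallel hands all of the work to consumer threads, and the consumer function itself isn't shown here. A rough sketch of one, assuming a hypothetical synchronous extract helper that mirrors extract_async:

def consumer(fetch_queue, max_depth, seen_urls, result):
    # Each worker loops forever: pop a (depth, url) pair, record the page,
    # and re-enqueue its links one level deeper until max_depth is hit.
    while True:
        depth, url = fetch_queue.get()
        try:
            if url in seen_urls:
                continue
            seen_urls.add(url)
            url, data, found_urls = extract(url)  # hypothetical sync twin of extract_async
            result.append((depth, url, data))
            if depth < max_depth:
                for found in found_urls:
                    fetch_queue.put((depth + 1, found))
        except Exception:
            pass  # a failed fetch shouldn't kill the worker
        finally:
            fetch_queue.task_done()

Because every put is eventually matched by a task_done call, fetch_queue.join() in crawl_parallel returns once the whole frontier has been drained.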
def crawl_async(start_url, max_depth):
    seen_urls = set()
    to_fetch = [canonicalize(start_url)]
    results = []
    for depth in range(max_depth + 1):
        batch = yield from extract_multi_async(to_fetch, seen_urls)
        to_fetch = []
        for url, data, found_urls in batch:
            results.append((depth, url, data))
            to_fetch.extend(found_urls)
    return results
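extract_multi_async is the fan-out step and isn't reproduced here. One plausible sketch, assuming it skips already-seen URLs and collects results with asyncio.as_completed as each fetch finishes (again a guess at the original, not a copy of it):

import asyncio

def extract_multi_async(to_fetch, seen_urls):
    # Fan out one extract_async coroutine per unseen URL, then fan the
    # results back in as each fetch completes, ignoring failures.
    futures = []
    for url in to_fetch:
        if url not in seen_urls:
            seen_urls.add(url)
            futures.append(extract_async(url))
    batch = []
    for future in asyncio.as_completed(futures):
        try:
            batch.append((yield from future))
        except Exception:
            pass  # skip pages that failed to download
    return batch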
def main():
    url = canonicalize(argv[1])
    # Bridge the gap between sync and async
    future = asyncio.Task(extract_async(url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(future)
    loop.close()
    _, data, found_urls = future.result()  # Will raise exception
    print('%s is %d bytes, %d urls:\n%s' %
          (url, len(data), len(found_urls), '\n'.join(found_urls)))
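The same bridging pattern drives the coroutine-based crawler. A hypothetical entry point (the function name and max_depth value are illustrative):

import asyncio
from sys import argv

def crawl_main():
    start = canonicalize(argv[1])
    loop = asyncio.get_event_loop()
    task = asyncio.Task(crawl_async(start, max_depth=2))
    loop.run_until_complete(task)
    loop.close()
    for depth, url, data in task.result():
        print('%d %s: %d bytes' % (depth, url, len(data)))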
def parallel_wordcount(start_url, max_depth, word_length):
    fetch_queue = Queue()  # (crawl_depth, url)
    fetch_queue.put((0, canonicalize(start_url)))
    count_queue = Queue()  # (url, data)
    seen_urls = set()

    func = lambda: fetcher(fetch_queue, max_depth, seen_urls, count_queue)
    for _ in range(3):
        Thread(target=func, daemon=True).start()

    result = []
    func = lambda: counter(count_queue, word_length, result)
    for _ in range(3):
        Thread(target=func, daemon=True).start()

    fetch_queue.join()
    count_queue.join()
    return result
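parallel_wordcount wires two thread pools together through count_queue, but the fetcher and counter workers aren't shown above. A sketch of how they might look, assuming hypothetical fetch and extract_urls helpers:

def fetcher(fetch_queue, max_depth, seen_urls, count_queue):
    # Stage 1: download pages, hand their text to the counters, and
    # re-enqueue same-domain links one level deeper until max_depth.
    while True:
        depth, url = fetch_queue.get()
        try:
            if url in seen_urls:
                continue
            seen_urls.add(url)
            data = fetch(url)  # hypothetical blocking download
            count_queue.put((url, data))
            if depth < max_depth:
                for found in extract_urls(url, data):  # hypothetical link extractor
                    fetch_queue.put((depth + 1, found))
        except Exception:
            pass  # a failed download shouldn't kill the worker
        finally:
            fetch_queue.task_done()

def counter(count_queue, word_length, result):
    # Stage 2: count words of the requested length in each page.
    while True:
        url, data = count_queue.get()
        try:
            count = sum(1 for word in data.split() if len(word) == word_length)
            result.append((url, count))
        finally:
            count_queue.task_done()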