async def test_prefetch():
    """Prefetching a URL twice must return a non-empty, cached (identical) answer."""
    target = URL('http://example.com/')
    config.config(None, None)
    res = dns.get_resolver()

    first = await dns.prefetch(target, res)
    assert len(first) > 0

    # the second prefetch should be answered from the cache with the same list
    second = await dns.prefetch(target, res)
    assert first == second
async def test_cocrawler(capsys):
    """Exercise crawler queue save/load round-trip plus a summarize() smoke test."""
    config.config(None, None)
    # we have to get around the useragent checks
    config.write('pytest', 'UserAgent', 'MyPrefix')
    config.write('http://example.com/pytest-test-cocrawler.py', 'UserAgent', 'URL')
    # and configure url_allowed
    config.write('AllDomains', 'Plugins', 'url_allowed')

    crawler = cocrawler.Crawler()
    crawler.add_url(0, {'url': URL('http://example1.com/')})
    crawler.add_url(0, {'url': URL('http://example2.com/')})
    crawler.add_url(0, {'url': URL('http://example3.com/')})
    assert crawler.qsize == 3

    # bugfix: the NamedTemporaryFile handle was previously never closed
    # (the name `f` was immediately shadowed), leaking a file descriptor.
    # We only need the file's name here; delete=False keeps it on disk.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        name = f.name

    with open(name, 'wb') as f:
        crawler.save(f)
    assert crawler.qsize == 0  # save drains the queue

    crawler.add_url(0, {'url': URL('http://example4.com/')})
    assert crawler.qsize == 1

    with open(name, 'rb') as f:
        crawler.load(f)
    assert crawler.qsize == 3  # load replaces the queue with the saved one

    os.unlink(name)
    assert not os.path.exists(name)

    # clear out the existing capture
    out, err = capsys.readouterr()
    crawler.summarize()
    out, err = capsys.readouterr()
    assert err == ''
    assert len(out) >= 200  # not a very good test, but at least it is something

    await crawler.close()  # needed for smooth shutdown
async def test_cocrawler(capsys):
    """Round-trip the crawler queue through save/load, then smoke-test summarize()."""
    config.config(None, None)
    # we have to get around the useragent checks
    config.write('pytest', 'UserAgent', 'MyPrefix')
    config.write('http://example.com/pytest-test-cocrawler.py', 'UserAgent', 'URL')
    # and configure url_allowed
    config.write('AllDomains', 'Plugins', 'url_allowed')

    crawler = cocrawler.Crawler()
    crawler.add_url(0, {'url': URL('http://example1.com/')})
    crawler.add_url(0, {'url': URL('http://example2.com/')})
    crawler.add_url(0, {'url': URL('http://example3.com/')})
    assert crawler.qsize == 3

    # bugfix: close the NamedTemporaryFile handle instead of leaking its fd;
    # only the generated name is needed (delete=False keeps the file around).
    with tempfile.NamedTemporaryFile(delete=False) as f:
        name = f.name

    with open(name, 'wb') as f:
        crawler.save(f)
    assert crawler.qsize == 0  # saving empties the queue

    crawler.add_url(0, {'url': URL('http://example4.com/')})
    assert crawler.qsize == 1

    with open(name, 'rb') as f:
        crawler.load(f)
    assert crawler.qsize == 3  # loading restores the saved queue

    os.unlink(name)
    assert not os.path.exists(name)

    # clear out the existing capture
    out, err = capsys.readouterr()
    crawler.summarize()
    out, err = capsys.readouterr()
    assert err == ''
    assert len(out) >= 200  # not a very good test, but at least it is something

    await crawler.close()  # needed for smooth shutdown
import asyncio import cocrawler.dns as dns import cocrawler.config as config ARGS = argparse.ArgumentParser(description='CoCrawler dns fetcher') ARGS.add_argument('--config', action='append') ARGS.add_argument('--configfile', action='store') ARGS.add_argument('--no-confighome', action='store_true') ARGS.add_argument('--type', default='A') ARGS.add_argument('hosts', nargs='+', help='list of hostnames to query') args = ARGS.parse_args() config.config(args.configfile, args.config, confighome=not args.no_confighome) ns = config.read('Fetcher', 'Nameservers') if not isinstance(ns, list): ns = [ns] dns.setup_resolver(ns) print('set nameservers to', ns) async def main(hosts): for host in hosts: try: result = await dns.query(host, args.type) print(host, result) except Exception as e:
def main():
    '''
    Main program: parse args, read config, set up event loop, run the crawler.
    '''
    args = ARGS.parse_args()

    if args.printdefault:
        config.print_default()
        sys.exit(1)

    # env var wins over the command-line flag
    loglevel = os.getenv('COCRAWLER_LOGLEVEL') or args.loglevel
    logging.basicConfig(level=loglevel)

    config.config(args.configfile, args.config)

    if args.printfinal:
        config.print_final()
        sys.exit(1)

    memory.limit_resources()

    if os.getenv('PYTHONASYNCIODEBUG') is not None:
        logging.captureWarnings(True)
        warnings.simplefilter('default', category=ResourceWarning)
        if LOGGER.getEffectiveLevel() > logging.WARNING:
            LOGGER.setLevel(logging.WARNING)
            LOGGER.warning('Lowered logging level to WARNING because PYTHONASYNCIODEBUG env var is set')
        LOGGER.warning('Configured logging system to show ResourceWarning because PYTHONASYNCIODEBUG env var is set')
        LOGGER.warning('Note that this does have a significant impact on asyncio overhead')

    if os.getenv('COCRAWLER_GC_DEBUG') is not None:
        LOGGER.warning('Configuring gc debugging')
        gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_UNCOLLECTABLE)

    kwargs = {}
    if args.load:
        kwargs['load'] = args.load
    if args.no_test:
        kwargs['no_test'] = True
    crawler = cocrawler.Crawler(**kwargs)

    loop = asyncio.get_event_loop()
    slow_callback_duration = os.getenv('ASYNCIO_SLOW_CALLBACK_DURATION')
    if slow_callback_duration:
        loop.slow_callback_duration = float(slow_callback_duration)
        # bugfix: %f cannot format the raw env *string*; log the parsed float
        LOGGER.warning('set slow_callback_duration to %f', loop.slow_callback_duration)

    if config.read('CarbonStats'):
        timer.start_carbon()

    if config.read('REST'):
        app = webserver.make_app()
    else:
        app = None

    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupt. Exiting cleanly.\n')
        crawler.cancel_workers()
    finally:
        loop.run_until_complete(crawler.close())
        if app:
            webserver.close(app)
        if config.read('CarbonStats'):
            timer.close()
        # apparently this is needed for full aiohttp cleanup -- or is it cargo cult
        loop.stop()
        loop.run_forever()
        loop.close()
# Benchmark-script setup: parse command-line options, load the cocrawler
# configuration, normalize the nameserver list, and create a shared resolver.
# NOTE(review): argparse and asyncio are referenced but not imported in this
# chunk -- presumably imported earlier in the file; confirm.
import random
import os

from cocrawler.urls import URL
import cocrawler.dns as dns
import cocrawler.config as config

ARGS = argparse.ArgumentParser(description='CoCrawler dns benchmark')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--count', type=int, default=1000)
ARGS.add_argument('--expect-not-suitable', action='store_true')

args = ARGS.parse_args()
config.config(args.configfile, args.config)
max_workers = config.read('Crawl', 'MaxWorkers')

ns = config.read('Fetcher', 'Nameservers')
if isinstance(ns, str):
    ns = [ns]
# write the normalized list form back so later readers always get a list
config.write(ns, 'Fetcher', 'Nameservers')

# process exit status, presumably updated by benchmark checks later in the file
exit_value = 0
resolver = dns.get_resolver()


def create_queue():
    # Build the benchmark work queue (definition continues beyond this chunk).
    queue = asyncio.Queue()
    # add a fake domain to make sure the dns doesn't send unknown hosts to a search
def main():
    '''
    Main program: parse args, read config, set up event loop, run the crawler.
    '''
    args = ARGS.parse_args()

    if args.printdefault:
        config.print_default()
        sys.exit(1)

    # precedence: env var, then --loglevel, then --verbose implies DEBUG
    loglevel = os.getenv('COCRAWLER_LOGLEVEL')
    if loglevel is None and args.loglevel:
        loglevel = args.loglevel
    if loglevel is None and args.verbose:
        loglevel = 'DEBUG'
    logging.basicConfig(level=loglevel)

    config.config(args.configfile, args.config)

    if args.printfinal:
        config.print_final()
        sys.exit(1)

    memory.limit_resources()

    if os.getenv('PYTHONASYNCIODEBUG') is not None:
        logging.captureWarnings(True)
        warnings.simplefilter('default', category=ResourceWarning)
        if LOGGER.getEffectiveLevel() > logging.WARNING:
            LOGGER.setLevel(logging.WARNING)
            LOGGER.warning('Lowered logging level to WARNING because PYTHONASYNCIODEBUG env var is set')
        LOGGER.warning('Configured logging system to show ResourceWarning because PYTHONASYNCIODEBUG env var is set')
        LOGGER.warning('Note that this does have a significant impact on asyncio overhead')

    if os.getenv('COCRAWLER_GC_DEBUG') is not None:
        LOGGER.warning('Configuring gc debugging')
        gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_UNCOLLECTABLE)

    kwargs = {}
    if args.load:
        kwargs['load'] = args.load
    if args.no_test:
        kwargs['no_test'] = True
    crawler = cocrawler.Crawler(**kwargs)

    loop = asyncio.get_event_loop()
    slow_callback_duration = os.getenv('ASYNCIO_SLOW_CALLBACK_DURATION')
    if slow_callback_duration:
        loop.slow_callback_duration = float(slow_callback_duration)
        # bugfix: %f cannot format the raw env *string*; log the parsed float
        LOGGER.warning('set slow_callback_duration to %f', loop.slow_callback_duration)

    if config.read('CarbonStats'):
        timer.start_carbon()

    if config.read('REST'):
        app = webserver.make_app()
    else:
        app = None

    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupt. Exiting cleanly.\n')
        crawler.cancel_workers()
    finally:
        loop.run_until_complete(crawler.close())
        if app:
            webserver.close(app)
        if config.read('CarbonStats'):
            timer.close()
        # vodoo recommended by advanced aiohttp docs for graceful shutdown
        # https://github.com/aio-libs/aiohttp/issues/1925
        loop.run_until_complete(asyncio.sleep(0.250))
        loop.close()
# Offline inspector for a saved crawl: load the state file named on the
# command line and, on request, dump the frontier.
import sys

import cocrawler
import cocrawler.config as config

savefile = sys.argv[1]

config.config(None, None)
crawler = cocrawler.Crawler(load=savefile)

# at this point the crawler won't start until we call loop.run_until_complete ...

if sys.argv[2] == 'frontier':
    crawler.scheduler.dump_frontier()
# Benchmark-script setup (duplicate chunk): parse options, load the cocrawler
# configuration, normalize the nameserver list, and create a shared resolver.
# NOTE(review): argparse and asyncio are referenced but not imported in this
# chunk -- presumably imported earlier in the file; confirm.
import random
import os

from cocrawler.urls import URL
import cocrawler.dns as dns
import cocrawler.config as config

ARGS = argparse.ArgumentParser(description='CoCrawler dns benchmark')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--count', type=int, default=1000)
ARGS.add_argument('--expect-not-suitable', action='store_true')

args = ARGS.parse_args()
config.config(args.configfile, args.config)
max_workers = config.read('Crawl', 'MaxWorkers')

ns = config.read('Fetcher', 'Nameservers')
if isinstance(ns, str):
    ns = [ns]
# write the normalized list form back so later readers always get a list
config.write(ns, 'Fetcher', 'Nameservers')

# process exit status, presumably updated by benchmark checks later in the file
exit_value = 0
resolver = dns.get_resolver()


def create_queue():
    # Build the benchmark work queue (definition continues beyond this chunk).
    queue = asyncio.Queue()
    # add a fake domain to make sure the dns doesn't send unknown hosts to a search