Example 1
def main():
    '''
    Main program: parse args, read config, set up event loop, run the crawler.
    '''

    args = ARGS.parse_args()

    if args.printdefault:
        conf.print_default()
        sys.exit(1)

    levels = [logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.loglevel, len(levels) - 1)])

    config = conf.config(args.configfile,
                         args.config,
                         confighome=not args.no_confighome)
    limit_resources(config)

    kwargs = {}
    if args.load:
        kwargs['load'] = args.load
    if args.no_test:
        kwargs['no_test'] = True

    loop = asyncio.get_event_loop()
    crawler = cocrawler.Crawler(loop, config, **kwargs)

    if config.get('CarbonStats'):
        timer.start_carbon(loop, config)

    if config['REST']:
        app = webserver.make_app(loop, config)
    else:
        app = None

    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupt. Exiting cleanly.\n')
        crawler.cancel_workers()
    finally:
        crawler.close()
        if app:
            webserver.close(app)
        if config.get('CarbonStats'):
            timer.close()
        # apparently this is needed for full aiohttp cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
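
main() refers to a module-level ARGS parser and a limit_resources() helper that this example does not show. Below is a minimal sketch of such a parser, inferred only from the args attributes used in the function and from the parser pattern in Example 3; the exact flag spellings, defaults, and types are assumptions, not the project's actual definitions.

import argparse

ARGS = argparse.ArgumentParser(description='CoCrawler web crawler')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--no-confighome', action='store_true')
ARGS.add_argument('--printdefault', action='store_true')
ARGS.add_argument('--loglevel', type=int, default=0)
ARGS.add_argument('--load', action='store')
ARGS.add_argument('--no-test', action='store_true')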
Example 2
def test_cocrawler(capsys):
    config = conf.config(None, None, confighome=False)

    # ok, we have to get around the useragent checks
    config['UserAgent']['MyPrefix'] = 'pytest'
    config['UserAgent']['URL'] = 'http://example.com/pytest-test-cocrawler.py'

    loop = asyncio.get_event_loop()
    crawler = cocrawler.Crawler(loop, config)

    crawler.add_url(0, URL('http://example1.com/'), seed=True)
    crawler.add_url(0, URL('http://example2.com/'), seed=True)
    crawler.add_url(0, URL('http://example3.com/'), seed=True)
    assert crawler.qsize == 3

    # create a scratch file for the saved queue, and close the handle so the
    # file can be reopened by name below
    f = tempfile.NamedTemporaryFile(delete=False)
    name = f.name
    f.close()

    with open(name, 'wb') as f:
        crawler.save(f)
    assert crawler.qsize == 0  # saving drains the queue

    crawler.add_url(0, URL('http://example4.com/'), seed=True)
    assert crawler.qsize == 1

    # loading replaces whatever is currently queued with the saved queue
    with open(name, 'rb') as f:
        crawler.load(f)

    assert crawler.qsize == 3

    os.unlink(name)
    assert not os.path.exists(name)

    # clear out the existing capture
    out, err = capsys.readouterr()

    crawler.summarize()

    out, err = capsys.readouterr()

    assert err == ''
    assert len(out) >= 242  # not a very good test, but at least it is something
Example 3
import argparse

import aiohttp

import cocrawler.conf as conf
import cocrawler.dns as dns

ARGS = argparse.ArgumentParser(description='CoCrawler dns benchmark')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--no-confighome', action='store_true')
ARGS.add_argument('--count', type=int, default=1000)

args = ARGS.parse_args()

config = conf.config(args.configfile,
                     args.config,
                     confighome=not args.no_confighome)
max_workers = config['Crawl']['MaxWorkers']

ns = config['Fetcher'].get('Nameservers')
if not isinstance(ns, list):
    ns = [ns]

#resolver = aiohttp.resolver.AsyncResolver(nameservers=ns)  # Can I pass rotate=True into this?
#connector = aiohttp.connector.TCPConnector(resolver=resolver, family=socket.AF_INET)
#session = aiohttp.ClientSession(connector=connector)
exit_value = 0

dns.setup_resolver(ns)
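
The listing is cut off before the benchmark loop itself; the commented-out lines above suggest resolving through aiohttp's AsyncResolver. Below is a minimal sketch of such a timing loop, written against aiohttp's resolver directly rather than cocrawler.dns; the hostnames and the way args.count is consumed are illustrative assumptions.

import asyncio
import socket
import time

async def time_lookups(hosts, count):
    # resolve `count` names round-robin through the configured nameservers
    resolver = aiohttp.resolver.AsyncResolver(nameservers=ns)
    start = time.monotonic()
    failures = 0
    for i in range(count):
        try:
            await resolver.resolve(hosts[i % len(hosts)], 80, family=socket.AF_INET)
        except OSError:
            failures += 1
    await resolver.close()
    return time.monotonic() - start, failures

# elapsed, failures = asyncio.get_event_loop().run_until_complete(
#     time_lookups(['example.com', 'example.org'], args.count))
# print('%d lookups in %.2f s, %d failed' % (args.count, elapsed, failures))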