Esempio n. 1
0
def test_seen():
    '''
    A URL is not reported as seen until add_seen() has been called for it.
    '''
    config.set_config({'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}})
    layer = datalayer.Datalayer()
    address = 'http://example.com'

    # a fresh URL object is built for every call, as equal-but-distinct
    # objects are what the lookups are handed
    assert not layer.seen(URL(address))
    layer.add_seen(URL(address))
    assert layer.seen(URL(address))
Esempio n. 2
0
def test_robots():
    '''
    Corner cases of is_plausible_robots() only; the ordinary robots
    behaviour is already covered by end-to-end tests.
    '''
    # XXX ideally this would start from the defaults in config.py, so that
    # adding a mandatory setting would not mean editing this dict
    settings = {
        'Robots': {
            'MaxTries': 4,
            'MaxRobotsPageSize': 500000
        },
        'Logging': {}
    }
    config.set_config(settings)
    checker = robots.Robots('foo', None, None)

    # (robots.txt body, whether it is expected to be judged plausible)
    cases = (
        (b'<', False),
        (b'', True),  # application/x-empty
        (b'x' * 1000001, False),
        (b'foo', True),
    )

    for body, expect_plausible in cases:
        plausible, message = checker.is_plausible_robots('example.com', body, 1.0)
        if expect_plausible:
            # accepted bodies come back with an empty message
            assert plausible
            assert not len(message)
        else:
            # rejected bodies come back with a non-empty message
            assert not plausible
            assert len(message)
Esempio n. 3
0
def test_seen():
    '''
    A URL is not reported by seen_url() until add_seen_url() has been
    called for it.
    '''
    config.set_config({'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}})
    layer = datalayer.Datalayer()
    address = 'http://example.com'

    # a fresh URL object is built for every call, as equal-but-distinct
    # objects are what the lookups are handed
    assert not layer.seen_url(URL(address))
    layer.add_seen_url(URL(address))
    assert layer.seen_url(URL(address))
Esempio n. 4
0
def test_robotscache():
    '''
    Reading a robots entry that was never cached raises KeyError; once
    cache_robots() has stored it, the same bytes are read back.
    '''
    robots_settings = {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}
    config.set_config({'Robots': robots_settings})
    layer = datalayer.Datalayer()

    key = 'http://example.com'
    body = b'THIS IS A TEST'

    with pytest.raises(KeyError):
        layer.read_robots_cache(key)

    layer.cache_robots(key, body)
    assert layer.read_robots_cache(key) == body
Esempio n. 5
0
def test_robotscache():
    '''
    The robots cache starts out empty (a read raises KeyError) and returns
    exactly what cache_robots() was given afterwards.
    '''
    config.set_config({'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}})
    cache_owner = datalayer.Datalayer()
    site = 'http://example.com'
    payload = b'THIS IS A TEST'

    # miss first ...
    with pytest.raises(KeyError):
        cache_owner.read_robots_cache(site)

    # ... then store and hit
    cache_owner.cache_robots(site, payload)
    assert cache_owner.read_robots_cache(site) == payload
Esempio n. 6
0
def test_summarize(capsys):
    '''
    After two URLs are added, summarize() writes nothing to stderr and its
    stdout report begins with '2 seen'.
    '''
    config.set_config({'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}})
    layer = datalayer.Datalayer()
    for address in ('http://example.com', 'http://example2.com'):
        layer.add_seen(URL(address))
    layer.summarize()

    stdout_text, stderr_text = capsys.readouterr()

    assert len(stderr_text) == 0
    assert stdout_text.startswith('2 seen')
Esempio n. 7
0
def test_summarize(capsys):
    '''
    After two URLs are added, summarize() writes nothing to stderr and its
    stdout report begins with '2 seen_urls'.
    '''
    config.set_config({'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}})
    layer = datalayer.Datalayer()
    for address in ('http://example.com', 'http://example2.com'):
        layer.add_seen_url(URL(address))
    layer.summarize()

    stdout_text, stderr_text = capsys.readouterr()

    assert len(stderr_text) == 0
    assert stdout_text.startswith('2 seen_urls')
Esempio n. 8
0
def main():
    '''
    Command-line entry point for the burner-thread overhead benchmark.

    Parses the options, installs a Multiprocess config built from them,
    rebinds the module-level `b` to a new Burner, fills the module-level
    `queue` with --count work items of (duration, payload), runs crawl() on
    the module-level `loop`, and finally prints wall-clock and CPU-time
    overhead figures.
    '''
    global b

    parser = argparse.ArgumentParser(description='bench_burn benchmark for burner thread overhead')
    parser.add_argument('--threads', type=int, default=2)
    parser.add_argument('--workers', type=int, default=100)
    parser.add_argument('--datasize', type=int, default=10000)
    parser.add_argument('--affinity', action='store_true')
    parser.add_argument('--duration', type=float, default=0.010)
    parser.add_argument('--count', type=int, default=10000)
    options = parser.parse_args()

    multiprocess_settings = {'BurnerThreads': options.threads, 'Affinity': options.affinity}
    config.set_config({'Multiprocess': multiprocess_settings})
    b = burner.Burner('parser')

    # one work item per requested call: how long to burn, plus a payload
    remaining = options.count
    while remaining > 0:
        queue.put_nowait((options.duration, 'x' * options.datasize))
        remaining -= 1

    print('args are', options)

    banner = 'Processing {} items of size {} kbytes and {:.3f} seconds of burn using {} burner threads'
    print(banner.format(options.count, int(options.datasize/1000), options.duration, options.threads))

    wall_start = time.time()
    cpu_start = time.process_time()

    try:
        loop.run_until_complete(crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupt. Exiting.\n')
    finally:
        # stop() is requested before run_forever(), so run_forever() makes
        # one pass over the already-scheduled callbacks and returns
        loop.stop()
        loop.run_forever()
        loop.close()

    elapsed = time.time() - wall_start
    print('Elapsed time is {:.1f} seconds.'.format(elapsed))

    # ideal wall time if the burn work were spread evenly over the threads
    expected = options.count * options.duration / options.threads
    print('Expected is {:.1f} seconds.'.format(expected))

    excess = elapsed - expected
    burner_report = 'Burner-side overhead is {}% or {:.4f} seconds per call'
    print(burner_report.format(int(excess/expected*100), excess/options.count))

    # CPU time is sampled only after the burner-side line has been printed
    celapsed = time.process_time() - cpu_start
    main_report = 'Main-thread overhead is {}%, {:.4f} seconds per call, {} calls per cpu-second'
    print(main_report.format(
        int(celapsed/elapsed*100), celapsed/options.count, int(options.count/celapsed)))
Esempio n. 9
0
def test_saveload():
    '''
    Round-trip a Datalayer through save() and load().

    The state is saved with one URL seen, a second URL is added, and the
    saved state is loaded back. The test expects the first URL to still be
    seen and the second one to be gone.
    '''
    # Only the path is needed. Close the handle straight away so it is not
    # leaked for the rest of the test run.
    with tempfile.NamedTemporaryFile(delete=False) as tf:
        name = tf.name

    try:
        c = {'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}}
        config.set_config(c)
        dl = datalayer.Datalayer()
        dl.add_seen(URL('http://example.com'))
        assert dl.seen(URL('http://example.com'))

        with open(name, 'wb') as f:
            dl.save(f)
        # added after the save, so it must not survive the load below
        dl.add_seen(URL('http://example2.com'))
        with open(name, 'rb') as f:
            dl.load(f)

        assert dl.seen(URL('http://example.com'))
        assert not dl.seen(URL('http://example2.com'))
    finally:
        # remove the temp file even when an assertion above fails
        os.unlink(name)
    assert not os.path.exists(name)
Esempio n. 10
0
def test_saveload():
    '''
    Round-trip a Datalayer through save() and load().

    The state is saved with one URL seen, a second URL is added, and the
    saved state is loaded back. The test expects the first URL to still be
    reported by seen_url() and the second one to be gone.
    '''
    # Only the path is needed. Close the handle straight away so it is not
    # leaked for the rest of the test run.
    with tempfile.NamedTemporaryFile(delete=False) as tf:
        name = tf.name

    try:
        c = {'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}}
        config.set_config(c)
        dl = datalayer.Datalayer()
        dl.add_seen_url(URL('http://example.com'))
        assert dl.seen_url(URL('http://example.com'))

        with open(name, 'wb') as f:
            dl.save(f)
        # added after the save, so it must not survive the load below
        dl.add_seen_url(URL('http://example2.com'))
        with open(name, 'rb') as f:
            dl.load(f)

        assert dl.seen_url(URL('http://example.com'))
        assert not dl.seen_url(URL('http://example2.com'))
    finally:
        # remove the temp file even when an assertion above fails
        os.unlink(name)
    assert not os.path.exists(name)
Esempio n. 11
0
import os
import sys
import logging
import functools

import asyncio

import cocrawler.burner as burner
import cocrawler.parse as parse
import cocrawler.stats as stats
import cocrawler.config as config

# Module-level benchmark setup; all of this runs at import time.
c = {'Multiprocess': {'BurnerThreads': 2}}
config.set_config(c)
loop = asyncio.get_event_loop()
# NOTE(review): the Burner is created after set_config(); presumably it reads
# the Multiprocess settings when constructed -- confirm in cocrawler.burner
b = burner.Burner('parser')
queue = asyncio.Queue()


def parse_all(name, string):
    '''
    Compare the two link extractors on the same document.

    Runs parse.find_html_links() and parse.find_html_links_and_embeds() on
    `string`. When the number of links from the first differs from the
    number of links plus embeds from the second, prints the two counts and
    what each side has that the other lacks. `name` only labels the output.
    Nothing is returned.
    '''
    links1, _ = parse.find_html_links(string)
    links2, embeds2 = parse.find_html_links_and_embeds(string)

    all2 = links2.union(embeds2)

    if len(links1) != len(all2):
        print('{} had different link counts of {} and {}'.format(
            name, len(links1), len(all2)))
        extra1 = links1.difference(all2)
        extra2 = all2.difference(links1)
        print('  extra in links:            {!r}'.format(extra1))
        # report the other direction too, so the mismatch can be diagnosed
        # from either side
        print('  extra in links and embeds: {!r}'.format(extra2))
Esempio n. 12
0
import os
import sys
import logging
import functools

import asyncio

import cocrawler.burner as burner
import cocrawler.parse as parse
import cocrawler.stats as stats
import cocrawler.config as config

# Module-level benchmark setup; all of this runs at import time.
c = {'Multiprocess': {'BurnerThreads': 2}}
config.set_config(c)
loop = asyncio.get_event_loop()
# NOTE(review): the Burner is created after set_config(); presumably it reads
# the Multiprocess settings when constructed -- confirm in cocrawler.burner
b = burner.Burner('parser')
queue = asyncio.Queue()


def parse_all(name, string):
    '''
    Cross-check parse.find_html_links() against
    parse.find_html_links_and_embeds() for one document.

    If the link count from the first differs from the combined link and
    embed count from the second, print both counts and the items unique to
    each side. `name` is used only to label the output.
    '''
    plain_links, _ = parse.find_html_links(string)
    split_links, split_embeds = parse.find_html_links_and_embeds(string)
    combined = split_links.union(split_embeds)

    plain_count = len(plain_links)
    combined_count = len(combined)
    if plain_count == combined_count:
        # counts agree; nothing to report
        return

    print('{} had different link counts of {} and {}'.format(name, plain_count, combined_count))
    only_in_plain = plain_links.difference(combined)
    only_in_combined = combined.difference(plain_links)
    print('  extra in links:            {!r}'.format(only_in_plain))
    print('  extra in links and embeds: {!r}'.format(only_in_combined))
Esempio n. 13
0
def test_useragent():
    '''
    Exercise useragent.useragent() for the 'crawler' style, for the three
    '*plus' styles, and for settings that are expected to raise ValueError.
    '''
    base = {
        'UserAgent': {
            'Style': 'crawler',
            'MyPrefix': 'something',
            'URL': 'http://example.com/cocrawler.html'
        }
    }
    config.set_config(base)
    version = '1.0'

    robotname, ua = useragent.useragent(version)
    assert version in ua
    assert 'http://example.com/cocrawler.html' in ua
    assert robotname == 'something-cocrawler'

    # these styles are switched in place, without resetting the config
    for style in ('laptopplus', 'tabletplus', 'phoneplus'):
        config.write(style, 'UserAgent', 'Style')
        robotname, ua = useragent.useragent(version)
        assert 'Mozilla/5.0' in ua

    # (value, UserAgent key) pairs that must each be rejected; the config
    # is reset to the base before every one
    rejected = (
        ('error', 'Style'),
        ('ha ha I left this off', 'URL'),
        ('http://cocrawler.com/cocrawler.html', 'URL'),
        ('test', 'MyPrefix'),
        ('', 'MyPrefix'),
    )
    for value, key in rejected:
        config.set_config(base)
        config.write(value, 'UserAgent', key)
        with pytest.raises(ValueError):
            robotname, ua = useragent.useragent(version)
Esempio n. 14
0
def test_useragent():
    '''
    Check the user agent built for the 'crawler' style and the three
    '*plus' styles, then check that each bad setting raises ValueError.
    '''
    page = 'http://example.com/cocrawler.html'
    settings = {'UserAgent': {'Style': 'crawler',
                              'MyPrefix': 'something',
                              'URL': page}}
    version = '1.0'

    def expect_rejected(value, key):
        # start again from the known-good settings, break one key, and
        # require useragent() to refuse the result
        config.set_config(settings)
        config.write(value, 'UserAgent', key)
        with pytest.raises(ValueError):
            _name, _agent = useragent.useragent(version)

    config.set_config(settings)
    robotname, ua = useragent.useragent(version)

    assert version in ua
    assert 'http://example.com/cocrawler.html' in ua
    assert robotname == 'something-cocrawler'

    # the '*plus' styles are written over the live config one after another
    for style in ['laptopplus', 'tabletplus', 'phoneplus']:
        config.write(style, 'UserAgent', 'Style')
        robotname, ua = useragent.useragent(version)
        assert 'Mozilla/5.0' in ua

    expect_rejected('error', 'Style')
    expect_rejected('ha ha I left this off', 'URL')
    expect_rejected('http://cocrawler.com/cocrawler.html', 'URL')
    expect_rejected('test', 'MyPrefix')
    expect_rejected('', 'MyPrefix')