Example #1
def test_hashtag_links():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent_page = frontier.seed_page(site.id)
    assert not parent_page.hashtags
    outlinks = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 3
    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
        'http://example.org/', 'http://example.org/bar',
        'http://example.org/zuh'
    ]
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == [
        '#foo',
    ]
    assert pages[0].hops_from_seed == 0

    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz', '#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12
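
A note on what test_hashtag_links exercises: outlinks that differ only in their
fragment collapse to a single page, and the fragments accumulate on that page's
hashtags. A toy sketch of the canonicalization step (an illustration, not
brozzler's actual code):

import urllib.parse

def split_hashtag(url):
    # split a url into its defragmented form and its '#fragment', if any
    parsed = urllib.parse.urlparse(url)
    defragged = parsed._replace(fragment='').geturl()
    hashtag = '#%s' % parsed.fragment if parsed.fragment else None
    return defragged, hashtag

assert split_hashtag('http://example.org/bar#baz') == (
        'http://example.org/bar', '#baz')
assert split_hashtag('http://example.org/bar') == (
        'http://example.org/bar', None)
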
Example #2
def test_login(httpd):
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site2/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
        'username': '******', 'password': '******'})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)   # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(rr.table('captures').filter(
                {'test_id':test_id}).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
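
Several tests in this section take an httpd fixture that is not defined here.
A minimal sketch of what such a fixture could look like, assuming pytest and
an htdocs directory next to the test file; the real fixture in the brozzler
test suite may differ:

import http.server
import os
import threading

import pytest

@pytest.fixture(scope='module')
def httpd():
    # serve the test suite's htdocs directory on an ephemeral port
    class Handler(http.server.SimpleHTTPRequestHandler):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, directory=os.path.join(
                os.path.dirname(__file__), 'htdocs'), **kwargs)
    server = http.server.HTTPServer(('localhost', 0), Handler)
    th = threading.Thread(target=server.serve_forever)
    th.start()
    yield server  # tests read server.server_port
    server.shutdown()
    th.join()
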
Example #3
def brozzler_new_job():
    '''
    Command line utility entry point for queuing a new brozzler job. Takes a
    yaml brozzler job configuration file and creates job, site, and page
    objects in rethinkdb, which brozzler-workers will look at and start
    crawling.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-job - queue new job with brozzler',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            'job_conf_file', metavar='JOB_CONF_FILE',
            help='brozzler job configuration file in yaml')
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.job.new_job_file(frontier, args.job_conf_file)
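
For reference, the job configuration file this entry point consumes is yaml
with at least a list of seeds (the dict equivalent appears in test_basics
later in this section). A hedged sketch of a minimal conf and of queuing it
without going through argparse:

import tempfile

import brozzler
import rethinkstuff

job_conf_yaml = '''
seeds:
  - url: http://example.com/
  - url: https://example.org/
'''

r = rethinkstuff.Rethinker(['localhost'], 'brozzler')
frontier = brozzler.RethinkDbFrontier(r)
with tempfile.NamedTemporaryFile('w', suffix='.yml') as f:
    f.write(job_conf_yaml)
    f.flush()
    brozzler.job.new_job_file(frontier, f.name)
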
Example #4
def test_claim_site():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_site(worker_id='test_claim_site')

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed_site = frontier.claim_site(worker_id='test_claim_site')
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= (
            doublethink.utcnow() - datetime.timedelta(minutes=1))
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_site(worker_id='test_claim_site')

    # a site last claimed less than an hour ago is not yet reclaimable
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(
        minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_site(worker_id='test_claim_site')

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed_site = frontier.claim_site(worker_id='test_claim_site')
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()
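
The claim rules this test pins down reduce to a simple predicate: a site is
claimable if it is unclaimed, or if its last claim is more than an hour stale.
A sketch of that predicate (the real check is a reql query inside the
frontier):

import datetime

import doublethink

def claimable(site):
    # unclaimed, or claimed so long ago the worker is presumed dead
    return (not site.claimed
            or doublethink.utcnow() - site.last_claimed
                > datetime.timedelta(hours=1))
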
Example #5
def test_redirect_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site9/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
            'http://localhost:%s/site9/redirect.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == (
            'http://localhost:%s/site9/redirect.html' % httpd.server_port)
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#hash1', '#hash2']

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    redirect_url = 'http://localhost:%s/site9/redirect.html' % httpd.server_port
    redirect_captures = [
            c for c in captures
            if c['url'] == redirect_url and c['http_method'] == 'GET']
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags
Example #6
def test_scope_and_schedule_outlinks():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/'})
    parent_page = brozzler.Page(rr, {
        'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    assert sorted(parent_page.outlinks['rejected']) == [
            'https://example.com/', 'https://example.com/foo']
    assert sorted(parent_page.outlinks['accepted']) == [
                'http://example.com/BAZZZZ', 'http://example.com/BAr',
                'http://example.com/bar']
    assert parent_page.outlinks['blocked'] == []

    pp = brozzler.Page.load(rr, parent_page.id)
    assert pp == parent_page

    for url in parent_page.outlinks['rejected']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id) is None
    for url in parent_page.outlinks['accepted']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id)
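
The monkeypatching of brozzler.is_permitted_by_robots seen above recurs
throughout these tests; a small contextmanager could factor it out. A sketch,
not a helper that actually exists in the test suite:

import contextlib

import brozzler

@contextlib.contextmanager
def permissive_robots():
    # temporarily make robots.txt checks always pass
    orig = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        yield
    finally:
        brozzler.is_permitted_by_robots = orig

# usage:
#     with permissive_robots():
#         frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
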
Example #7
def test_seed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()

    assert frontier.seed_page(site.id) is None

    page1 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1})
    page1.save()

    assert frontier.seed_page(site.id) is None

    page0 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0})
    page0.save()

    assert frontier.seed_page(site.id) == page0
Example #8
def test_basics():
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [
        {'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    assert job == {
        'id': job.id,
        'conf': {
            'seeds': [
                {'url': 'http://example.com'},
                {'url': 'https://example.org/'}
            ]
        },
        'status': 'ACTIVE',
        'starts_and_stops': [
            {
                'start': job.starts_and_stops[0]['start'],
                'stop': None
            }
        ]
    }

    sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
    assert len(sites) == 2
    assert sites[0].starts_and_stops[0]['start']
    assert sites[1].starts_and_stops[0]['start']
    assert sites[0] == {
        'claimed': False,
        'id': sites[0].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {
            'surt': 'http://(com,example,)/'
        },
        'seed': 'http://example.com',
        'starts_and_stops': [
            {
                'start': sites[0].starts_and_stops[0]['start'],
                'stop': None
            }
        ],
        'status': 'ACTIVE'
    }
    assert sites[1] == {
        'claimed': False,
        'id': sites[1].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {
            'surt': 'https://(org,example,)/',
        },
        'seed': 'https://example.org/',
        'starts_and_stops': [
            {
                'start': sites[1].starts_and_stops[0]['start'],
                'stop': None,
            },
        ],
        'status': 'ACTIVE',
    }

    pages = list(frontier.site_pages(sites[0].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[0].id,
        'url': 'http://example.com',
    }
    pages = list(frontier.site_pages(sites[1].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[1].id,
        'url': 'https://example.org/',
    }

    # test "brozzled" parameter of frontier.site_pages
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1
    pages[0].brozzle_count = 1
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
    pages[0].brozzle_count = 32819
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
Example #9
def test_brozzle_site(httpd):
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        assert len(list(frontier.site_pages(site.id))) == 1
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/site1/' % httpd.server_port,
            'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert robots in captures_by_url
    assert page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload

    url = 'screenshot:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'

    url = 'thumbnail:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'
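
test_brozzle_site relies on stop_service() and start_service() helpers that
are not shown in this section. A plausible sketch, assuming the
brozzler-worker daemon is managed by systemd on the test host; the real
helpers may drive a different service manager:

import subprocess

def stop_service(name):
    subprocess.check_call(['sudo', 'systemctl', 'stop', name])

def start_service(name):
    subprocess.check_call(['sudo', 'systemctl', 'start', name])
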
Example #10
def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site7/')
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        make_url(httpd, '/site7/foo.html')
    ]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site7/foo.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        '#boosh',
        '#ignored',
        '#whee',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
        c['url']: c
        for c in captures if c['http_method'] != 'HEAD'
    }
    assert seed_url in captures_by_url
    assert make_url(httpd, '/site7/foo.html') in captures_by_url
    assert make_url(httpd, '/site7/whee.txt') in captures_by_url
    assert make_url(httpd, '/site7/boosh.txt') in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:%s' % make_url(httpd,
                                      '/site7/foo.html') in captures_by_url
    assert 'thumbnail:%s' % make_url(httpd,
                                     '/site7/foo.html') in captures_by_url
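
make_url(), used here and in test_stop_crawl below, is another helper not
defined in this section; a one-line sketch consistent with how the other
tests build their urls:

def make_url(httpd, path):
    # e.g. make_url(httpd, '/site7/') -> 'http://localhost:8000/site7/'
    return 'http://localhost:%s%s' % (httpd.server_port, path)
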
Example #11
def test_max_hops_off():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr, {
            'seed': 'http://example.com/',
            'scope': {
                'max_hops_off_surt': 1,
                'blocks': [{
                    'ssurt': 'domain,bad,'
                }]
            }
        })
    brozzler.new_site(frontier, site)
    site.refresh()  # get it back from the db

    # this param was renamed, so the old name should no longer appear
    assert 'max_hops_off_surt' not in site.scope
    assert site.scope['max_hops_off'] == 1

    seed_page = frontier.seed_page(site.id)

    assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
    assert site.accept_reject_or_neither('https://example.com/toot',
                                         seed_page) is None
    assert site.accept_reject_or_neither('http://example.com/toot',
                                         seed_page) is True
    assert site.accept_reject_or_neither('https://some.bad.domain/something',
                                         seed_page) is False

    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        # two of these are in scope because of max_hops_off
        frontier.scope_and_schedule_outlinks(site, seed_page, [
            'http://foo.org/', 'https://example.com/toot',
            'http://example.com/toot', 'https://some.bad.domain/something'
        ])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)

    assert len(pages) == 4
    assert pages[0].url == 'http://example.com/'
    assert pages[0].hops_off == 0
    assert 'hops_off_surt' not in pages[0]
    assert set(pages[0].outlinks['accepted']) == {
        'https://example.com/toot', 'http://foo.org/',
        'http://example.com/toot'
    }
    assert pages[0].outlinks['blocked'] == []
    assert pages[0].outlinks['rejected'] == [
        'https://some.bad.domain/something'
    ]
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://example.com/toot',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'https://example.com/toot',
        'via_page_id': seed_page.id
    } in pages

    # the next hop is past max_hops_off, but a url that's normally in scope
    # is still in scope
    foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(
            site, foo_page, ['http://foo.org/bar', 'http://example.com/blah'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert foo_page == {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id,
        'outlinks': {
            'accepted': ['http://example.com/blah'],
            'blocked': [],
            'rejected': ['http://foo.org/bar'],
        }
    }
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 5
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 2,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 11,
        'site_id': site.id,
        'url': 'http://example.com/blah',
        'via_page_id': foo_page.id
    } in pages
Example #12
def brozzler_purge(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='brozzler-purge - purge crawl state from rethinkdb',
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
            '--job', dest='job', metavar='JOB_ID', help=(
                'purge crawl state from rethinkdb for a job, including all '
                'sites and pages'))
    group.add_argument(
            '--site', dest='site', metavar='SITE_ID', help=(
                'purge crawl state from rethinkdb for a site, including all '
                'pages'))
    group.add_argument(
            '--finished-before', dest='finished_before', metavar='YYYY-MM-DD',
            help=('purge crawl state from rethinkdb for a jobs that ended '
                  'before this date'))
    arg_parser.add_argument(
            '--force', dest='force', action='store_true', help=(
                'purge even if the job or site still has status ACTIVE'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    if args.job:
        try:
            job_id = int(args.job)
        except ValueError:
            job_id = args.job
        job = brozzler.Job.load(rr, job_id)
        if not job:
            logging.fatal('no such job %r', job_id)
            sys.exit(1)
        if job.status == 'ACTIVE':
            if args.force:
                logging.warning(
                        'job %s has status ACTIVE, purging anyway because '
                        '--force was supplied', job_id)
            else:
                logging.fatal(
                        'refusing to purge job %s because status is ACTIVE '
                        '(override with --force)', job_id)
                sys.exit(1)
        _purge_job(rr, job_id)
    elif args.site:
        site_id = args.site
        site = brozzler.Site.load(rr, site_id)
        if not site:
            logging.fatal('no such site %r', site_id)
            sys.exit(1)
        if site.status == 'ACTIVE':
            if args.force:
                logging.warning(
                        'site %s has status ACTIVE, purging anyway because '
                        '--force was supplied', site_id)
            else:
                logging.fatal(
                        'refusing to purge site %s because status is ACTIVE '
                        '(override with --force)', site_id)
                sys.exit(1)
        _purge_site(rr, site_id)
    elif args.finished_before:
        finished_before = datetime.datetime.strptime(
                args.finished_before, '%Y-%m-%d').replace(
                        tzinfo=doublethink.UTC)
        reql = rr.table('jobs').filter(
                r.row['finished'].default(r.maxval).lt(finished_before).or_(
                    r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(finished_before)))
        logging.debug(
                'retrieving jobs older than %s: %s', finished_before, reql)
        for job in reql.run():
            # logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
            #         job['id'], job.get('finished'),
            #         job.get('starts_and_stops', [{'stop':None}])[-1]['stop'])
            _purge_job(rr, job['id'])
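
A hedged usage sketch, mirroring the argv-list style test_stop_crawl uses
elsewhere in this section to invoke cli entry points directly (assumes a
reachable rethinkdb; the job id is illustrative):

brozzler_purge(['brozzler-purge', '--finished-before', '2017-01-01'])
brozzler_purge(['brozzler-purge', '--job', '123', '--force'])
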
Example #13
def brozzler_purge(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzler-purge - purge crawl state from rethinkdb',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--job',
        dest='job',
        metavar='JOB_ID',
        help=('purge crawl state from rethinkdb for a job, including all '
              'sites and pages'))
    group.add_argument(
        '--site',
        dest='site',
        metavar='SITE_ID',
        help=('purge crawl state from rethinkdb for a site, including all '
              'pages'))
    arg_parser.add_argument(
        '--force',
        dest='force',
        action='store_true',
        help=('purge even if the job or site still has status ACTIVE'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    if args.job:
        try:
            job_id = int(args.job)
        except ValueError:
            job_id = args.job
        job = brozzler.Job.load(rr, job_id)
        if not job:
            logging.fatal('no such job %r', job_id)
            sys.exit(1)
        if job.status == 'ACTIVE':
            if args.force:
                logging.warning(
                    'job %s has status ACTIVE, purging anyway because '
                    '--force was supplied', job_id)
            else:
                logging.fatal(
                    'refusing to purge job %s because status is ACTIVE '
                    '(override with --force)', job_id)
                sys.exit(1)
        _purge_job(rr, job_id)
    elif args.site:
        site_id = args.site
        site = brozzler.Site.load(rr, site_id)
        if not site:
            logging.fatal('no such site %r', site_id)
            sys.exit(1)
        if site.status == 'ACTIVE':
            if args.force:
                logging.warning(
                    'site %s has status ACTIVE, purging anyway because '
                    '--force was supplied', site_id)
            else:
                logging.fatal(
                    'refusing to purge site %s because status is ACTIVE '
                    '(override with --force)', site_id)
                sys.exit(1)
        _purge_site(rr, site_id)
Example #14
def brozzler_worker():
    '''
    Main entry point for brozzler: gets sites and pages to brozzle from
    rethinkdb and brozzles them.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(__file__),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    def sigterm(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
    def sigint(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')

    # do not print in signal handler to avoid RuntimeError: reentrant call
    state_dump_msgs = []
    def queue_state_dump(signum, frame):
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                if threads[ident]:
                    state_strs.append(str(threads[ident]))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            state_dump_msgs.append(
                    'dumping state (caught signal %s)\n%s' % (
                        signum, '\n'.join(state_strs)))
        except BaseException as e:
            state_dump_msgs.append('exception dumping state: %s' % e)
        finally:
            signal.signal(signal.SIGQUIT, queue_state_dump)

    signal.signal(signal.SIGQUIT, queue_state_dump)
    signal.signal(signal.SIGTERM, sigterm)
    signal.signal(signal.SIGINT, sigint)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    service_registry = rethinkstuff.ServiceRegistry(r)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe)

    worker.start()
    try:
        while worker.is_alive():
            while state_dump_msgs:
                logging.warning(state_dump_msgs.pop(0))
            time.sleep(0.5)
        logging.critical('worker thread has died, shutting down')
    except brozzler.ShutdownRequested as e:
        pass
    finally:
        worker.shutdown_now()

    logging.info('brozzler-worker is all done, exiting')
Example #15
def test_parent_url_scoping():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # scope rules that look at parent page url should consider both the
    # original url and the redirect url, if any, of the parent page
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
            },
        'remember_outlinks': True})
    site.save()

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    parent_page_c = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url':'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page_c, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page_c.outlinks['rejected'] == []
    assert parent_page_c.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    parent_page_c = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url':'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page_c, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page_c.outlinks['rejected'] == outlinks
    assert parent_page_c.outlinks['accepted'] == []
Example #16
def test_completed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # redirect that changes scope surt
    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url':'http://example.com/b/', })
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because destination is covered by
    # the original surt
    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url':'http://example.com/a/x/', })
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because page is not the seed page
    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/c/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 1,
        'redirect_url':'http://example.com/d/', })
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False
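
The rule these three cases pin down: when the seed page (hops_from_seed == 0)
redirects somewhere its current scope surt does not already cover, the site's
surt is recomputed from the redirect url. A self-contained sketch of that
decision with a toy surt function (brozzler's real canonicalization is more
involved):

import urllib.parse

def to_surt(url):
    # toy surt: 'http://example.com/a/' -> 'http://(com,example,)/a/'
    p = urllib.parse.urlparse(url)
    rev_host = ','.join(reversed(p.netloc.split('.'))) + ','
    return '%s://(%s)%s' % (p.scheme, rev_host, p.path)

def maybe_update_scope_surt(site, page):
    # only a redirecting seed page can move the site's scope
    if page.hops_from_seed != 0 or not page.redirect_url:
        return
    new_surt = to_surt(page.redirect_url)
    if not new_surt.startswith(site.scope['surt']):
        site.scope['surt'] = new_surt
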
Example #17
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']
Example #18
def test_choose_warcprox():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # avoid this kind of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    rr.table('sites').wait().run()
    rr.table('services').wait().run()
    rr.table('sites').index_wait().run()
    rr.table('services').index_wait().run()

    # clean slate
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host1', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8001,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host3', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host4', 'port': 8000,
        'load': 1, 'ttl': 60}).run()

    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8001', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host3'
    assert instance['port'] == 8000
    rr.table('sites').insert({
        'proxy': 'host3:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host4'
    assert instance['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
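
The assertions above imply a selection rule: among registered warcprox
instances, prefer the one with the fewest ACTIVE sites assigned to it,
breaking ties by the service's reported load. A rough sketch of that logic
over plain dicts (brozzler's actual _choose_warcprox queries rethinkdb):

def choose_warcprox(services, sites):
    warcproxes = [svc for svc in services if svc['role'] == 'warcprox']
    if not warcproxes:
        return None
    def assigned_sites(svc):
        proxy = '%s:%s' % (svc['host'], svc['port'])
        return sum(1 for site in sites
                   if site.get('proxy') == proxy
                   and site.get('status') == 'ACTIVE')
    return min(warcproxes, key=lambda svc: (assigned_sites(svc), svc['load']))
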
Example #19
def brozzler_new_site(argv=None):
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument('--time-limit',
                            dest='time_limit',
                            default=None,
                            help='time limit in seconds for this site')
    arg_parser.add_argument('--ignore-robots',
                            dest='ignore_robots',
                            action='store_true',
                            help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta',
        dest='warcprox_meta',
        help=('Warcprox-Meta http request header to send with each request; '
              'must be a json blob, ignored unless warcprox features are '
              'enabled'))
    arg_parser.add_argument(
        '--behavior-parameters',
        dest='behavior_parameters',
        default=None,
        help=('json blob of parameters to populate the javascript behavior '
              'template, e.g. {"parameter_username":"******",'
              '"parameter_password":"******"}'))
    arg_parser.add_argument(
        '--username',
        dest='username',
        default=None,
        help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
        '--password',
        dest='password',
        default=None,
        help='use this password to try to log in if a login form is found')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    site = brozzler.Site(rr, {
        'seed': args.seed,
        'time_limit': int(args.time_limit) if args.time_limit else None,
        'ignore_robots': args.ignore_robots,
        'warcprox_meta': (
            json.loads(args.warcprox_meta) if args.warcprox_meta else None),
        'behavior_parameters': (
            json.loads(args.behavior_parameters)
            if args.behavior_parameters else None),
        'username': args.username,
        'password': args.password,
    })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
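
A hypothetical invocation, again following the argv-list style used for cli
entry points elsewhere in this section:

brozzler_new_site([
        'brozzler-new-site', '--time-limit=3600', '--ignore-robots',
        'http://example.com/'])
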
Example #20
def test_stop_crawl(httpd):
    test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {
        'seeds': [{
            'url': make_url(httpd, '/infinite/foo/')
        }, {
            'url': make_url(httpd, '/infinite/bar/')
        }, {
            'url': make_url(httpd, '/infinite/baz/')
        }]
    }
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl(
        ['brozzler-stop-crawl',
         '--site=%s' % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == 'ACTIVE'
    sites[2].refresh()
    assert sites[2].status == 'ACTIVE'
    job.refresh()
    assert job.status == 'ACTIVE'

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl(
        ['brozzler-stop-crawl', '--job=%s' % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not job.status.startswith('FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

    # the other sites should also be FINISHED_STOP_REQUESTED
    sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'
    sites[1].refresh()
    assert sites[1].status == 'FINISHED_STOP_REQUESTED'
    sites[2].refresh()
    assert sites[2].status == 'FINISHED_STOP_REQUESTED'
Example #21
def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler: gets sites and pages to brozzle from
    rethinkdb and brozzles them.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument('-e',
                            '--chrome-exe',
                            dest='chrome_exe',
                            default=suggest_default_chrome_exe(),
                            help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '-n',
        '--max-browsers',
        dest='max_browsers',
        default='1',
        help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument('--proxy',
                            dest='proxy',
                            default=None,
                            help='http proxy')
    arg_parser.add_argument(
        '--warcprox-auto',
        dest='warcprox_auto',
        action='store_true',
        help=('when needed, choose an available instance of warcprox from '
              'the rethinkdb service registry'))
    arg_parser.add_argument('--skip-extract-outlinks',
                            dest='skip_extract_outlinks',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-visit-hashtags',
                            dest='skip_visit_hashtags',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-youtube-dl',
                            dest='skip_youtube_dl',
                            action='store_true',
                            help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    def dump_state(signum, frame):
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                if threads[ident]:
                    state_strs.append(str(threads[ident]))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            logging.info('dumping state (caught signal %s)\n%s' %
                         (signum, '\n'.join(state_strs)))
        except BaseException as e:
            logging.error('exception dumping state: %s' % e)
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
        frontier,
        service_registry,
        max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe,
        proxy=args.proxy,
        warcprox_auto=args.warcprox_auto,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')
Example #22
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
            proxy, warcprox_auto, is_warcprox,
            datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=1,
            chrome_exe=brozzler.suggest_default_chrome_exe(),
            warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh() # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/site1/' % httpd.server_port,
            'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        with open(os.path.join(
                os.path.dirname(__file__), 'htdocs', 'site1',
                'file1.txt'), 'rb') as f:
            expected_payload = f.read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}
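The pywb check above leans on the wayback-style 14-digit (YYYYMMDDHHMMSS)
timestamp embedded in the replay URL. A quick sketch of assembling such a URL
(the collection name and port are copied from the test; the capture time and
target URL are made up):

import datetime

capture_time = datetime.datetime(2017, 4, 1, 12, 0, 30)  # hypothetical
t14 = capture_time.strftime('%Y%m%d%H%M%S')              # '20170401120030'
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (
        t14, 'http://example.com/file1.txt')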
Example #23
def test_resume_job():
    '''
    Tests that the right fields get updated in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']

    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 4
    assert job.starts_and_stops[3]['start']
    assert job.starts_and_stops[3]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 4
    assert site.starts_and_stops[3]['start']
    assert site.starts_and_stops[3]['stop'] is None

    # simulate a job stop request
    job_conf = {'seeds': [{'url': 'http://example.com/'}, {'url': 'http://example_2.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 2
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
    job.save()

    # should raise a CrawlStopped
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site1)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    frontier.finished(site2, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert job.stop_requested
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert site2.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 1
    assert len(site2.starts_and_stops) == 1
    assert site1.starts_and_stops[0]['start']
    assert site1.starts_and_stops[0]['stop']
    assert site1.starts_and_stops[0]['stop'] > site1.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['stop']
    assert site2.starts_and_stops[0]['stop'] > site2.starts_and_stops[0]['start']

    # simulate job resume after a stop request
    frontier.resume_job(job)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate a site stop request
    site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
    site1.save()

    # should not raise a CrawlStopped
    frontier.honor_stop_request(site2)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop']
    assert site1.starts_and_stops[1]['stop'] > site1.starts_and_stops[1]['start']
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate site resume after a stop request
    frontier.resume_site(site1)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert site1.stop_requested is None
    assert len(site1.starts_and_stops) == 3
    assert site1.starts_and_stops[2]['start']
    assert site1.starts_and_stops[2]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None
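A rough model of the bookkeeping these assertions exercise: new_job/new_site
open an interval, finished() stamps the stop on the most recent one, and
resume_site/resume_job append a fresh interval. This is a sketch of the data
shape only, using the same field names as the test:

import datetime

import doublethink

def utcnow():
    return datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)

starts_and_stops = [{'start': utcnow(), 'stop': None}]      # new_site/new_job
starts_and_stops[-1]['stop'] = utcnow()                     # finished()
starts_and_stops.append({'start': utcnow(), 'stop': None})  # resume_site/job
assert starts_and_stops[-1]['stop'] is None
assert starts_and_stops[0]['stop'] >= starts_and_stops[0]['start']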
Example #24
def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and wait.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0

    warcprox1 = warcprox.controller.WarcproxController(
            service_registry=svcreg, options=opts)
    warcprox2 = warcprox.controller.WarcproxController(
            service_registry=svcreg, options=opts)
    warcprox1_thread = threading.Thread(
            target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
            target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warning('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while (len(list(frontier.site_pages(site.id))) <= 1
               and time.time() - start < 60):
            time.sleep(0.5)
            site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while ((not site.proxy
                or not site.proxy.endswith(':%s' % warcprox2.proxy.server_port))
               and time.time() - start < 30):
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while not site.status.startswith(
                'FINISHED') and time.time() - start < 120:
            time.sleep(0.5)
            site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service('warcprox')
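The poll-sleep-refresh loops above all share one shape: check a condition,
sleep briefly, give up after a timeout. A hypothetical helper capturing that
pattern (not part of brozzler, shown only to make the timeouts explicit):

import time

def wait_for(condition, timeout=30, poll_interval=0.5):
    # call condition() until it returns something truthy or timeout seconds
    # elapse; return the truthy value, or None if we timed out
    start = time.time()
    while time.time() - start < timeout:
        result = condition()
        if result:
            return result
        time.sleep(poll_interval)
    return None

# e.g. wait_for(lambda: site.refresh() or site.proxy, timeout=30)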
Example #25
def test_time_limit():
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')

    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier.enforce_time_limit(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.active_brozzling_time = 0.2  # this is why the time limit will be hit

    try:
        frontier.enforce_time_limit(site)
    except brozzler.ReachedTimeLimit:
        frontier.finished(site, 'FINISHED_TIME_LIMIT')

    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']
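Per the XXX note at the top of this test, the limit is enforced against
site.active_brozzling_time rather than starts_and_stops. A sketch of the
check the assertions drive (my inference from this test, not brozzler's
actual implementation):

import brozzler

def enforce_time_limit_sketch(site):
    # raise once accumulated brozzling time exceeds the configured limit
    if site.time_limit and (site.active_brozzling_time or 0) > site.time_limit:
        raise brozzler.ReachedTimeLimit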