Code example #1
def test_start_stop_backwards_compat():
    site = brozzler.Site(None, {'seed': 'http://example.com/'})
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None
    assert not 'start_time' in site

    site = brozzler.Site(None, {
        'seed': 'http://example.com/',
        'start_time': datetime.datetime(2017,1,1)})
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
    assert site.starts_and_stops[0]['stop'] is None
    assert not 'start_time' in site

    job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]})
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert not 'started' in job
    assert not 'finished' in job

    job = brozzler.Job(None, {
        'seeds': [{'url':'https://example.com/'}],
        'started': datetime.datetime(2017, 1, 1),
        'finished': datetime.datetime(2017, 1, 2)})
    assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
    assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
    assert not 'started' in job
    assert not 'finished' in job
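
The assertions above imply that Site and Job rewrite the legacy 'start_time' and 'started'/'finished' fields into the newer starts_and_stops list and drop the old keys. A minimal sketch of the implied conversion (an illustration, not brozzler's actual code):

def _convert_legacy_fields(doc):
    # hypothetical helper, illustration only: fold the legacy fields
    # into the starts_and_stops structure the assertions above expect
    if 'start_time' in doc:  # legacy Site field
        doc['starts_and_stops'] = [
            {'start': doc.pop('start_time'), 'stop': None}]
    elif 'started' in doc:  # legacy Job fields
        doc['starts_and_stops'] = [
            {'start': doc.pop('started'), 'stop': doc.pop('finished', None)}]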
Code example #2
File: test_units.py Project: yushu-liu/brozzler
def test_seed_redirect():
    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
    site.note_seed_redirect('https://foo.com/a/b/c')
    assert site.scope == {
        'accepts': [{
            'ssurt': 'com,foo,//http:/',
        }, {
            'ssurt': 'com,foo,//https:/',
        }]
    }

    site = brozzler.Site(None, {'seed': 'https://foo.com/'})
    site.note_seed_redirect('http://foo.com/a/b/c')
    assert site.scope == {
        'accepts': [{
            'ssurt': 'com,foo,//https:/',
        }, {
            'ssurt': 'com,foo,//http:/',
        }]
    }

    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
    site.note_seed_redirect('https://bar.com/a/b/c')
    assert site.scope == {
        'accepts': [{
            'ssurt': 'com,foo,//http:/',
        }, {
            'ssurt': 'com,bar,//https:/a/b/c',
        }]
    }
Code example #3
File: test_frontier.py Project: n0ncetonic/brozzler
def test_hashtag_seed():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]
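
The behavior this test checks, stripping '#hash' from the stored page url while remembering it separately, can be reproduced with the standard library; a small illustration, not brozzler's implementation:

import urllib.parse

url = 'http://example.org/#hash'
split = urllib.parse.urlsplit(url)
assert urllib.parse.urlunsplit(split._replace(fragment='')) == 'http://example.org/'
assert '#' + split.fragment == '#hash'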
Code example #4
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
    assert brozzler.is_permitted_by_robots(site, url)

    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
    assert not brozzler.is_permitted_by_robots(site, url)
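
The docstring calls this substring matching: a user-agent group in robots.txt applies when its token occurs anywhere in the crawler's user agent, case-insensitively. Assuming the test fixture serves a robots.txt with a 'badbot' group (e.g. 'User-agent: badbot' followed by 'Disallow: /'), the heuristic reduces to the following illustration, which is not brozzler's code:

def ua_group_matches(token, user_agent):
    # robots.txt user-agent tokens match by case-insensitive substring
    return token.lower() in user_agent.lower()

assert ua_group_matches('badbot', 'im/a bAdBOt/uh huh')
assert not ua_group_matches('badbot', 'im/a/GoOdbot/yep')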
Code example #5
def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        'http://localhost:%s/site7/foo.html' % httpd.server_port
    ]
    assert not pages[0].hashtags
    assert pages[1].url == (
        'http://localhost:%s/site7/foo.html' % httpd.server_port)
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        '#boosh',
        '#ignored',
        '#whee',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
        c['url']: c
        for c in captures if c['http_method'] != 'HEAD'
    }
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
Code example #6
File: test_brozzling.py Project: ursafoot/brozzler
def test_page_videos(httpd):
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {'url': 'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    assert page.videos[1] == {
        'blame': 'browser',
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }
Code example #7
def test_seed_redirect(httpd):
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
Code example #8
def new_job(frontier, job_conf):
    '''Returns new Job.'''
    validate_conf(job_conf)
    job = Job(frontier.rr, {
                "conf": job_conf, "status": "ACTIVE",
                "started": doublethink.utcnow()})
    if "id" in job_conf:
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    job.save()

    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        merged_conf.pop("seeds")
        merged_conf["job_id"] = job.id
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        sites.append(site)

    for site in sites:
        new_site(frontier, site)

    return job
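
For reference, a job_conf exercising the optional fields handled above might look like the sketch below; 'seeds' is clearly required, 'id' and 'max_claimed_sites' are optional, and merge() presumably lets per-seed settings sit alongside job-level ones ('time_limit' on a seed is an assumed example of such a setting):

import brozzler
import doublethink

rr = doublethink.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(rr)
job = new_job(frontier, {
    'id': 'my-crawl',              # optional, otherwise auto-assigned
    'max_claimed_sites': 2,        # optional per-job cap
    'seeds': [
        {'url': 'http://example.com/'},
        {'url': 'http://example.org/', 'time_limit': 3600},  # assumed per-seed setting
    ],
})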
Code example #9
File: test_units.py Project: yushu-liu/brozzler
def test_robots_http_statuses():
    for status in (200, 204, 400, 401, 402, 403, 404, 405, 500, 501, 502, 503,
                   504, 505):

        class Handler(http.server.BaseHTTPRequestHandler):
            def do_GET(self):
                response = (('HTTP/1.1 %s Meaningless message\r\n' +
                             'Content-length: 0\r\n' + '\r\n') %
                            status).encode('utf-8')
                self.connection.sendall(response)
                # self.send_response(status)
                # self.end_headers()

        httpd = http.server.HTTPServer(('localhost', 0), Handler)
        httpd_thread = threading.Thread(name='httpd',
                                        target=httpd.serve_forever)
        httpd_thread.start()

        try:
            url = 'http://localhost:%s/' % httpd.server_port
            site = brozzler.Site(None, {'seed': url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
Code example #10
File: test_frontier.py Project: Chunde/brozzler
def test_seed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()

    assert frontier.seed_page(site.id) is None

    page1 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1
    })
    page1.save()

    assert frontier.seed_page(site.id) is None

    page0 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0
    })
    page0.save()

    assert frontier.seed_page(site.id) == page0
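
The assertions imply that seed_page() returns the site's page with hops_from_seed == 0, or None when no such page exists yet. A plain-Python paraphrase over site_pages() (illustration only, not the actual query):

def seed_page(frontier, site_id):
    # linear-scan equivalent of frontier.seed_page(site_id)
    for page in frontier.site_pages(site_id):
        if page.hops_from_seed == 0:
            return page
    return None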
Code example #11
def new_job(frontier, job_conf):
    job = Job(id=job_conf.get("id"),
              conf=job_conf,
              status="ACTIVE",
              started=rethinkstuff.utcnow())

    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        # XXX check for unknown settings, invalid url, etc

        site = brozzler.Site(
            job_id=job.id,
            seed=merged_conf["url"],
            scope=merged_conf.get("scope"),
            time_limit=merged_conf.get("time_limit"),
            proxy=merged_conf.get("proxy"),
            ignore_robots=merged_conf.get("ignore_robots"),
            enable_warcprox_features=merged_conf.get(
                "enable_warcprox_features"),
            warcprox_meta=merged_conf.get("warcprox_meta"),
            metadata=merged_conf.get("metadata"),
            remember_outlinks=merged_conf.get("remember_outlinks"))
        sites.append(site)

    # insert all the sites into database before the job
    for site in sites:
        new_site(frontier, site)

    frontier.new_job(job)
Code example #12
def new_job(frontier, job_conf):
    '''Returns new Job.'''
    validate_conf(job_conf)
    job = Job(frontier.rr, {
                "conf": job_conf, "status": "ACTIVE",
                "started": doublethink.utcnow()})
    if "id" in job_conf:
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    job.save()

    sites = []
    pages = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        merged_conf.pop("seeds")
        merged_conf["job_id"] = job.id
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        site.id = str(uuid.uuid4())
        sites.append(site)
        pages.append(new_seed_page(frontier, site))

    # insert in batches to avoid this error
    # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
    for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
        logging.info('inserting batch of %s pages', len(batch))
        result = frontier.rr.table('pages').insert(batch).run()
    for batch in (sites[i:i+100] for i in range(0, len(sites), 100)):
        logging.info('inserting batch of %s sites', len(batch))
        result = frontier.rr.table('sites').insert(batch).run()
    logging.info('job %s fully started', job.id)

    return job
Code example #13
def test_robots_socket_timeout():
    stop_hanging = threading.Event()
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            stop_hanging.wait(60)
            self.connection.sendall(
                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')

    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout

    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        brozzler.robots._SessionRaiseOn420.timeout = 2
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
        stop_hanging.set()
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
Code example #14
def test_obey_robots(httpd):
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(
        rr,
        {
            'seed': make_url(httpd, '/site1/'),
            'user_agent': 'im a badbot',  # robots.txt blocks badbot
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0].url == site.seed
        assert site_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that only the one page is in rethinkdb
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert page.url == make_url(httpd, '/site1/')
    assert page.blocked_by_robots

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, '/robots.txt')
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(
        os.path.join(os.path.dirname(__file__), 'htdocs', 'robots.txt'),
        'rb').read()
    assert requests.get(wb_url,
                        allow_redirects=False).content == expected_payload
Code example #15
File: frontier.py Project: n0ncetonic/brozzler
    def claim_sites(self, n=1):
        result = (
            self.rr.table('sites').get_all(
                r.args(
                    r.db(self.rr.dbname).table(
                        'sites', read_mode='majority').between(
                            ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
                            index='sites_last_disclaimed').order_by(
                                r.desc('claimed'), 'last_disclaimed').
                    fold({},
                         lambda acc, site: acc.merge(
                             r.branch(
                                 site.has_fields('job_id'),
                                 r.object(
                                     site['job_id'].coerce_to('string'), acc[
                                         site['job_id'].coerce_to('string')].
                                     default(0).add(1)), {})),
                         emit=lambda acc, site, new_acc: r.branch(
                             r.and_(
                                 r.or_(
                                     site['claimed'].not_(), site[
                                         'last_claimed'].lt(r.now().sub(60 * 60
                                                                        ))),
                                 r.or_(
                                     site.has_fields('max_claimed_sites').not_(
                                     ), new_acc[site['job_id'].coerce_to(
                                         'string')].le(site['max_claimed_sites'
                                                            ]))), [site['id']],
                             [])).limit(n))).
            update(
                # try to avoid a race condition resulting in multiple
                # brozzler-workers claiming the same site
                # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                r.branch(
                    r.or_(r.row['claimed'].not_(),
                          r.row['last_claimed'].lt(r.now().sub(60 * 60))), {
                              'claimed': True,
                              'last_claimed': r.now()
                          }, {}),
                return_changes=True)).run()

        self._vet_result(result,
                         replaced=list(range(n + 1)),
                         unchanged=list(range(n + 1)))
        sites = []
        for i in range(result["replaced"]):
            if result["changes"][i]["old_val"]["claimed"]:
                self.logger.warn(
                    "re-claimed site that was still marked 'claimed' "
                    "because it was last claimed a long time ago "
                    "at %s, and presumably some error stopped it from "
                    "being disclaimed",
                    result["changes"][i]["old_val"]["last_claimed"])
            site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
            sites.append(site)
        if sites:
            return sites
        else:
            raise brozzler.NothingToClaim
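
The ReQL above is dense; in plain Python, the conditions it folds over amount to roughly the following (a reading aid, not the actual implementation):

import datetime

def claimable(site, claimed_in_job_so_far):
    # a site is claimable if it is unclaimed, or its claim is over an
    # hour stale (presumably a worker died without disclaiming it)
    an_hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
    free = not site['claimed'] or site['last_claimed'] < an_hour_ago
    # and, when max_claimed_sites is set, claiming it must not push the
    # job's running count of claimed sites past that cap
    under_cap = ('max_claimed_sites' not in site
                 or claimed_in_job_so_far <= site['max_claimed_sites'])
    return free and under_cap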
Code example #16
    def site(self, id):
        if id is None:
            return None
        result = self.r.table("sites").get(id).run()
        if result:
            return brozzler.Site(**result)
        else:
            return None
Code example #17
def test_login(httpd):
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(
        rr, {
            'seed': 'http://localhost:%s/site2/' % httpd.server_port,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            },
            'username': '******',
            'password': '******'
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(
        rr.table('captures').filter({
            'test_id': test_id
        }).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' %
            httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' %
            httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' %
            httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' %
            httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' %
            httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
Code example #18
def test_scoping():
    test_scope = yaml.load('''
max_hops: 100
accepts:
- url_match: REGEX_MATCH
  value: ^.*/audio_file/.*\.mp3$
- url_match: SURT_MATCH
  value: http://(com,vimeocdn,
- url_match: STRING_MATCH
  value: ec-media.soundcloud.com
- regex: ^https?://twitter\.com.*$
- substring: facebook.com
- regex: ^https?://(www.)?youtube.com/watch?.*$
  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
blocks:
- domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
''')

    site = brozzler.Site(
        None, {
            'id': 1,
            'seed': 'http://example.com/foo/bar?baz=quux#monkey',
            'scope': test_scope
        })
    page = brozzler.Page(None, {
        'url': 'http://example.com/foo/bar?baz=quux#monkey',
        'site_id': site.id
    })

    assert site.is_in_scope('http://example.com/foo/bar', page)
    assert not site.is_in_scope('http://example.com/foo/baz', page)

    assert not site.is_in_scope('http://foo.com/some.mp3', page)
    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)

    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)

    assert site.is_in_scope('https://twitter.com/twit', page)
    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)

    assert site.is_in_scope('https://www.facebook.com/whatevz', page)

    assert not site.is_in_scope('https://www.youtube.com/watch?v=dUIn5OAPS5s',
                                page)
    yt_user_page = brozzler.Page(
        None, {
            'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
            'site_id': site.id,
            'hops_from_seed': 10
        })
    assert site.is_in_scope('https://www.youtube.com/watch?v=dUIn5OAPS5s',
                            yt_user_page)
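
Collecting the rule forms this test exercises: accepts/blocks entries may use an explicit url_match type (REGEX_MATCH, SURT_MATCH, STRING_MATCH), the regex/substring shorthands, a parent_url_regex condition, and a domain restriction on blocks. A compact sketch of a scope assembled the same way (values are illustrative only):

import brozzler

scope = {
    'max_hops': 10,
    'accepts': [
        {'url_match': 'STRING_MATCH', 'value': 'example.com'},
        {'regex': r'^https?://example\.com/.*\.mp3$'},   # shorthand form
        {'substring': 'example.org'},
    ],
    'blocks': [
        {'domain': 'example.com', 'url_match': 'REGEX_MATCH',
         'value': r'^.*[?&]lang=(?!en).*$'},
    ],
}
site = brozzler.Site(None, {'seed': 'http://example.com/', 'scope': scope})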
Code example #19
File: cli.py Project: vishalbelsare/brozzler
def brozzler_new_site(argv=None):
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='brozzler-new-site - register site to brozzle',
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    arg_parser.add_argument(
            '--behavior-parameters', dest='behavior_parameters',
            default=None, help=(
                'json blob of parameters to populate the javascript behavior '
                'template, e.g. {"parameter_username":"******",'
                '"parameter_password":"******"}'))
    arg_parser.add_argument(
            '--username', dest='username', default=None,
            help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
            '--password', dest='password', default=None,
            help='use this password to try to log in if a login form is found')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    site = brozzler.Site(rr, {
        'seed': args.seed,
        'time_limit': int(args.time_limit) if args.time_limit else None,
        'ignore_robots': args.ignore_robots,
        'warcprox_meta': json.loads(
            args.warcprox_meta) if args.warcprox_meta else None,
        'behavior_parameters': json.loads(
            args.behavior_parameters) if args.behavior_parameters else None,
        'username': args.username,
        'password': args.password})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
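
Because the entry point takes an argv parameter, it can also be driven directly from Python; for example, limited to the flags defined above (rethinkdb options left at their defaults):

brozzler_new_site(argv=[
    'brozzler-new-site',
    '--time-limit', '3600',
    '--ignore-robots',
    'http://example.com/',
])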
Code example #20
File: cli.py Project: Cloudxtreme/brozzler
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None,
            help='http proxy')
    arg_parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
Code example #21
File: test_frontier.py Project: wolfgang42/brozzler
def test_time_limit():
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')

    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    time.sleep(0.1)

    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)
Code example #22
def test_redirect_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site9/')
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        make_url(httpd, '/site9/redirect.html')
    ]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site9/redirect.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        '#hash1',
        '#hash2',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    redirect_captures = [
        c for c in captures
        if c['url'] == make_url(httpd, '/site9/redirect.html')
        and c['http_method'] == 'GET'
    ]
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags
Code example #23
def test_time_limit():
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')

    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier._enforce_time_limit(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    time.sleep(0.1)
    frontier._enforce_time_limit(site)

    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
Code example #24
def test_extract_outlinks(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {'url': 'http://localhost:%s/site8/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    assert outlinks == {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
    }
Code example #25
def test_ydl_stitching(httpd):
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr, {
            'seed': make_url(httpd, '/site10/'),
            'warcprox_meta': {
                'warc-prefix': 'test_ydl_stitching',
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    while time.time() - start < 600 and not page.videos:
        time.sleep(0.5)
        page.refresh()
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    } in page.videos

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    l = [c for c in captures if c['url'] == stitched_url]
    assert len(l) == 1
    c = l[0]
    assert c['filename'].startswith('test_ydl_stitching')
    assert c['content_type'] == 'video/mp4'
    assert c['http_method'] == 'WARCPROX_WRITE_RECORD'
Code example #26
File: test_cluster.py Project: mouse-reeve/brozzler
def test_brozzle_site(httpd):
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
            seed='http://localhost:%s/' % httpd.server_port,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/' % httpd.server_port,
            'http://localhost:%s/file1.txt' % httpd.server_port }

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
Code example #27
def test_proxy_down():
    '''
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

    This test needs to cover every possible fetch through the proxy other than
    fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in ('127.0.0.1:4',
                                '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(frontier=None,
                                         proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()),
            'seed': 'http://example.com/'
        })
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(site,
                                            'http://example.com/',
                                            proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = worker._youtube_dl(tempdir, site)
            with pytest.raises(brozzler.ProxyError):
                worker._try_youtube_dl(ydl, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                warcprox_address=not_listening_proxy,
                url='test://proxy_down/warcprox_write_record',
                warc_type='metadata',
                content_type='text/plain',
                payload=b'''payload doesn't matter here''')
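
One way calling code might handle the error this test pins down, sketched with the same calls the test uses (the handling policy itself is an assumption, not something shown here):

import logging
import brozzler

site = brozzler.Site(None, {'seed': 'http://example.com/'})
proxy = '127.0.0.1:4'  # nothing listens here, as in the test above
try:
    brozzler.is_permitted_by_robots(site, 'http://example.com/', proxy=proxy)
except brozzler.ProxyError as e:
    logging.warning('proxy trouble fetching robots.txt, leaving page '
                    'unbrozzled for now: %s', e)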
Code example #28
def test_robots_empty_response():
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            self.connection.shutdown(socket.SHUT_RDWR)
            self.connection.close()
    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
Code example #29
    def claim_site(self, worker_id):
        # XXX keep track of aggregate priority and prioritize sites accordingly?
        while True:
            result = (
                    self.r.table("sites", read_mode="majority")
                    .between(
                        ["ACTIVE", rethinkdb.minval],
                        ["ACTIVE", rethinkdb.maxval],
                        index="sites_last_disclaimed")
                    .order_by(index="sites_last_disclaimed")
                    .filter(
                        (rethinkdb.row["claimed"] != True) |
                        (rethinkdb.row["last_claimed"]
                            < rethinkdb.now() - 2*60*60))
                    .limit(1)
                    .update(
                        # try to avoid a race condition resulting in multiple
                        # brozzler-workers claiming the same site
                        # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                        rethinkdb.branch(
                            (rethinkdb.row["claimed"] != True) |
                            (rethinkdb.row["last_claimed"]
                                < rethinkdb.now() - 2*60*60), {
                                    "claimed": True,
                                    "last_claimed_by": worker_id,
                                    "last_claimed": rethinkstuff.utcnow()
                                }, {}), return_changes=True)).run()
            self._vet_result(result, replaced=[0, 1], unchanged=[0, 1])
            if result["replaced"] == 1:
                if result["changes"][0]["old_val"]["claimed"]:
                    self.logger.warn(
                            "re-claimed site that was still marked 'claimed' "
                            "because it was last claimed a long time ago "
                            "at %s, and presumably some error stopped it from "
                            "being disclaimed",
                            result["changes"][0]["old_val"]["last_claimed"])
                site = brozzler.Site(**result["changes"][0]["new_val"])
            else:
                raise brozzler.NothingToClaim
            # XXX This is the only place we enforce time limit for now. Worker
            # loop should probably check time limit. Maybe frontier needs a
            # housekeeping thread to ensure that time limits get enforced in a
            # timely fashion.
            if not self._enforce_time_limit(site):
                return site
Code example #30
File: cli.py Project: Cloudxtreme/brozzler
def brozzler_new_site():
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-site - register site to brozzle',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(arg_parser)
    _add_proxy_options(arg_parser)
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            seed=args.seed, proxy=args.proxy,
            time_limit=int(args.time_limit) if args.time_limit else None,
            ignore_robots=args.ignore_robots,
            enable_warcprox_features=args.enable_warcprox_features,
            warcprox_meta=(
                json.loads(args.warcprox_meta) if args.warcprox_meta else None))

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)