def test_seed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()

    assert frontier.seed_page(site.id) is None

    page1 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1})
    page1.save()

    assert frontier.seed_page(site.id) is None

    page0 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0})
    page0.save()

    assert frontier.seed_page(site.id) == page0

def test_scoping():
    test_scope = yaml.load('''
max_hops: 100
accepts:
- url_match: REGEX_MATCH
  value: ^.*/audio_file/.*\.mp3$
- url_match: SURT_MATCH
  value: http://(com,vimeocdn,
- url_match: STRING_MATCH
  value: ec-media.soundcloud.com
- regex: ^https?://twitter\.com.*$
- substring: facebook.com
- regex: ^https?://(www.)?youtube.com/watch?.*$
  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
blocks:
- domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
''')

    site = brozzler.Site(None, {
        'id': 1,
        'seed': 'http://example.com/foo/bar?baz=quux#monkey',
        'scope': test_scope})
    page = brozzler.Page(None, {
        'url': 'http://example.com/foo/bar?baz=quux#monkey',
        'site_id': site.id})

    assert site.is_in_scope('http://example.com/foo/bar', page)
    assert not site.is_in_scope('http://example.com/foo/baz', page)

    assert not site.is_in_scope('http://foo.com/some.mp3', page)
    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)

    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)

    assert site.is_in_scope('https://twitter.com/twit', page)
    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)

    assert site.is_in_scope('https://www.facebook.com/whatevz', page)

    assert not site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)
    yt_user_page = brozzler.Page(None, {
        'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
        'site_id': site.id, 'hops_from_seed': 10})
    assert site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)

def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    # insert the Page into the database before the Site, to avoid situation
    # where a brozzler worker immediately claims the site, finds no pages
    # to crawl, and decides the site is finished
    try:
        url = urlcanon.parse_url(site.seed)
        hashtag = (url.hash_sign + url.fragment).decode("utf-8")
        urlcanon.canon.remove_fragment(url)
        page = brozzler.Page(frontier.rr, {
            "url": str(url), "site_id": site.get("id"),
            "job_id": site.get("job_id"), "hops_from_seed": 0,
            "priority": 1000, "needs_robots_check": True})
        if hashtag:
            page.hashtags = [hashtag,]
        page.save()
        logging.info("queued page %s", page)
    finally:
        # finally block because we want to insert the Site no matter what
        site.save()

def _build_fresh_pages(self, site, parent_page, urls):
    '''
    Returns a dict of page_id => brozzler.Page.
    '''
    pages = {}
    for url in urls:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        hashtag = (url_for_crawling.hash_sign
                   + url_for_crawling.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(url_for_crawling)
        if not url_for_scoping.surt().startswith(
                site.scope['surt'].encode('utf-8')):
            hops_off_surt = parent_page.hops_off_surt + 1
        else:
            hops_off_surt = 0
        page = brozzler.Page(self.rr, {
            'url': str(url_for_crawling),
            'site_id': site.id,
            'job_id': site.job_id,
            'hops_from_seed': parent_page.hops_from_seed + 1,
            'via_page_id': parent_page.id,
            'hops_off_surt': hops_off_surt,
            'hashtags': []})
        if page.id in pages:
            pages[page.id].priority += page.priority
            page = pages[page.id]
        else:
            pages[page.id] = page
        if hashtag:
            page.hashtags = list(set(page.hashtags + [hashtag]))
    return pages

def test_needs_browsing():
    # only one test case here right now, which exposed a bug

    class ConvenientHeaders(http.client.HTTPMessage):
        def __init__(self, headers):
            http.client.HTTPMessage.__init__(self)
            for (k, v) in headers.items():
                self.add_header(k, v)

    page = brozzler.Page(None, {
        'url': 'http://example.com/a'})

    spy = brozzler.ydl.YoutubeDLSpy()
    spy.fetches.append({
        'url': 'http://example.com/a',
        'method': 'HEAD',
        'response_code': 301,
        'response_headers': ConvenientHeaders({'Location': '/b'})})
    spy.fetches.append({
        'url': 'http://example.com/b',
        'method': 'GET',
        'response_code': 200,
        'response_headers': ConvenientHeaders({
            'Content-Type': 'application/pdf'})})

    assert not brozzler.worker.BrozzlerWorker._needs_browsing(
            None, page, spy.fetches)

def _build_fresh_page(self, site, parent_page, url, hops_off=0):
    url_for_scoping = urlcanon.semantic(url)
    url_for_crawling = urlcanon.whatwg(url)
    hashtag = (url_for_crawling.hash_sign
               + url_for_crawling.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(url_for_crawling)
    page = brozzler.Page(self.rr, {
        'url': str(url_for_crawling),
        'site_id': site.id,
        'job_id': site.job_id,
        'hops_from_seed': parent_page.hops_from_seed + 1,
        'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L",
        'via_page_id': parent_page.id,
        'via_page_url': parent_page.url,
        'hops_off_surt': hops_off,
        'hashtags': [hashtag] if hashtag else []})
    return page

def test_page_videos(httpd):
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {
        'url': 'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    assert page.videos[1] == {
        'blame': 'browser',
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }

def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
    decisions = {'accepted': set(), 'blocked': set(), 'rejected': set()}
    counts = {'added': 0, 'updated': 0, 'rejected': 0, 'blocked': 0}

    in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
            site, parent_page, outlinks)
    decisions['blocked'] = blocked
    decisions['rejected'] = out_of_scope
    counts['blocked'] += len(blocked)
    counts['rejected'] += len(out_of_scope)

    fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)

    # get existing pages from rethinkdb
    results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
    pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}

    # build list of pages to save, consisting of new pages, and existing
    # pages updated with higher priority and new hashtags
    for fresh_page in fresh_pages.values():
        decisions['accepted'].add(fresh_page.url)
        if fresh_page.id in pages:
            page = pages[fresh_page.id]
            page.hashtags = list(set(
                (page.hashtags or []) + fresh_page.hashtags))
            page.priority += fresh_page.priority
            counts['updated'] += 1
        else:
            pages[fresh_page.id] = fresh_page
            counts['added'] += 1

    # insert/replace in batches of 50 to try to avoid this error:
    # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
    # there can be many pages and each one can be very large (many videos,
    # in and out of scope links, etc)
    l = list(pages.values())
    for batch in (l[i:i+50] for i in range(0, len(l), 50)):
        try:
            self.logger.debug(
                    'inserting/replacing batch of %s pages', len(batch))
            result = self.rr.table('pages').insert(
                    batch, conflict='replace').run()
        except Exception as e:
            self.logger.error(
                    'problem inserting/replacing batch of %s pages',
                    len(batch), exc_info=True)

    parent_page.outlinks = {}
    for k in decisions:
        parent_page.outlinks[k] = list(decisions[k])
    parent_page.save()

    self.logger.info(
            '%s new links added, %s existing links updated, %s links '
            'rejected, %s links blocked by robots from %s',
            counts['added'], counts['updated'], counts['rejected'],
            counts['blocked'], parent_page)

def site_pages(self, site_id, unbrozzled_only=False):
    results = self.r.table("pages").between(
            [site_id, 0 if unbrozzled_only else self.r.minval,
                self.r.minval, self.r.minval],
            [site_id, 0 if unbrozzled_only else self.r.maxval,
                self.r.maxval, self.r.maxval],
            index="priority_by_site").run()
    for result in results:
        yield brozzler.Page(**result)

def new_seed_page(frontier, site):
    url = urlcanon.parse_url(site.seed)
    hashtag = (url.hash_sign + url.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(url)
    page = brozzler.Page(frontier.rr, {
        "url": str(url), "site_id": site.get("id"),
        "job_id": site.get("job_id"), "hops_from_seed": 0,
        "priority": 1000, "needs_robots_check": True})
    if hashtag:
        page.hashtags = [hashtag,]
    return page

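# Illustrative usage sketch (not part of the original source): unlike
# new_site() above, new_seed_page() builds the seed page without persisting
# it, so a caller that wants it in the frontier saves it explicitly, e.g.:
#
#     page = new_seed_page(frontier, site)
#     page.save()
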
def seed_page(self, site_id):
    results = self.rr.table("pages").between(
            [site_id, r.minval, r.minval, r.minval],
            [site_id, r.maxval, r.maxval, r.maxval],
            index="priority_by_site").filter({"hops_from_seed": 0}).run()
    pages = list(results)
    if len(pages) > 1:
        self.logger.warn(
                "more than one seed page for site_id %s ?", site_id)
    if len(pages) < 1:
        return None
    return brozzler.Page(self.rr, pages[0])

def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page. Opens url
    in a browser, runs some javascript behaviors, and prints outlinks.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None,
            help='http proxy')
    arg_parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        OK_CHARS = string.ascii_letters + string.digits
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()

def test_extract_outlinks(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {
        'url': 'http://localhost:%s/site8/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    assert outlinks == {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
    }

def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
    if site.remember_outlinks:
        parent_page.outlinks = {"accepted": [], "blocked": [], "rejected": []}
    counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
    for url in outlinks or []:
        u = brozzler.site.Url(url)
        if site.is_in_scope(u, parent_page=parent_page):
            if brozzler.is_permitted_by_robots(site, url):
                if not u.surt.startswith(site.scope["surt"]):
                    hops_off_surt = parent_page.hops_off_surt + 1
                else:
                    hops_off_surt = 0
                new_child_page = brozzler.Page(
                        url, site_id=site.id, job_id=site.job_id,
                        hops_from_seed=parent_page.hops_from_seed + 1,
                        via_page_id=parent_page.id,
                        hops_off_surt=hops_off_surt)
                existing_child_page = self.page(new_child_page.id)
                if existing_child_page:
                    existing_child_page.priority += new_child_page.priority
                    self.update_page(existing_child_page)
                    counts["updated"] += 1
                else:
                    self.new_page(new_child_page)
                    counts["added"] += 1
                if site.remember_outlinks:
                    parent_page.outlinks["accepted"].append(url)
            else:
                counts["blocked"] += 1
                if site.remember_outlinks:
                    parent_page.outlinks["blocked"].append(url)
        else:
            counts["rejected"] += 1
            if site.remember_outlinks:
                parent_page.outlinks["rejected"].append(url)
    if site.remember_outlinks:
        self.update_page(parent_page)
    self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s",
            counts["added"], counts["updated"], counts["rejected"],
            counts["blocked"], parent_page)

def test_proxy_down():
    '''
    Test that all fetching scenarios raise `brozzler.ProxyError` when the
    proxy is down.

    This test needs to cover every possible fetch through the proxy other
    than fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in (
            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(
                    site, 'http://example.com/', proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = worker._youtube_dl(tempdir, site)
            with pytest.raises(brozzler.ProxyError):
                worker._try_youtube_dl(ydl, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                    warcprox_address=not_listening_proxy,
                    url='test://proxy_down/warcprox_write_record',
                    warc_type='metadata',
                    content_type='text/plain',
                    payload=b'''payload doesn't matter here''')

def site_pages(self, site_id, brozzled=None):
    '''
    Args:
        site_id (str or int):
        brozzled (bool): if true, results include only pages that have
            been brozzled at least once; if false, only pages that have
            not been brozzled; and if None (the default), all pages

    Returns:
        iterator of brozzler.Page
    '''
    results = self.rr.table("pages").between(
            [site_id, 1 if brozzled is True else 0, r.minval, r.minval],
            [site_id, 0 if brozzled is False else r.maxval,
                r.maxval, r.maxval],
            index="priority_by_site").run()
    for result in results:
        yield brozzler.Page(self.rr, result)

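# Illustrative usage sketch (not part of the original source): iterating the
# pages of a site with the brozzled filter documented above. Assumes a
# reachable RethinkDB and an existing site id; the db name is an example.
#
#     import doublethink, brozzler
#     rr = doublethink.Rethinker('localhost', db='brozzler')
#     frontier = brozzler.RethinkDbFrontier(rr)
#     for page in frontier.site_pages(site.id, brozzled=False):
#         print(page.url)  # pages not yet brozzled
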
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
    decisions = {'accepted': set(), 'blocked': set(), 'rejected': set()}
    counts = {'added': 0, 'updated': 0, 'rejected': 0, 'blocked': 0}

    in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
            site, parent_page, outlinks)
    decisions['blocked'] = blocked
    decisions['rejected'] = out_of_scope
    counts['blocked'] += len(blocked)
    counts['rejected'] += len(out_of_scope)

    fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)

    # get existing pages from rethinkdb
    results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
    pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}

    # build list of pages to save, consisting of new pages, and existing
    # pages updated with higher priority and new hashtags
    for fresh_page in fresh_pages.values():
        decisions['accepted'].add(fresh_page.url)
        if fresh_page.id in pages:
            page = pages[fresh_page.id]
            page.hashtags = list(set(
                (page.hashtags or []) + fresh_page.hashtags))
            page.priority += fresh_page.priority
            counts['updated'] += 1
        else:
            pages[fresh_page.id] = fresh_page
            counts['added'] += 1

    result = self.rr.table('pages').insert(
            pages.values(), conflict='replace').run()

    parent_page.outlinks = {}
    for k in decisions:
        parent_page.outlinks[k] = list(decisions[k])
    parent_page.save()

    self.logger.info(
            '%s new links added, %s existing links updated, %s links '
            'rejected, %s links blocked by robots from %s',
            counts['added'], counts['updated'], counts['rejected'],
            counts['blocked'], parent_page)

def claim_page(self, site, worker_id):
    # ignores the "claimed" field of the page, because only one
    # brozzler-worker can be working on a site at a time, and that would
    # have to be the worker calling this method, so if something is claimed
    # already, it must have been left that way because of some error
    result = self.rr.table("pages").between(
            [site.id, 0, r.minval, r.minval],
            [site.id, 0, r.maxval, r.maxval],
            index="priority_by_site").order_by(
                    index=r.desc("priority_by_site")).limit(1).update(
                            {"claimed": True, "last_claimed_by": worker_id},
                            return_changes="always").run()
    self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
    if result["unchanged"] == 0 and result["replaced"] == 0:
        raise brozzler.NothingToClaim
    else:
        return brozzler.Page(self.rr, result["changes"][0]["new_val"])

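# Illustrative usage sketch (an assumption, not from the original source): a
# worker might claim the highest-priority unbrozzled page and treat the
# NothingToClaim exception raised above as "this site is out of work", e.g.:
#
#     try:
#         page = frontier.claim_page(site, worker_id)
#     except brozzler.NothingToClaim:
#         page = None  # no unbrozzled pages left to claim for this site
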
def test_scope_and_schedule_outlinks():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    parent_page = brozzler.Page(rr, {
        'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',
    ]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    assert sorted(parent_page.outlinks['rejected']) == [
            'https://example.com/', 'https://example.com/foo']
    assert sorted(parent_page.outlinks['accepted']) == [
            'http://example.com/BAZZZZ', 'http://example.com/BAr',
            'http://example.com/bar']
    assert parent_page.outlinks['blocked'] == []

    pp = brozzler.Page.load(rr, parent_page.id)
    assert pp == parent_page

    for url in parent_page.outlinks['rejected']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id) is None
    for url in parent_page.outlinks['accepted']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id)

def new_site(frontier, site): site.id = str(uuid.uuid4()) logging.info("new site {}".format(site)) try: # insert the Page into the database before the Site, to avoid situation # where a brozzler worker immediately claims the site, finds no pages # to crawl, and decides the site is finished try: if brozzler.is_permitted_by_robots(site, site.seed): page = brozzler.Page(site.seed, site_id=site.id, job_id=site.job_id, hops_from_seed=0, priority=1000) frontier.new_page(page) logging.info("queued page %s", page) else: logging.warn("seed url %s is blocked by robots.txt", site.seed) finally: # finally block because we want to insert the Site no matter what frontier.new_site(site) except brozzler.ReachedLimit as e: frontier.reached_limit(site, e)
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when the proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in (
            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
        site = brozzler.Site(None, {'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})
        worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
        chrome_exe = brozzler.suggest_default_chrome_exe()

        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
            with pytest.raises(brozzler.ProxyError):
                worker.brozzle_page(browser, site, page)

def scope_and_schedule_outlinks(self, site, parent_page, outlinks): decisions = {"accepted": set(), "blocked": set(), "rejected": set()} counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0} for url in outlinks or []: url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode('utf-8') urlcanon.canon.remove_fragment(url_for_crawling) if site.is_in_scope(url_for_scoping, parent_page=parent_page): if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): if not url_for_scoping.surt().startswith( site.scope["surt"].encode("utf-8")): hops_off_surt = parent_page.hops_off_surt + 1 else: hops_off_surt = 0 new_child_page = brozzler.Page( self.rr, { 'url': str(url_for_crawling), 'site_id': site.id, 'job_id': site.job_id, 'hops_from_seed': parent_page.hops_from_seed + 1, 'via_page_id': parent_page.id, 'hops_off_surt': hops_off_surt }) existing_child_page = brozzler.Page.load( self.rr, new_child_page.id) if existing_child_page: existing_child_page.priority += new_child_page.priority if hashtag and existing_child_page.hashtags: hashtags = set(existing_child_page.hashtags) hashtags.add(hashtag) existing_child_page.hashtags = list(hashtags) elif hashtag: existing_child_page.hashtags = [hashtag] existing_child_page.save() counts["updated"] += 1 else: if hashtag: new_child_page.hashtags = [ hashtag, ] new_child_page.save() counts["added"] += 1 decisions["accepted"].add(str(url_for_crawling)) else: counts["blocked"] += 1 decisions["blocked"].add(str(url_for_crawling)) else: counts["rejected"] += 1 decisions["rejected"].add(str(url_for_crawling)) parent_page.outlinks = {} for k in decisions: parent_page.outlinks[k] = list(decisions[k]) parent_page.save() self.logger.info( "%s new links added, %s existing links updated, %s links " "rejected, %s links blocked by robots from %s", counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
def test_completed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # redirect that changes scope surt
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url': 'http://example.com/b/'})
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because destination is covered
    # by the original surt
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url': 'http://example.com/a/x/'})
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because page is not the seed
    # page
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/c/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 1,
        'redirect_url': 'http://example.com/d/'})
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

def test_parent_url_scoping():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # scope rules that look at parent page url should consider both the
    # original url and the redirect url, if any, of the parent page
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
        },
        'remember_outlinks': True})
    site.save()

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    parent_page_c = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page_c, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page_c.outlinks['rejected'] == []
    assert parent_page_c.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    parent_page_c = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page_c, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page_c.outlinks['rejected'] == outlinks
    assert parent_page_c.outlinks['accepted'] == []

def test_field_defaults():
    rr = doublethink.Rethinker('localhost', db='ignoreme')

    # page
    brozzler.Page.table_ensure(rr)
    page = brozzler.Page(rr, {'hops_from_seed': 3})
    assert page.hops_from_seed == 3
    assert page.id
    assert page.brozzle_count == 0
    page.save()
    assert page.hops_from_seed == 3
    assert page.id
    assert page.brozzle_count == 0

    qage = brozzler.Page.load(rr, page.id)
    assert qage.hops_from_seed == 3
    assert qage.id == page.id
    assert qage.brozzle_count == 0
    qage.save()
    assert qage.hops_from_seed == 3
    assert qage.id == page.id
    assert qage.brozzle_count == 0
    qage.refresh()
    assert qage.hops_from_seed == 3
    assert qage.id == page.id
    assert qage.brozzle_count == 0

    # site
    brozzler.Site.table_ensure(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    assert site.id is None
    assert site.scope
    assert site.scope['surt'] == 'http://(com,example,)/'
    site.save()
    assert site.id
    assert site.scope

    tite = brozzler.Site.load(rr, site.id)
    assert tite.id == site.id
    assert tite.scope == site.scope
    tite.save()
    assert tite.id == site.id
    assert tite.scope == site.scope
    tite.refresh()
    assert tite.id == site.id
    assert tite.scope == site.scope

    # job
    brozzler.Job.table_ensure(rr)
    job = brozzler.Job(rr, {'status': 'WHUUUT'})
    assert job.status == 'WHUUUT'
    assert job.id is None
    assert job.starts_and_stops
    job.save()
    assert job.status == 'WHUUUT'
    assert job.id
    assert job.starts_and_stops

    kob = brozzler.Job.load(rr, job.id)
    assert kob.status == 'WHUUUT'
    assert kob.id
    assert kob.starts_and_stops
    kob.save()
    assert kob.status == 'WHUUUT'
    assert kob.id
    assert kob.starts_and_stops
    kob.refresh()
    assert kob.status == 'WHUUUT'
    assert kob.id
    assert kob.starts_and_stops

def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page. Opens url
    in a browser, runs some javascript behaviors, and prints outlinks.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--behavior-parameters', dest='behavior_parameters',
            default=None, help=(
                'json blob of parameters to populate the javascript behavior '
                'template, e.g. {"parameter_username":"******",'
                '"parameter_password":"******"}'))
    arg_parser.add_argument(
            '--username', dest='username', default=None,
            help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
            '--password', dest='password', default=None,
            help='use this password to try to log in if a login form is found')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--skip-extract-outlinks', dest='skip_extract_outlinks',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-visit-hashtags', dest='skip_visit_hashtags',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-youtube-dl', dest='skip_youtube_dl',
            action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    behavior_parameters = {}
    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    site = brozzler.Site(None, {
        'id': -1, 'seed': args.url,
        'behavior_parameters': behavior_parameters,
        'username': args.username, 'password': args.password})
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
            frontier=None, proxy=args.proxy,
            skip_extract_outlinks=args.skip_extract_outlinks,
            skip_visit_hashtags=args.skip_visit_hashtags,
            skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        OK_CHARS = string.ascii_letters + string.digits
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot,
                enable_youtube_dl=not args.skip_youtube_dl)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()

def page(self, id):
    result = self.r.table("pages").get(id).run()
    if result:
        return brozzler.Page(**result)
    else:
        return None