def accept_reject_or_neither(self, url, parent_page=None):
    '''
    Returns `True` (accepted), `False` (rejected), or `None` (no decision).

    `None` usually means rejected, unless `max_hops_off` comes into play.
    '''
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.semantic(url)

    if url.scheme not in (b'http', b'https'):
        # XXX doesn't belong here maybe (where? worker ignores unknown
        # schemes?)
        return False

    try_parent_urls = []
    if parent_page:
        try_parent_urls.append(urlcanon.semantic(parent_page.url))
        if parent_page.redirect_url:
            try_parent_urls.append(
                    urlcanon.semantic(parent_page.redirect_url))

    # enforce max_hops
    if (parent_page and "max_hops" in self.scope
            and parent_page.hops_from_seed >= self.scope["max_hops"]):
        return False

    # enforce reject rules
    if "blocks" in self.scope:
        for block_rule in self.scope["blocks"]:
            rule = urlcanon.MatchRule(**block_rule)
            if try_parent_urls:
                for parent_url in try_parent_urls:
                    if rule.applies(url, parent_url):
                        return False
            else:
                if rule.applies(url):
                    return False

    # honor accept rules
    for accept_rule in self.scope["accepts"]:
        rule = urlcanon.MatchRule(**accept_rule)
        if try_parent_urls:
            for parent_url in try_parent_urls:
                if rule.applies(url, parent_url):
                    return True
        else:
            if rule.applies(url):
                return True

    # no decision if we reach here
    return None
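For context, a hedged sketch of the kind of scope configuration the method above consults. The keys ("max_hops", "accepts", "blocks") are the ones the code reads; each rule dict is simply a set of urlcanon.MatchRule keyword arguments as exercised in the tests further down, and the concrete values below are invented for illustration only.

# Illustrative only: a scope dict shaped the way accept_reject_or_neither
# reads it. Every entry under 'accepts'/'blocks' is passed to
# urlcanon.MatchRule(**rule); the specific rules here are made up.
example_scope = {
    'max_hops': 3,
    'accepts': [
        {'domain': 'example.com'},
        {'url_match': 'SURT_MATCH', 'value': 'http://(com,example,cdn,'},
    ],
    'blocks': [
        {'substring': 'login.php'},
        {'regex': '^https?://example\\.com/.*calendar.*$'},
    ],
}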
def test_match_rules():
    rule = urlcanon.MatchRule(
            surt=urlcanon.semantic(b'http://example.com/foo/bar').surt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies('http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic(b'http://example.com/foo/bar').ssurt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic(
                'http://example.com/foo/bar').ssurt().decode('ascii'))
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    rule = urlcanon.MatchRule(
            url_match='REGEX_MATCH', value=b'^.*/audio_file/.*\.mp3$')
    assert not rule.applies('http://foo.com/some.mp3')
    assert rule.applies('http://foo.com/blah/audio_file/some.mp3')

    rule = urlcanon.MatchRule(
            url_match='SURT_MATCH', value=b'http://(com,vimeocdn,')
    assert rule.applies('http://a.b.vimeocdn.com/blahblah')
    assert not rule.applies('https://a.b.vimeocdn.com/blahblah')

    rule = urlcanon.MatchRule(
            url_match='STRING_MATCH', value=b'ec-media.soundcloud.com')

    rule = urlcanon.MatchRule(regex=b'^https?://twitter\.com.*$')

    rule = urlcanon.MatchRule(substring=b'facebook.com')
    assert rule.applies('https://www.facebook.com/whatevz')

    rule = urlcanon.MatchRule(
            regex=b'^https?://(www.)?youtube.com/watch?.*$',
            parent_url_regex=b'^https?://(www.)?youtube.com/user/.*$')
    assert not rule.applies('https://www.youtube.com/watch?v=dUIn5OAPS5s')
    assert rule.applies(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s',
            parent_url='https://www.youtube.com/user/SonoraSantaneraVEVO')

    rule = urlcanon.MatchRule(
            domain=b'twitter.com',
            url_match='REGEX_MATCH', value=b'^.*lang=(?!en).*$')
    assert not rule.applies('https://twitter.com/twit')
    assert not rule.applies('https://twitter.com/twit?lang=en')
    assert rule.applies('https://twitter.com/twit?lang=es')
def is_in_scope(self, url, parent_page=None):
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.semantic(url)

    try_parent_urls = []
    if parent_page:
        try_parent_urls.append(urlcanon.semantic(parent_page.url))
        if parent_page.redirect_url:
            try_parent_urls.append(
                    urlcanon.semantic(parent_page.redirect_url))

    might_accept = False
    if url.scheme not in (b'http', b'https'):
        # XXX doesn't belong here maybe (where? worker ignores unknown
        # schemes?)
        return False
    elif (parent_page and "max_hops" in self.scope
            and parent_page.hops_from_seed >= self.scope["max_hops"]):
        pass
    elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
        might_accept = True
    elif parent_page and parent_page.hops_off_surt < self.scope.get(
            "max_hops_off_surt", 0):
        might_accept = True
    elif "accepts" in self.scope:
        for accept_rule in self.scope["accepts"]:
            rule = urlcanon.MatchRule(**accept_rule)
            if try_parent_urls:
                for parent_url in try_parent_urls:
                    if rule.applies(url, parent_url):
                        might_accept = True
            else:
                if rule.applies(url):
                    might_accept = True

    if might_accept:
        if "blocks" in self.scope:
            for block_rule in self.scope["blocks"]:
                rule = urlcanon.MatchRule(**block_rule)
                if try_parent_urls:
                    for parent_url in try_parent_urls:
                        if rule.applies(url, parent_url):
                            return False
                else:
                    if rule.applies(url):
                        return False
        return True
    else:
        return False
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
    url_for_scoping = urlcanon.semantic(url)
    url_for_crawling = urlcanon.whatwg(url)
    hashtag = (url_for_crawling.hash_sign
               + url_for_crawling.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(url_for_crawling)
    page = brozzler.Page(self.rr, {
        'url': str(url_for_crawling),
        'site_id': site.id,
        'job_id': site.job_id,
        'hops_from_seed': parent_page.hops_from_seed + 1,
        'hop_path': str(parent_page.hop_path if parent_page.hop_path
                        else "") + "L",
        'via_page_id': parent_page.id,
        'via_page_url': parent_page.url,
        'hops_off_surt': hops_off,
        'hashtags': [hashtag] if hashtag else []})
    return page
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        dict of {page_id: Page} of fresh `brozzler.Page` representing in
            scope links accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    pages = {}  # {page_id: Page, ...}
    blocked = set()
    out_of_scope = set()
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        decision = site.accept_reject_or_neither(
                url_for_scoping, parent_page=parent_page)
        if decision is True:
            hops_off = 0
        elif decision is None:
            decision = parent_page.hops_off < site.scope.get(
                    'max_hops_off', 0)
            hops_off = parent_page.hops_off + 1
        if decision is True:
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                fresh_page = self._build_fresh_page(
                        site, parent_page, url, hops_off)
                if fresh_page.id in pages:
                    self._merge_page(pages[fresh_page.id], fresh_page)
                else:
                    pages[fresh_page.id] = fresh_page
            else:
                blocked.add(str(url_for_crawling))
        else:
            out_of_scope.add(str(url_for_crawling))
    return pages, blocked, out_of_scope
def _build_fresh_pages(self, site, parent_page, urls):
    '''
    Returns a dict of page_id => brozzler.Page.
    '''
    pages = {}
    for url in urls:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        hashtag = (url_for_crawling.hash_sign
                   + url_for_crawling.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(url_for_crawling)
        if not url_for_scoping.surt().startswith(
                site.scope['surt'].encode('utf-8')):
            hops_off_surt = parent_page.hops_off_surt + 1
        else:
            hops_off_surt = 0
        page = brozzler.Page(self.rr, {
            'url': str(url_for_crawling),
            'site_id': site.id,
            'job_id': site.job_id,
            'hops_from_seed': parent_page.hops_from_seed + 1,
            'via_page_id': parent_page.id,
            'hops_off_surt': hops_off_surt,
            'hashtags': []})
        if page.id in pages:
            pages[page.id].priority += page.priority
            page = pages[page.id]
        else:
            pages[page.id] = page
        if hashtag:
            page.hashtags = list(set(page.hashtags + [hashtag]))
    return pages
def _enforce_blocks(self, warcprox_meta):
    """
    Sends a 403 response and raises warcprox.RequestBlockedByRule if the
    url is blocked by a rule in warcprox_meta.
    """
    url = urlcanon.semantic(self.url)
    if warcprox_meta and "blocks" in warcprox_meta:
        for rule in warcprox_meta["blocks"]:
            block_rule = urlcanon.MatchRule(**rule)
            if block_rule.applies(url):
                body = ("request rejected by warcprox: blocked by "
                        "rule found in Warcprox-Meta header: %s"
                        % rule).encode("utf-8")
                self.send_response(403, "Forbidden")
                self.send_header("Content-Type", "text/plain;charset=utf-8")
                self.send_header("Connection", "close")
                self.send_header("Content-Length", len(body))
                response_meta = {"blocked-by-rule": rule}
                self.send_header(
                        "Warcprox-Meta",
                        json.dumps(response_meta, separators=(",", ":")))
                self.end_headers()
                if self.command != "HEAD":
                    self.wfile.write(body)
                self.connection.close()
                raise warcprox.RequestBlockedByRule(
                        "%s 403 %s %s -- blocked by rule in Warcprox-Meta "
                        "request header %s" % (
                            self.client_address[0], self.command,
                            self.url, rule))
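A hedged sketch of how a client might build the Warcprox-Meta request header that _enforce_blocks consults. Each entry under "blocks" is just a set of urlcanon.MatchRule keyword arguments, since the code above instantiates urlcanon.MatchRule(**rule); the specific rules below are invented for illustration.

import json

# Invented example rules; on the warcprox side each dict becomes
# urlcanon.MatchRule(**rule), exactly as in _enforce_blocks above.
warcprox_meta = {
    'blocks': [
        {'domain': 'tracker.example.com'},
        {'url_match': 'SURT_MATCH', 'value': 'http://(com,example,ads,'},
    ],
}
request_headers = {
    'Warcprox-Meta': json.dumps(warcprox_meta, separators=(',', ':')),
}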
def __call__(self, url):
    try:
        key = urlcanon.semantic(url).surt().decode('ascii')
        # logging.debug('%s -> %s', url, key)
        return key
    except Exception as e:
        return url
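A minimal usage sketch of this key callable (the wrapping class name below is hypothetical): canonicalizing to a SURT string lets URL variants that differ only in case or default port collapse to the same key, which is handy for sorting or grouping captures.

import urlcanon

class CanonKey:
    '''Hypothetical stand-in for the class that owns __call__ above.'''
    def __call__(self, url):
        try:
            return urlcanon.semantic(url).surt().decode('ascii')
        except Exception:
            # fall back to the raw url if it cannot be parsed/canonicalized
            return url

key = CanonKey()
# both variants should canonicalize to the same surt-form key
print(key('HTTP://Example.COM:80/foo'))
print(key('http://example.com/foo'))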
def _assemble_entry(self, recorded_url, records):
    # default in case the payload digest is not sha1 (avoids a NameError below)
    sha1base32 = None
    if recorded_url.payload_digest:
        if recorded_url.payload_digest.name == "sha1":
            sha1base32 = base64.b32encode(
                    recorded_url.payload_digest.digest()).decode("utf-8")
        else:
            self.logger.warn(
                    "digest type is %r but big captures table is indexed "
                    "by sha1", recorded_url.payload_digest.name)
    else:
        digest = hashlib.new("sha1", records[0].content[1])
        sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

    if (recorded_url.warcprox_meta
            and "captures-bucket" in recorded_url.warcprox_meta):
        bucket = recorded_url.warcprox_meta["captures-bucket"]
    else:
        bucket = "__unspecified__"

    canon_surt = urlcanon.semantic(recorded_url.url).surt().decode('ascii')

    entry = {
        # id only specified for rethinkdb partitioning
        "id": "{} {}".format(
            canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
        "abbr_canon_surt": canon_surt[:150],
        "canon_surt": canon_surt,
        "timestamp": recorded_url.timestamp.replace(tzinfo=doublethink.UTC),
        "url": recorded_url.url.decode("utf-8"),
        "offset": records[0].offset,
        "filename": os.path.basename(records[0].warc_filename),
        "warc_type": records[0].type.decode("utf-8"),
        "warc_id": records[0].id.decode("utf-8"),
        "sha1base32": sha1base32,
        "content_type": recorded_url.mimetype,
        "response_code": recorded_url.status,
        "http_method": recorded_url.method,
        "bucket": bucket,
        # compressed (or not) length of warc record including record headers
        "record_length": records[0].length,
        # count of bytes transferred over the wire, including http headers
        # if any
        "wire_bytes": recorded_url.size,
    }

    if recorded_url.warcprox_meta:
        if "dedup-ok" in recorded_url.warcprox_meta:
            entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
        if "captures-table-extra-fields" in recorded_url.warcprox_meta:
            extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
            for extra_field in extras:
                entry[extra_field] = extras[extra_field]

    return entry
def _assemble_entry(self, recorded_url, records):
    # default in case the payload digest is not sha1 (avoids a NameError below)
    sha1base32 = None
    if recorded_url.payload_digest:
        if recorded_url.payload_digest.name == "sha1":
            sha1base32 = base64.b32encode(
                    recorded_url.payload_digest.digest()).decode("utf-8")
        else:
            self.logger.warn(
                    "digest type is %r but big captures table is indexed "
                    "by sha1", recorded_url.payload_digest.name)
    else:
        digest = hashlib.new("sha1", records[0].content[1])
        sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

    if (recorded_url.warcprox_meta
            and "dedup-bucket" in recorded_url.warcprox_meta):
        bucket = recorded_url.warcprox_meta["dedup-bucket"]
    else:
        bucket = "__unspecified__"

    canon_surt = urlcanon.semantic(recorded_url.url).surt().decode('ascii')

    entry = {
        # id only specified for rethinkdb partitioning
        "id": "{} {}".format(
            canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
        "abbr_canon_surt": canon_surt[:150],
        "canon_surt": canon_surt,
        "timestamp": recorded_url.timestamp.replace(tzinfo=doublethink.UTC),
        "url": recorded_url.url.decode("utf-8"),
        "offset": records[0].offset,
        "filename": os.path.basename(records[0].warc_filename),
        "warc_type": records[0].type.decode("utf-8"),
        "warc_id": records[0].id.decode("utf-8"),
        "sha1base32": sha1base32,
        "content_type": recorded_url.mimetype,
        "response_code": recorded_url.status,
        "http_method": recorded_url.method,
        "bucket": bucket,
        # compressed (or not) length of warc record including record headers
        "record_length": records[0].length,
        # count of bytes transferred over the wire, including http headers
        # if any
        "wire_bytes": recorded_url.size,
    }

    if recorded_url.warcprox_meta:
        if "dedup-ok" in recorded_url.warcprox_meta:
            entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
        if "captures-table-extra-fields" in recorded_url.warcprox_meta:
            extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
            for extra_field in extras:
                entry[extra_field] = extras[extra_field]

    return entry
def _on_screenshot(screenshot_jpeg):
    if on_screenshot:
        on_screenshot(screenshot_jpeg)
    if self._using_warcprox(site):
        self.logger.info(
                "sending WARCPROX_WRITE_RECORD request to %s with "
                "screenshot for %s", self._proxy_for(site), page)
        thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
        self._warcprox_write_record(
                warcprox_address=self._proxy_for(site),
                url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                warc_type="resource", content_type="image/jpeg",
                payload=screenshot_jpeg,
                extra_headers=site.extra_headers())
        self._warcprox_write_record(
                warcprox_address=self._proxy_for(site),
                url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                warc_type="resource", content_type="image/jpeg",
                payload=thumbnail_jpeg,
                extra_headers=site.extra_headers())
def _on_screenshot(screenshot_png):
    if on_screenshot:
        on_screenshot(screenshot_png)
    if self._using_warcprox(site):
        self.logger.info(
                "sending WARCPROX_WRITE_RECORD request to %s with "
                "screenshot for %s", self._proxy_for(site), page)
        screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                screenshot_png)
        self._warcprox_write_record(
                warcprox_address=self._proxy_for(site),
                url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                warc_type="resource", content_type="image/jpeg",
                payload=screenshot_jpeg,
                extra_headers=site.extra_headers())
        self._warcprox_write_record(
                warcprox_address=self._proxy_for(site),
                url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                warc_type="resource", content_type="image/jpeg",
                payload=thumbnail_jpeg,
                extra_headers=site.extra_headers())
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
    url_for_scoping = urlcanon.semantic(url)
    url_for_crawling = urlcanon.whatwg(url)
    hashtag = (url_for_crawling.hash_sign
               + url_for_crawling.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(url_for_crawling)
    page = brozzler.Page(self.rr, {
        'url': str(url_for_crawling),
        'site_id': site.id,
        'job_id': site.job_id,
        'hops_from_seed': parent_page.hops_from_seed + 1,
        'via_page_id': parent_page.id,
        'hops_off_surt': hops_off,
        'hashtags': [hashtag] if hashtag else []})
    return page
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    Returns:
        list of strings

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

        Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case the return value would be
    ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
    '''
    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if 'bucket' not in bucket:
                    logging.warning(
                            'ignoring invalid stats bucket in '
                            'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode(
                                'ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append(
                                    '%s:%s' % (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    Returns:
        list of strings

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

        Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case the return value would be
    ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
    '''
    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if 'bucket' not in bucket:
                    logging.warning(
                            'ignoring invalid stats bucket in '
                            'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode('ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append(
                                    '%s:%s' % (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets
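A hedged usage sketch for the function above, assuming unravel_buckets is in scope. It feeds in the docstring's example header; because the url sits under foo.bar.com, only that tally domain should yield a '{bucket}:{domain}' substat key, per the matching logic shown.

# Example input mirroring the docstring above; the url determines which
# 'tally-domains' entries produce '{bucket}:{domain}' substat keys.
meta = {
    'stats': {
        'buckets': [
            'bucket1',
            {'bucket': 'bucket2',
             'tally-domains': ['foo.bar.com', '192.168.10.20']},
        ],
    },
}
buckets = unravel_buckets('http://foo.bar.com/some/page', meta)
# expected, following the logic above:
# ['__all__', 'bucket1', 'bucket2', 'bucket2:foo.bar.com']
print(buckets)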
def _try_youtube_dl(self, ydl, site, page):
    try:
        self.logger.info("trying youtube-dl on {}".format(page))
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        self._remember_videos(page, ydl.brozzler_spy)
        # logging.info('XXX %s', json.dumps(info))
        if self._using_warcprox(site):
            info_json = json.dumps(info, sort_keys=True, indent=4)
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested as e:
        raise
    except BaseException as e:
        if (hasattr(e, "exc_info")
                and e.exc_info[0] == youtube_dl.utils.UnsupportedError):
            pass
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and self._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying yt-dlp on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(
                    ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if (hasattr(e, "exc_info")
                and e.exc_info[0] == youtube_dl.utils.UnsupportedError):
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying youtube-dl on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if (hasattr(e, "exc_info")
                and e.exc_info[0] == youtube_dl.utils.UnsupportedError):
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        set of in scope urls (uncanonicalized) accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    in_scope = set()
    blocked = set()
    out_of_scope = set()
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        urlcanon.canon.remove_fragment(url_for_crawling)
        if site.is_in_scope(url_for_scoping, parent_page=parent_page):
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                in_scope.add(url)
            else:
                blocked.add(str(url_for_crawling))
        else:
            out_of_scope.add(str(url_for_crawling))
    return in_scope, blocked, out_of_scope
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
    decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
    counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        hashtag = (url_for_crawling.hash_sign
                   + url_for_crawling.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(url_for_crawling)
        if site.is_in_scope(url_for_scoping, parent_page=parent_page):
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                if not url_for_scoping.surt().startswith(
                        site.scope["surt"].encode("utf-8")):
                    hops_off_surt = parent_page.hops_off_surt + 1
                else:
                    hops_off_surt = 0
                new_child_page = brozzler.Page(self.rr, {
                    'url': str(url_for_crawling),
                    'site_id': site.id,
                    'job_id': site.job_id,
                    'hops_from_seed': parent_page.hops_from_seed + 1,
                    'via_page_id': parent_page.id,
                    'hops_off_surt': hops_off_surt})
                existing_child_page = brozzler.Page.load(
                        self.rr, new_child_page.id)
                if existing_child_page:
                    existing_child_page.priority += new_child_page.priority
                    if hashtag and existing_child_page.hashtags:
                        hashtags = set(existing_child_page.hashtags)
                        hashtags.add(hashtag)
                        existing_child_page.hashtags = list(hashtags)
                    elif hashtag:
                        existing_child_page.hashtags = [hashtag]
                    existing_child_page.save()
                    counts["updated"] += 1
                else:
                    if hashtag:
                        new_child_page.hashtags = [hashtag]
                    new_child_page.save()
                    counts["added"] += 1
                decisions["accepted"].add(str(url_for_crawling))
            else:
                counts["blocked"] += 1
                decisions["blocked"].add(str(url_for_crawling))
        else:
            counts["rejected"] += 1
            decisions["rejected"].add(str(url_for_crawling))

    parent_page.outlinks = {}
    for k in decisions:
        parent_page.outlinks[k] = list(decisions[k])
    parent_page.save()

    self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s",
            counts["added"], counts["updated"], counts["rejected"],
            counts["blocked"], parent_page)
def brozzler_list_captures(argv=None):
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table
    by url or sha1.
    '''
    import urlcanon

    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '-p', '--prefix', dest='prefix', action='store_true',
            help=('use prefix match for url (n.b. may not work as expected if '
                  'searching key has query string because canonicalization can '
                  'reorder query parameters)'))
    arg_parser.add_argument(
            '--yaml', dest='yaml', action='store_true',
            help=('yaml output (default is json)'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)
    arg_parser.add_argument(
            'url_or_sha1', metavar='URL_or_SHA1',
            help='url or sha1 to look up in captures table')

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    if args.url_or_sha1[:5] == 'sha1:':
        if args.prefix:
            logging.warn(
                    'ignoring supplied --prefix option which does not apply '
                    'to lookup by sha1')
        # assumes it's already base32 (XXX could detect if hex and convert)
        sha1base32 = args.url_or_sha1[5:].upper()
        reql = rr.table('captures').between(
                [sha1base32, r.minval, r.minval],
                [sha1base32, r.maxval, r.maxval],
                index='sha1_warc_type')
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()
    else:
        key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii')
        abbr_start_key = key[:150]
        if args.prefix:
            # surt is necessarily ascii and \x7f is the last ascii character
            abbr_end_key = key[:150] + '\x7f'
            end_key = key + '\x7f'
        else:
            abbr_end_key = key[:150]
            end_key = key
        reql = rr.table('captures').between(
                [abbr_start_key, r.minval], [abbr_end_key, r.maxval],
                index='abbr_canon_surt_timestamp', right_bound='closed')
        reql = reql.order_by(index='abbr_canon_surt_timestamp')
        reql = reql.filter(
                lambda capture: (capture['canon_surt'] >= key)
                                & (capture['canon_surt'] <= end_key))
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()

    if args.yaml:
        yaml.dump_all(
                results, stream=sys.stdout, explicit_start=True,
                default_flow_style=False)
    else:
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))
def brozzler_list_captures(argv=None):
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table
    by url or sha1.
    '''
    import urlcanon

    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '-p', '--prefix', dest='prefix', action='store_true',
            help=(
                'use prefix match for url (n.b. may not work as expected if '
                'searching key has query string because canonicalization can '
                'reorder query parameters)'))
    arg_parser.add_argument(
            '--yaml', dest='yaml', action='store_true',
            help=('yaml output (default is json)'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)
    arg_parser.add_argument(
            'url_or_sha1', metavar='URL_or_SHA1',
            help='url or sha1 to look up in captures table')

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    if args.url_or_sha1[:5] == 'sha1:':
        if args.prefix:
            logging.warn(
                    'ignoring supplied --prefix option which does not apply '
                    'to lookup by sha1')
        # assumes it's already base32 (XXX could detect if hex and convert)
        sha1base32 = args.url_or_sha1[5:].upper()
        reql = rr.table('captures').between(
                [sha1base32, r.minval, r.minval],
                [sha1base32, r.maxval, r.maxval],
                index='sha1_warc_type')
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()
    else:
        key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii')
        abbr_start_key = key[:150]
        if args.prefix:
            # surt is necessarily ascii and \x7f is the last ascii character
            abbr_end_key = key[:150] + '\x7f'
            end_key = key + '\x7f'
        else:
            abbr_end_key = key[:150]
            end_key = key
        reql = rr.table('captures').between(
                [abbr_start_key, r.minval], [abbr_end_key, r.maxval],
                index='abbr_canon_surt_timestamp', right_bound='closed')
        reql = reql.order_by(index='abbr_canon_surt_timestamp')
        reql = reql.filter(
                lambda capture: (capture['canon_surt'] >= key)
                                & (capture['canon_surt'] <= end_key))
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()

    if args.yaml:
        yaml.dump_all(
                results, stream=sys.stdout, explicit_start=True,
                default_flow_style=False)
    else:
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))
def canon_url(self):
    if not self.url:
        return None
    if self._canon_hurl is None:
        self._canon_hurl = urlcanon.semantic(self.url)
    return str(self._canon_hurl)
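A self-contained sketch of the same lazy-canonicalization property pattern outside its original class (the class name below is made up for illustration): the parsed urlcanon result is computed once on first access and cached on the instance, and only the string form is handed back to callers.

import urlcanon

class UrlHolder:
    '''Made-up illustration of the cached canon_url property above.'''
    def __init__(self, url):
        self.url = url
        self._canon_hurl = None

    @property
    def canon_url(self):
        if not self.url:
            return None
        if self._canon_hurl is None:
            # canonicalize once, reuse the parsed result on later accesses
            self._canon_hurl = urlcanon.semantic(self.url)
        return str(self._canon_hurl)

print(UrlHolder('HTTP://Example.com/a/./b').canon_url)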