def _warcprox_write_record(
        self, warcprox_address, url, warc_type, content_type,
        payload, extra_headers=None):
    headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
    if extra_headers:
        headers.update(extra_headers)
    request = urllib.request.Request(
            url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload)

    # XXX setting request.type="http" is a hack to stop urllib from trying
    # to tunnel if url is https
    request.type = "http"
    request.set_proxy(warcprox_address, "http")

    try:
        with urllib.request.urlopen(request, timeout=600) as response:
            if response.getcode() != 204:
                self.logger.warning(
                        'got "%s %s" response on warcprox '
                        'WARCPROX_WRITE_RECORD request (expected 204)',
                        response.getcode(), response.reason)
            return request, response
    except urllib.error.HTTPError as e:
        self.logger.warning(
                'got "%s %s" response on warcprox '
                'WARCPROX_WRITE_RECORD request (expected 204)',
                e.getcode(), e.info())
        return request, None
    except urllib.error.URLError as e:
        raise brozzler.ProxyError(
                'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
    except ConnectionError as e:
        raise brozzler.ProxyError(
                'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
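A minimal usage sketch of the method above, assuming a `worker` object that exposes it and a warcprox instance listening on `localhost:8000` (both hypothetical here); the youtube-dl code later in this listing calls it the same way. The method returns `(request, response)`, with `response` set to `None` when warcprox answered with an HTTP error instead of 204.

import json
import logging

# Hypothetical values; the "metadata:..." record URL and the payload are only
# for illustration, not taken from the listing above.
request, response = worker._warcprox_write_record(
        warcprox_address="localhost:8000",
        url="metadata:https://example.com/",
        warc_type="metadata",
        content_type="application/json;charset=utf-8",
        payload=json.dumps({"note": "example"}).encode("utf-8"))
if response is None:
    # warcprox returned an HTTP error other than 204, so no record was written
    logging.warning("warcprox did not accept the record for %s", request.full_url)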
def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

    In case of problems fetching robots.txt, different things can happen.
    Reppy (the robots.txt parsing library) handles some exceptions internally
    and applies an appropriate policy. It bubbles up other exceptions. Of
    these, there are two kinds that this function raises for the caller to
    handle, described below. Yet other types of exceptions are caught, and
    the fetch is retried up to 10 times; if every attempt fails, the function
    returns `False` (i.e. forbidden by robots).

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
            by robots.txt, `False` otherwise

    Raises:
        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
        brozzler.ProxyError: if the proxy is down
    '''
    if site.ignore_robots:
        return True

    tries_left = 10
    while True:
        try:
            result = _robots_cache(site, proxy).allowed(
                    url, site.user_agent or "brozzler")
            return result
        except Exception as e:
            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
                    e.args[0], brozzler.ReachedLimit):
                raise e.args[0]
            elif hasattr(e, 'args') and isinstance(
                    e.args[0], requests.exceptions.ProxyError):
                # reppy has wrapped an exception that we want to bubble up
                raise brozzler.ProxyError(e)
            else:
                if tries_left > 0:
                    logging.warning(
                            "caught exception fetching robots.txt (%r tries "
                            "left) for %r: %r", tries_left, url, e)
                    tries_left -= 1
                else:
                    logging.error(
                            "caught exception fetching robots.txt (0 tries "
                            "left) for %r: %r", url, e, exc_info=True)
                    return False
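For context, a caller might consume the contract documented above roughly as follows; the wrapper function and log message are an illustrative sketch, not code from the listing:

import logging
import brozzler

def robots_gate(site, url, proxy=None):
    # Sketch of a caller honoring the documented behavior of
    # is_permitted_by_robots(); only that function comes from the listing.
    try:
        return is_permitted_by_robots(site, url, proxy=proxy)
    except brozzler.ReachedLimit:
        # warcprox answered 420 Reached Limit; stop scheduling work for this site
        raise
    except brozzler.ProxyError:
        # the proxy appears to be down; let the worker back off and retry later
        logging.warning("proxy error checking robots.txt for %r", url)
        raise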
def _fetch_url(self, site, url):
    proxies = None
    if self._proxy_for(site):
        proxies = {
            'http': 'http://%s' % self._proxy_for(site),
            'https': 'http://%s' % self._proxy_for(site),
        }

    self.logger.info('fetching %s', url)
    try:
        # response is ignored
        requests.get(
                url, proxies=proxies, headers=site.extra_headers(),
                verify=False)
    except requests.exceptions.ProxyError as e:
        raise brozzler.ProxyError(
                'proxy error fetching %s' % url) from e
def _try_youtube_dl(self, ydl, site, page):
    try:
        self.logger.info("trying youtube-dl on {}".format(page))

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        self._remember_videos(page, ydl.brozzler_spy)
        # logging.info('XXX %s', json.dumps(info))
        if self._using_warcprox(site):
            info_json = json.dumps(info, sort_keys=True, indent=4)
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested as e:
        raise
    except BaseException as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            pass
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and self._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _proxy_for(self, site):
    if self._proxy:
        return self._proxy
    elif site.proxy:
        return site.proxy
    elif self._warcprox_auto:
        svc = self._choose_warcprox()
        if svc is None:
            raise brozzler.ProxyError(
                    'no available instances of warcprox in the service '
                    'registry')
        site.proxy = '%s:%s' % (svc['host'], svc['port'])
        site.save()
        self.logger.info(
                'chose warcprox instance %r from service registry for %r',
                site.proxy, site)
        return site.proxy
    return None
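The precedence implemented above (worker-level proxy, then `site.proxy`, then a warcprox instance chosen from the service registry when auto mode is on) can be summarized with a small stand-in sketch; nothing here besides the described ordering comes from the listing:

def _effective_proxy(worker_proxy, site_proxy, auto_choice):
    # Stand-in for the precedence in _proxy_for(): a worker-wide proxy wins,
    # then a proxy already recorded on the site, then (in auto mode) whatever
    # warcprox instance the service registry offered; otherwise no proxy.
    return worker_proxy or site_proxy or auto_choice

assert _effective_proxy("wp1:8000", "wp2:8000", None) == "wp1:8000"
assert _effective_proxy(None, "wp2:8000", "wp3:8000") == "wp2:8000"
assert _effective_proxy(None, None, None) is None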
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying yt-dlp on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(
                    ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
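Unlike the earlier youtube-dl variant, this yt-dlp version returns the sanitized info dict, or `None` when yt-dlp reports the URL as unsupported. A hypothetical caller might branch on that return value roughly like this; `worker`, `ydl`, `site`, and `page` are assumed to be in scope:

import logging

ie_result = _try_youtube_dl(worker, ydl, site, page)
if ie_result is None:
    # yt-dlp could not handle the URL; continue with normal browser brozzling
    pass
else:
    logging.info("yt-dlp extracted media info for %s", page)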
def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

    Treats any kind of error fetching robots.txt as "allow all". See
    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
    for some background on that policy.

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
            by robots.txt, `False` otherwise

    Raises:
        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
        brozzler.ProxyError: if the proxy is down
    '''
    if site.ignore_robots:
        return True
    try:
        result = _robots_cache(site, proxy).allowed(
                url, site.user_agent or "brozzler")
        return result
    except Exception as e:
        if isinstance(e, reppy.exceptions.ServerError) and isinstance(
                e.args[0], brozzler.ReachedLimit):
            raise e.args[0]
        elif hasattr(e, 'args') and isinstance(
                e.args[0], requests.exceptions.ProxyError):
            # reppy has wrapped an exception that we want to bubble up
            raise brozzler.ProxyError(e)
        else:
            logging.warning(
                    "returning true (permitted) after problem fetching "
                    "robots.txt for %r: %r", url, e)
            return True