Example #1
    def _warcprox_write_record(
            self, warcprox_address, url, warc_type, content_type,
            payload, extra_headers=None):
        headers = {
                "Content-Type": content_type,
                "WARC-Type": warc_type,
                "Host": "N/A",
        }
        if extra_headers:
            headers.update(extra_headers)
        request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
                headers=headers, data=payload)

        # XXX setting request.type="http" is a hack to stop urllib from trying
        # to tunnel if url is https
        request.type = "http"
        request.set_proxy(warcprox_address, "http")

        try:
            with urllib.request.urlopen(request, timeout=600) as response:
                if response.getcode() != 204:
                    self.logger.warning(
                            'got "%s %s" response on warcprox '
                            'WARCPROX_WRITE_RECORD request (expected 204)',
                            response.getcode(), response.reason)
                return request, response
        except urllib.error.HTTPError as e:
            self.logger.warning(
                    'got "%s %s" response on warcprox '
                    'WARCPROX_WRITE_RECORD request (expected 204)',
                    e.getcode(), e.info())
            return request, None
        except (urllib.error.URLError, ConnectionError) as e:
            raise brozzler.ProxyError(
                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
Example #2
def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

    In case of problems fetching robots.txt, behavior depends on the kind of
    failure. Reppy (the robots.txt parsing library) handles some exceptions
    internally and applies an appropriate policy. Other exceptions it bubbles
    up; of those, two kinds are re-raised for the caller to handle, described
    under "Raises" below. Any remaining exception is caught and the fetch is
    retried up to 10 times; after the final failed attempt, the function
    returns `False` (i.e. forbidden by robots).

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
            by robots.txt, `False` otherwise

    Raises:
        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
        brozzler.ProxyError: if the proxy is down
    '''
    if site.ignore_robots:
        return True

    tries_left = 10
    while True:
        try:
            result = _robots_cache(site, proxy).allowed(
                url, site.user_agent or "brozzler")
            return result
        except Exception as e:
            if (isinstance(e, reppy.exceptions.ServerError) and e.args
                    and isinstance(e.args[0], brozzler.ReachedLimit)):
                raise e.args[0]
            elif e.args and isinstance(
                    e.args[0], requests.exceptions.ProxyError):
                # reppy has wrapped an exception that we want to bubble up
                raise brozzler.ProxyError(e)
            else:
                if tries_left > 0:
                    logging.warning(
                        "caught exception fetching robots.txt (%r tries "
                        "left) for %r: %r", tries_left, url, e)
                    tries_left -= 1
                else:
                    logging.error(
                        "caught exception fetching robots.txt (0 tries "
                        "left) for %r: %r",
                        url,
                        e,
                        exc_info=True)
                    return False
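A sketch of a call site for the function above, assuming the caller wants to
treat warcprox limits and proxy trouble differently from a plain robots.txt
denial (the surrounding names are hypothetical):

    import logging
    import brozzler

    def robots_gate(site, url):  # hypothetical wrapper
        try:
            if not is_permitted_by_robots(site, url):
                logging.info("blocked by robots.txt: %s", url)
                return False
            return True
        except brozzler.ReachedLimit:
            raise  # warcprox said 420 Reached Limit: stop crawling this site
        except brozzler.ProxyError:
            raise  # proxy is down: retry the page later, don't mark it blocked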
Example #3
    def _fetch_url(self, site, url):
        proxy = self._proxy_for(site)
        proxies = None
        if proxy:
            proxies = {
                'http': 'http://%s' % proxy,
                'https': 'http://%s' % proxy,
            }

        self.logger.info('fetching %s', url)
        try:
            # response is ignored
            requests.get(
                    url, proxies=proxies, headers=site.extra_headers(),
                    verify=False)
        except requests.exceptions.ProxyError as e:
            raise brozzler.ProxyError(
                    'proxy error fetching %s' % url) from e
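A standalone sketch of the same proxy wiring, with a hypothetical address.
Both the http and https keys point at a plain-http proxy url because warcprox
speaks HTTP and man-in-the-middles TLS traffic, which is also why the code
above passes verify=False:

    import requests

    proxies = {
        'http': 'http://127.0.0.1:8000',   # assumed warcprox address
        'https': 'http://127.0.0.1:8000',  # https also routed via the http proxy
    }
    # warcprox re-signs certificates with its own CA, so verification must be off
    requests.get('https://example.com/', proxies=proxies, verify=False)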
Example #4
    def _try_youtube_dl(self, ydl, site, page):
        try:
            self.logger.info("trying youtube-dl on {}".format(page))

            with brozzler.thread_accept_exceptions():
                # we do whatwg canonicalization here to avoid "<urlopen error
                # no host given>" resulting in ProxyError
                # needs automated test
                info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            self._remember_videos(page, ydl.brozzler_spy)
            # logging.info('XXX %s', json.dumps(info))
            if self._using_warcprox(site):
                info_json = json.dumps(info, sort_keys=True, indent=4)
                self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
                self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        except brozzler.ShutdownRequested:
            raise
        except BaseException as e:
            if hasattr(
                    e, "exc_info"
            ) and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                pass
            elif (hasattr(e, "exc_info")
                  and e.exc_info[0] == urllib.error.HTTPError
                  and hasattr(e.exc_info[1], "code")
                  and e.exc_info[1].code == 420):
                raise brozzler.ReachedLimit(e.exc_info[1])
            elif (hasattr(e, 'exc_info')
                  and e.exc_info[0] == urllib.error.URLError
                  and self._proxy_for(site)):
                # connection problem when using a proxy == proxy error (XXX?)
                raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
            else:
                raise
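A sketch of the wrapping that the hasattr(e, "exc_info") checks above rely on;
youtube-dl's DownloadError keeps the original sys.exc_info() tuple on the
exception (this reproduction is illustrative, not taken from the brozzler code):

    import sys
    import urllib.error
    import youtube_dl

    try:
        raise urllib.error.URLError('proxy refused connection')  # simulated failure
    except Exception:
        wrapped = youtube_dl.utils.DownloadError(
                'simulated', exc_info=sys.exc_info())
    # this is the shape the except-clauses above test for
    assert wrapped.exc_info[0] == urllib.error.URLError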
Example #5
    def _proxy_for(self, site):
        if self._proxy:
            return self._proxy
        elif site.proxy:
            return site.proxy
        elif self._warcprox_auto:
            svc = self._choose_warcprox()
            if svc is None:
                raise brozzler.ProxyError(
                        'no available instances of warcprox in the service '
                        'registry')
            site.proxy = '%s:%s' % (svc['host'], svc['port'])
            site.save()
            self.logger.info(
                    'chose warcprox instance %r from service registry for %r',
                    site.proxy, site)
            return site.proxy
        return None
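A hypothetical call site; in warcprox-auto mode the method can raise
brozzler.ProxyError when the service registry has no live warcprox instance:

    import brozzler

    try:
        proxy = worker._proxy_for(site)  # hypothetical worker and site objects
        if proxy:
            worker.logger.info('brozzling %r through proxy %s', site, proxy)
    except brozzler.ProxyError:
        pass  # no warcprox available right now; back off and retry later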
Example #6
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying yt-dlp on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(
                    ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested:
        raise
    except Exception as e:
        if (hasattr(e, "exc_info")
                and e.exc_info[0] == youtube_dl.utils.UnsupportedError):
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
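A hypothetical call site for the yt-dlp variant above, showing how its three
outcomes (a result dict, None for unsupported urls, or an exception) might be
told apart:

    import logging
    import brozzler

    try:
        ie_result = _try_youtube_dl(worker, ydl, site, page)
        if ie_result:
            logging.info(
                    'yt-dlp found %s formats for %s',
                    len(ie_result.get('formats', [])), page)
    except brozzler.ReachedLimit:
        raise  # warcprox responded 420; stop crawling this site
    except brozzler.ProxyError:
        raise  # apparent proxy trouble; the page can be retried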
Example #7
def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

    Except for the two cases described under "Raises" below, treats any error
    fetching robots.txt as "allow all". See
    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
    for some background on that policy.

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
            by robots.txt, `False` otherwise

    Raises:
        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
        brozzler.ProxyError: if the proxy is down
    '''
    if site.ignore_robots:
        return True

    try:
        result = _robots_cache(site, proxy).allowed(
                url, site.user_agent or "brozzler")
        return result
    except Exception as e:
        if (isinstance(e, reppy.exceptions.ServerError) and e.args
                and isinstance(e.args[0], brozzler.ReachedLimit)):
            raise e.args[0]
        elif e.args and isinstance(
                e.args[0], requests.exceptions.ProxyError):
            # reppy has wrapped an exception that we want to bubble up
            raise brozzler.ProxyError(e)
        else:
            logging.warning(
                    "returning true (permitted) after problem fetching "
                    "robots.txt for %r: %r", url, e)
            return True
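A minimal sketch exercising the ignore_robots short-circuit above, using a
hypothetical stub in place of a real brozzler site object:

    class StubSite:
        # only the attributes the short-circuit path touches
        ignore_robots = True
        user_agent = None

    # returns True before any robots.txt fetch is attempted
    assert is_permitted_by_robots(StubSite(), 'https://example.com/') is True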