Beispiel #1
0
 def get(self, url, *args, **kwargs):
     """
     Delegate a GET to the parent class and surface warcprox limit responses.

     All arguments are forwarded unchanged to ``super().get``. A response
     whose ``status_code`` is 420 and whose headers contain
     ``warcprox-meta`` is converted into an exception; every other response
     is returned to the caller untouched.

     Returns:
         The response object produced by ``super().get``.

     Raises:
         brozzler.ReachedLimit: for a 420 response carrying a
             ``warcprox-meta`` header. The header value is parsed as JSON
             and passed as ``warcprox_meta``; the response text is passed as
             ``http_payload``.
     """
     response = super().get(url, *args, **kwargs)
     limit_hit = (
         response.status_code == 420
         and 'warcprox-meta' in response.headers)
     if not limit_hit:
         return response
     meta = json.loads(response.headers['warcprox-meta'])
     raise brozzler.ReachedLimit(
         warcprox_meta=meta, http_payload=response.text)
Beispiel #2
0
 def _network_response_received(self, message):
     """
     Handle a network-response event, recording the first warcprox limit hit.

     While ``self._reached_limit`` is still falsy, a response with status 420
     and a ``Warcprox-Meta`` header (looked up case-insensitively) has that
     header parsed as JSON and stored on ``self._reached_limit`` wrapped in
     a ``brozzler.ReachedLimit``. Nothing is raised here; the exception
     object is only stored and logged. Afterwards ``self.on_response``, when
     set, is called with the unmodified message for every event.

     Args:
         message: mapping for the event. ``message["params"]["response"]``
             is read for its ``"status"`` and, only when the status is 420,
             its ``"headers"``.
     """
     if not self._reached_limit:
         response = message["params"]["response"]
         if response["status"] == 420:
             # Build the case-insensitive view once and reuse it for both
             # the membership test and the lookup.
             headers = CaseInsensitiveDict(response["headers"])
             if "Warcprox-Meta" in headers:
                 self._reached_limit = brozzler.ReachedLimit(
                     warcprox_meta=json.loads(headers["Warcprox-Meta"]))
                 self.logger.info("reached limit %s", self._reached_limit)
     if self.on_response:
         self.on_response(message)
Beispiel #3
0
    def _try_youtube_dl(self, ydl, site, page):
        """
        Run youtube-dl on ``page`` and, when using warcprox, archive its json.

        The page url is whatwg-canonicalized and handed to
        ``ydl.extract_info``; ``ydl.brozzler_spy`` is then passed to
        ``self._remember_videos``. If ``self._using_warcprox(site)`` is
        truthy, the extracted info is serialized as json and sent through
        ``self._warcprox_write_record`` as a ``metadata`` record.

        Args:
            ydl: youtube-dl object providing ``extract_info`` and
                ``brozzler_spy``.
            site: site being crawled; used for proxy selection and extra
                headers.
            page: page being crawled; ``page.url`` is the url handed to
                youtube-dl.

        Returns:
            None. An exception whose ``exc_info[0]`` is
            ``youtube_dl.utils.UnsupportedError`` is swallowed.

        Raises:
            brozzler.ShutdownRequested: re-raised untouched.
            brozzler.ReachedLimit: when the exception wraps a
                ``urllib.error.HTTPError`` whose ``code`` is 420.
            brozzler.ProxyError: when the exception wraps a
                ``urllib.error.URLError`` and ``self._proxy_for(site)`` is
                truthy.
            BaseException: anything else is re-raised as is.
        """
        try:
            self.logger.info("trying youtube-dl on %s", page)

            with brozzler.thread_accept_exceptions():
                # we do whatwg canonicalization here to avoid "<urlopen error
                # no host given>" resulting in ProxyError
                # needs automated test
                info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            self._remember_videos(page, ydl.brozzler_spy)
            if self._using_warcprox(site):
                info_json = json.dumps(info, sort_keys=True, indent=4)
                self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
                self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type=
                    "application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        except brozzler.ShutdownRequested:
            # must bypass the generic handler below
            raise
        except BaseException as e:
            # The attribute may be missing, None or empty; normalise it once
            # so that inspecting it can never raise from inside this handler
            # and mask the real error.
            exc_info = getattr(e, "exc_info", None)
            exc_type = exc_info[0] if exc_info else None

            # Exact class comparisons are intentional: HTTPError is a
            # subclass of URLError, and an HTTPError with any status other
            # than 420 must fall through to the bare re-raise rather than be
            # reported as a proxy error.
            if exc_type == youtube_dl.utils.UnsupportedError:
                pass
            elif (exc_type == urllib.error.HTTPError
                  and hasattr(exc_info[1], "code")
                  and exc_info[1].code == 420):
                raise brozzler.ReachedLimit(exc_info[1]) from e
            elif (exc_type == urllib.error.URLError
                  and self._proxy_for(site)):
                # connection problem when using a proxy == proxy error (XXX?)
                raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
            else:
                raise
Beispiel #4
0
 def _network_response_received(self, message):
     """
     Handle a network-response event and react to a warcprox limit response.

     A response with status 420 and a ``Warcprox-Meta`` header (looked up
     case-insensitively) is treated as a limit notification:

     * if ``self.reached_limit`` is still falsy, the header is parsed as
       JSON, stored on ``self.reached_limit`` wrapped in a
       ``brozzler.ReachedLimit``, logged, and ``brozzler.thread_raise`` is
       called with ``self.calling_thread`` and the ``ReachedLimit`` class
       (presumably to raise it in that thread; see ``thread_raise``);
     * otherwise only a log line is written.

     Afterwards ``self.on_response``, when set, is called with the
     unmodified message for every event.

     Args:
         message: mapping for the event. ``message['params']['response']``
             is read for its ``'status'`` and, only when the status is 420,
             its ``'headers'``.
     """
     response = message['params']['response']
     if response['status'] == 420:
         # Build the case-insensitive view once and reuse it for both the
         # membership test and the lookup.
         headers = CaseInsensitiveDict(response['headers'])
         if 'Warcprox-Meta' in headers:
             if self.reached_limit:
                 self.logger.info(
                         'reached limit but self.reached_limit is already '
                         'set, assuming the calling thread is already '
                         'handling this')
             else:
                 self.reached_limit = brozzler.ReachedLimit(
                         warcprox_meta=json.loads(headers['Warcprox-Meta']))
                 self.logger.info('reached limit %s', self.reached_limit)
                 brozzler.thread_raise(
                         self.calling_thread, brozzler.ReachedLimit)
     if self.on_response:
         self.on_response(message)
Beispiel #5
0
def _try_youtube_dl(worker, ydl, site, page):
    """
    Run yt-dlp on ``page`` and, when using warcprox, archive the json result.

    The page url is whatwg-canonicalized, handed to ``ydl.extract_info`` and
    the result passed through ``ydl.sanitize_info``. ``ydl.fetch_spy.fetches``
    and ``ydl.stitch_ups`` are then passed to ``_remember_videos``. If
    ``worker._using_warcprox(site)`` is truthy, the sanitized info is
    serialized as json and sent through ``worker._warcprox_write_record`` as
    a ``metadata`` record.

    Args:
        worker: object providing ``_using_warcprox``, ``_proxy_for`` and
            ``_warcprox_write_record``.
        ydl: yt-dlp object providing ``extract_info``, ``sanitize_info``,
            ``fetch_spy`` and ``stitch_ups``.
        site: site being crawled; used for proxy selection and extra headers.
        page: page being crawled; ``page.url`` is the url handed to yt-dlp.

    Returns:
        The sanitized info returned by yt-dlp, or None when the exception's
        ``exc_info[0]`` is ``youtube_dl.utils.UnsupportedError``.

    Raises:
        brozzler.ShutdownRequested: re-raised untouched.
        brozzler.ReachedLimit: when the exception wraps a
            ``urllib.error.HTTPError`` whose ``code`` is 420.
        brozzler.ProxyError: when the exception wraps a
            ``urllib.error.URLError`` and ``worker._proxy_for(site)`` is
            truthy.
        Exception: anything else is re-raised as is.
    """
    try:
        logging.info("trying yt-dlp on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(
                    ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested:
        # must bypass the generic handler below
        raise
    except Exception as e:
        # The attribute may be missing, None or empty; normalise it once so
        # that inspecting it can never raise from inside this handler and
        # mask the real error.
        exc_info = getattr(e, "exc_info", None)
        exc_type = exc_info[0] if exc_info else None

        # Exact class comparisons are intentional: HTTPError is a subclass of
        # URLError, and an HTTPError with any status other than 420 must fall
        # through to the bare re-raise rather than be reported as a proxy
        # error.
        if exc_type == youtube_dl.utils.UnsupportedError:
            return None
        elif (exc_type == urllib.error.HTTPError
                and hasattr(exc_info[1], "code")
                and exc_info[1].code == 420):
            raise brozzler.ReachedLimit(exc_info[1]) from e
        elif (exc_type == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise