コード例 #1
0
ファイル: worker.py プロジェクト: internetarchive/brozzler
 def _try_youtube_dl(self, ydl, site, page):
     """Run youtube-dl against ``page.url`` and optionally archive its output.

     If ``self._proxy(site)`` is truthy and warcprox features are enabled
     for the site, the info returned by ``ydl.extract_info`` is dumped to
     JSON and sent to warcprox as a ``metadata`` record.

     Args:
         ydl: object exposing ``extract_info(url)`` (presumably a
             youtube-dl ``YoutubeDL`` instance -- confirm against caller).
         site: passed to ``self._proxy`` / ``self._enable_warcprox_features``
             and asked for ``extra_headers()``.
         page: only ``page.url`` is read; the object itself is logged.

     Raises:
         brozzler.ShutdownRequested: always propagated untouched.
         brozzler.ReachedLimit: when the failure wraps an
             ``urllib.error.HTTPError`` whose ``code`` is 420.
         BaseException: any other failure is re-raised unchanged, except a
             wrapped ``youtube_dl.utils.UnsupportedError``, which is
             swallowed.
     """
     try:
         self.logger.info("trying youtube-dl on %s", page)
         info = ydl.extract_info(page.url)
         if self._proxy(site) and self._enable_warcprox_features(site):
             info_json = json.dumps(info, sort_keys=True, indent=4)
             self.logger.info(
                     "sending WARCPROX_WRITE_RECORD request to warcprox "
                     "with youtube-dl json for %s", page)
             self._warcprox_write_record(
                     warcprox_address=self._proxy(site),
                     url="youtube-dl:%s" % brozzler.fixup(page.url),
                     warc_type="metadata",
                     content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                     payload=info_json.encode("utf-8"),
                     extra_headers=site.extra_headers())
     except brozzler.ShutdownRequested:
         raise
     except BaseException as e:
         # The attribute can exist but be None/empty; indexing it blindly
         # would raise TypeError here and hide the real failure.
         exc_info = getattr(e, "exc_info", None)
         if not exc_info:
             raise
         if exc_info[0] is youtube_dl.utils.UnsupportedError:
             # The extractor does not support this url; nothing to archive.
             return
         if (exc_info[0] is urllib.error.HTTPError
                 and getattr(exc_info[1], "code", None) == 420):
             raise brozzler.ReachedLimit(exc_info[1])
         raise
コード例 #2
0
ファイル: worker.py プロジェクト: internetarchive/brozzler
 def _on_screenshot(screenshot_png):
     """Dispatch a freshly captured PNG screenshot.

     A caller-supplied ``on_screenshot`` callback takes precedence and
     receives the PNG as-is.  Otherwise, when ``self._proxy(site)`` is
     truthy and warcprox features are enabled for the site, the PNG is
     converted by ``self.full_and_thumb_jpegs`` and the two results are
     written to warcprox as ``screenshot:`` and ``thumbnail:`` resource
     records.  If neither applies, the screenshot is dropped.
     """
     if on_screenshot:
         on_screenshot(screenshot_png)
         return
     if not (self._proxy(site) and self._enable_warcprox_features(site)):
         return

     self.logger.info(
             "sending WARCPROX_WRITE_RECORD request to warcprox "
             "with screenshot for %s", page)
     full_jpeg, thumb_jpeg = self.full_and_thumb_jpegs(screenshot_png)
     # Full-size record first, then the thumbnail; proxy address, fixed-up
     # url and headers are looked up afresh for each record.
     records = (
         ("screenshot:%s", full_jpeg),
         ("thumbnail:%s", thumb_jpeg),
     )
     for url_template, jpeg in records:
         self._warcprox_write_record(
                 warcprox_address=self._proxy(site),
                 url=url_template % brozzler.fixup(page.url),
                 warc_type="resource",
                 content_type="image/jpeg",
                 payload=jpeg,
                 extra_headers=site.extra_headers())