Beispiel #1
0
    def brozzle_page(self, browser, site, page, on_screenshot=None):
        """
        Brozzle one page: try youtube-dl on it, then browse it or fetch it.

        youtube-dl runs first. Exceptions it raises are logged and do not
        abort the page, except `brozzler.ReachedLimit` and
        `brozzler.ShutdownRequested`, which are re-raised. Afterwards the page
        is loaded in `browser` when `self._needs_browsing()` says so, or when
        youtube-dl could not be set up at all. Otherwise it is fetched with
        `self._fetch_url()`, unless `self._already_fetched()` reports that it
        was already retrieved.

        Args:
            browser: browser used to load the page; started here if it is
                not already running
            site: site the page belongs to
            page: page to brozzle
            on_screenshot: optional callback invoked with the screenshot png;
                when supplied, the screenshot and thumbnail records are NOT
                written to warcprox by this method

        Returns:
            the outlinks reported by `browser.browse_page()` if the page was
            browsed, otherwise an empty list
        """
        def _on_screenshot(screenshot_png):
            if on_screenshot:
                on_screenshot(screenshot_png)
            elif self._proxy(site) and self._enable_warcprox_features(site):
                self.logger.info("sending WARCPROX_WRITE_RECORD request "
                                 "to warcprox with screenshot for %s", page)
                screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                        screenshot_png)
                self._warcprox_write_record(warcprox_address=self._proxy(site),
                        url="screenshot:%s" % brozzler.fixup(page.url),
                        warc_type="resource", content_type="image/jpeg",
                        payload=screenshot_jpeg,
                        extra_headers=site.extra_headers())
                self._warcprox_write_record(warcprox_address=self._proxy(site),
                        url="thumbnail:%s" % brozzler.fixup(page.url),
                        warc_type="resource", content_type="image/jpeg",
                        payload=thumbnail_jpeg,
                        extra_headers=site.extra_headers())

        self.logger.info("brozzling {}".format(page))
        # Stays None when youtube-dl setup fails before the spy exists; the
        # exception is swallowed below, so the name must be bound regardless.
        ydl_spy = None
        try:
            with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
                ydl = self._youtube_dl(tempdir, site)
                ydl_spy = ydl.brozzler_spy # remember for later
                self._try_youtube_dl(ydl, site, page)
        except (brozzler.ReachedLimit, brozzler.ShutdownRequested):
            raise
        except Exception as e:
            # An underlying error carrying code 430 is logged at info level
            # without a traceback; anything else is logged as an error.
            if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                    and hasattr(e.exc_info[1], 'code')
                    and e.exc_info[1].code == 430):
                self.logger.info(
                        'youtube-dl got %s %s processing %s',
                        e.exc_info[1].code, e.exc_info[1].msg, page.url)
            else:
                self.logger.error(
                        "youtube_dl raised exception on %s", page, exc_info=True)

        # Without a spy there is no record of what youtube-dl fetched, so
        # fall back to browsing the page.
        if ydl_spy is None or self._needs_browsing(page, ydl_spy):
            self.logger.info('needs browsing: %s', page)
            if not browser.is_running():
                browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
            final_page_url, outlinks = browser.browse_page(
                    page.url, extra_headers=site.extra_headers(),
                    behavior_parameters=site.behavior_parameters,
                    user_agent=site.user_agent,
                    on_screenshot=_on_screenshot)
            if final_page_url != page.url:
                page.note_redirect(final_page_url)
            return outlinks
        else:
            if not self._already_fetched(page, ydl_spy):
                self.logger.info('needs fetch: %s', page)
                self._fetch_url(site, page)
            else:
                self.logger.info('already fetched: %s', page)
            return []
Beispiel #2
0
 def _brozzle_site(self, browser, ydl, site):
     """
     Run one brozzling session for `site` using `browser`.

     Pages are claimed from the frontier and brozzled one after another
     until shutdown is requested, 60 seconds have elapsed since the session
     started (checked only between pages), or an exception ends the loop.
     Exceptions are handled here and not propagated: running out of pages
     and an aborted browsing session are logged, a reached limit is
     reported to the frontier, and anything else is logged as critical.

     Whatever happens, the browser is stopped, the site is disclaimed and
     the browser is released back to the pool.

     Args:
         browser: browser to brozzle with; started at the beginning of the
             session
         ydl: passed through to `self.brozzle_page()`
         site: site whose pages are brozzled
     """
     start = time.time()
     # Holds the page currently being worked on; reset to None once the page
     # is completed, so the cleanup below only sees an unfinished page.
     page = None
     try:
         browser.start(proxy=site.proxy)
         while not self._shutdown_requested.is_set() and time.time() - start < 60:
             page = self._frontier.claim_page(site, self._id)
             outlinks = self.brozzle_page(browser, ydl, site, page)
             self._frontier.completed_page(site, page)
             self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
             page = None
     except brozzler.NothingToClaim:
         self.logger.info("no pages left for site %s", site)
     except brozzler.ReachedLimit as e:
         self._frontier.reached_limit(site, e)
     except brozzler.browser.BrowsingAborted:
         self.logger.info("{} shut down".format(browser))
     except:
         # Catch-all: log anything unexpected instead of letting it
         # propagate out of this method.
         self.logger.critical("unexpected exception", exc_info=True)
     finally:
         self.logger.info("finished session brozzling site, stopping browser and disclaiming site")
         # Nested try/finally so that a failure in one cleanup step cannot
         # skip the remaining ones (a browser that fails to stop must still
         # be released, and the site must still be disclaimed).
         try:
             browser.stop()
         finally:
             try:
                 self._frontier.disclaim_site(site, page)
             finally:
                 self._browser_pool.release(browser)
Beispiel #3
0
    def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
        """
        Load `page` in `browser` and return the outlinks the browser reports.

        Callbacks handed to `browser.browse_page()` do the following while
        the page loads:
          - each screenshot is forwarded to `on_screenshot` (if given) and,
            when `self._using_warcprox(site)` is true, the screenshot and a
            thumbnail of it are written to warcprox as "resource" records
          - every response with a "video/*" mime type (other than a DASH
            manifest) and status 200 or 206 is appended to `page.videos`
          - each service worker script url is fetched once with
            `self._fetch_url()`

        The browser is started first if it is not running. If the final url
        differs from `page.url`, `page.note_redirect()` is called with it.
        """
        def _on_screenshot(screenshot_jpeg):
            if on_screenshot:
                on_screenshot(screenshot_jpeg)
            if not self._using_warcprox(site):
                return
            self.logger.info(
                "sending WARCPROX_WRITE_RECORD request to %s with "
                "screenshot for %s", self._proxy_for(site), page)
            thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
            # one "resource" record for the full screenshot, one for the thumb
            records = (
                ("screenshot:%s", screenshot_jpeg),
                ("thumbnail:%s", thumbnail_jpeg),
            )
            for url_template, jpeg in records:
                self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url=url_template % str(urlcanon.semantic(page.url)),
                    warc_type="resource", content_type="image/jpeg",
                    payload=jpeg,
                    extra_headers=site.extra_headers())

        def _on_response(chrome_msg):
            if 'params' not in chrome_msg:
                return
            if 'response' not in chrome_msg['params']:
                return
            response = chrome_msg['params']['response']
            if 'mimeType' not in response:
                return
            mime_type = response['mimeType']
            if not mime_type.startswith('video/'):
                return
            # skip manifests of DASH segmented video -
            # see https://github.com/internetarchive/brozzler/pull/70
            if mime_type == 'video/vnd.mpeg.dash.mpd':
                return
            status = response.get('status')
            if status not in (200, 206):
                return

            video = {
                'blame': 'browser',
                'url': response.get('url'),
                'response_code': status,
                'content-type': mime_type,
            }
            headers = CaseInsensitiveDict(response['headers'])
            if 'content-length' in headers:
                video['content-length'] = int(headers['content-length'])
            if 'content-range' in headers:
                video['content-range'] = headers['content-range']
            logging.debug('embedded video %s', video)
            if 'videos' not in page:
                page.videos = []
            page.videos.append(video)

        # service worker script urls already fetched during this page load
        fetched_sw_urls = set()
        def _on_service_worker_version_updated(chrome_msg):
            # https://github.com/internetarchive/brozzler/issues/140
            self.logger.trace('%r', chrome_msg)
            versions = chrome_msg.get('params', {}).get('versions')
            if not versions:
                return
            script_url = versions[0].get('scriptURL')
            if not script_url or script_url in fetched_sw_urls:
                return
            self.logger.info('fetching service worker script %s', script_url)
            self._fetch_url(site, script_url)
            fetched_sw_urls.add(script_url)

        if not browser.is_running():
            browser.start(
                proxy=self._proxy_for(site), cookie_db=site.get('cookie_db'))

        final_page_url, outlinks = browser.browse_page(
            page.url,
            extra_headers=site.extra_headers(),
            behavior_parameters=site.get('behavior_parameters'),
            username=site.get('username'),
            password=site.get('password'),
            user_agent=site.get('user_agent'),
            on_screenshot=_on_screenshot,
            on_response=_on_response,
            on_request=on_request,
            on_service_worker_version_updated=_on_service_worker_version_updated,
            hashtags=page.hashtags,
            skip_extract_outlinks=self._skip_extract_outlinks,
            skip_visit_hashtags=self._skip_visit_hashtags,
            skip_youtube_dl=self._skip_youtube_dl,
            simpler404=self._simpler404,
            screenshot_full_page=self._screenshot_full_page,
            page_timeout=self._page_timeout,
            behavior_timeout=self._behavior_timeout,
            download_throughput=self._download_throughput)

        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks
Beispiel #4
0
    def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
        """
        Browse `page` with `browser`; return the outlinks the browser found.

        Three callbacks are passed to `browser.browse_page()`:
          - screenshot: calls `on_screenshot` (if given) with the png and,
            when `self._using_warcprox(site)` is true, converts it with
            `self.full_and_thumb_jpegs()` and writes both jpegs to warcprox
            as "resource" records
          - response: appends a description of each "video/*" response
            (DASH manifests excluded) with status 200 or 206 to
            `page.videos`
          - service worker version update: fetches each script url once
            with `self._fetch_url()`

        Starts the browser if needed, and calls `page.note_redirect()` when
        the final url is not `page.url`.
        """
        def _on_screenshot(screenshot_png):
            if on_screenshot:
                on_screenshot(screenshot_png)
            if not self._using_warcprox(site):
                return
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to %s with "
                    "screenshot for %s", self._proxy_for(site), page)
            screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
            # fields shared by the screenshot and thumbnail records
            record_type = dict(warc_type="resource", content_type="image/jpeg")
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                    **record_type,
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                    **record_type,
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

        def _video_response(chrome_msg):
            # Returns the response dict when the message describes a video
            # response worth recording, otherwise None.
            if 'params' not in chrome_msg or 'response' not in chrome_msg['params']:
                return None
            candidate = chrome_msg['params']['response']
            if 'mimeType' not in candidate:
                return None
            mime_type = candidate['mimeType']
            # skip manifests of DASH segmented video -
            # see https://github.com/internetarchive/brozzler/pull/70
            is_video = (mime_type.startswith('video/')
                        and mime_type != 'video/vnd.mpeg.dash.mpd')
            if is_video and candidate.get('status') in (200, 206):
                return candidate
            return None

        def _on_response(chrome_msg):
            response = _video_response(chrome_msg)
            if response is None:
                return
            video = {
                'blame': 'browser',
                'url': response.get('url'),
                'response_code': response['status'],
                'content-type': response['mimeType'],
            }
            response_headers = CaseInsensitiveDict(response['headers'])
            if 'content-length' in response_headers:
                video['content-length'] = int(response_headers['content-length'])
            if 'content-range' in response_headers:
                video['content-range'] = response_headers['content-range']
            logging.debug('embedded video %s', video)
            if 'videos' not in page:
                page.videos = []
            page.videos.append(video)

        seen_worker_scripts = set()
        def _on_service_worker_version_updated(chrome_msg):
            # https://github.com/internetarchive/brozzler/issues/140
            self.logger.trace('%r', chrome_msg)
            versions = chrome_msg.get('params', {}).get('versions')
            if versions:
                script = versions[0].get('scriptURL')
                already_seen = bool(script) and script in seen_worker_scripts
                if script and not already_seen:
                    self.logger.info('fetching service worker script %s', script)
                    self._fetch_url(site, script)
                    seen_worker_scripts.add(script)

        if not browser.is_running():
            browser.start(
                    proxy=self._proxy_for(site),
                    cookie_db=site.get('cookie_db'))
        browse_result = browser.browse_page(
                page.url, extra_headers=site.extra_headers(),
                behavior_parameters=site.get('behavior_parameters'),
                username=site.get('username'), password=site.get('password'),
                user_agent=site.get('user_agent'),
                on_screenshot=_on_screenshot, on_response=_on_response,
                on_request=on_request,
                on_service_worker_version_updated=_on_service_worker_version_updated,
                hashtags=page.hashtags,
                skip_extract_outlinks=self._skip_extract_outlinks,
                skip_visit_hashtags=self._skip_visit_hashtags,
                skip_youtube_dl=self._skip_youtube_dl,
                page_timeout=self._page_timeout,
                behavior_timeout=self._behavior_timeout)
        final_page_url, outlinks = browse_result
        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks
Beispiel #5
0
    def brozzle_page(self, browser, site, page, on_screenshot=None):
        """
        Brozzle one page: try youtube-dl on it, then browse it or fetch it.

        youtube-dl runs first. Exceptions it raises are logged and do not
        abort the page, except `brozzler.ReachedLimit`, which is re-raised.
        Afterwards the page is loaded in `browser` when
        `self._needs_browsing()` says so, or when youtube-dl could not be
        set up at all. Otherwise it is fetched with `self._fetch_url()`,
        unless `self._already_fetched()` reports that it was already
        retrieved.

        Args:
            browser: browser used to load the page; started here if it is
                not already running
            site: site the page belongs to
            page: page to brozzle; `page.note_redirect` is handed to the
                browser as its `on_url_change` callback
            on_screenshot: optional callback invoked with the screenshot png;
                when supplied, the screenshot and thumbnail records are NOT
                written to warcprox by this method

        Returns:
            whatever `browser.browse_page()` returns if the page was
            browsed, otherwise an empty list
        """
        def _on_screenshot(screenshot_png):
            if on_screenshot:
                on_screenshot(screenshot_png)
            elif self._proxy(site) and self._enable_warcprox_features(site):
                self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request "
                    "to warcprox with screenshot for %s", page)
                screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
                self._warcprox_write_record(warcprox_address=self._proxy(site),
                                            url="screenshot:{}".format(
                                                page.url),
                                            warc_type="resource",
                                            content_type="image/jpeg",
                                            payload=screenshot_jpeg,
                                            extra_headers=site.extra_headers())
                self._warcprox_write_record(warcprox_address=self._proxy(site),
                                            url="thumbnail:{}".format(
                                                page.url),
                                            warc_type="resource",
                                            content_type="image/jpeg",
                                            payload=thumbnail_jpeg,
                                            extra_headers=site.extra_headers())

        self.logger.info("brozzling {}".format(page))
        # Stays None when youtube-dl setup fails before the spy exists; the
        # exception is swallowed below, so the name must be bound regardless.
        ydl_spy = None
        try:
            with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
                ydl = self._youtube_dl(tempdir, site)
                ydl_spy = ydl.brozzler_spy  # remember for later
                self._try_youtube_dl(ydl, site, page)
        except brozzler.ReachedLimit:
            raise
        except Exception as e:
            # An underlying error carrying code 430 is logged at info level
            # without a traceback; anything else is logged as an error.
            if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                    and hasattr(e.exc_info[1], 'code')
                    and e.exc_info[1].code == 430):
                self.logger.info('youtube-dl got %s %s processing %s',
                                 e.exc_info[1].code, e.exc_info[1].msg,
                                 page.url)
            else:
                self.logger.error("youtube_dl raised exception on %s",
                                  page,
                                  exc_info=True)

        # Without a spy there is no record of what youtube-dl fetched, so
        # fall back to browsing the page.
        if ydl_spy is None or self._needs_browsing(page, ydl_spy):
            self.logger.info('needs browsing: %s', page)
            if not browser.is_running():
                browser.start(proxy=self._proxy(site),
                              cookie_db=site.cookie_db)
            outlinks = browser.browse_page(page.url,
                                           extra_headers=site.extra_headers(),
                                           on_screenshot=_on_screenshot,
                                           on_url_change=page.note_redirect)
            return outlinks
        else:
            if not self._already_fetched(page, ydl_spy):
                self.logger.info('needs fetch: %s', page)
                self._fetch_url(site, page)
            else:
                self.logger.info('already fetched: %s', page)
            return []
Beispiel #6
0
    def _browse_page(self, browser, site, page, on_screenshot=None):
        """
        Open `page` in `browser` and hand back the outlinks it reports.

        During the page load:
          - every screenshot png goes to `on_screenshot` (if given); when
            `self._using_warcprox(site)` is true it is also converted with
            `self.full_and_thumb_jpegs()` and both jpegs are written to
            warcprox as "resource" records
          - every response with a "video/*" mime type and status 200 or 206
            is described in a dict appended to `page.videos`

        The browser is started if it is not running yet. A final url that
        differs from `page.url` is passed to `page.note_redirect()`.
        """
        def _on_screenshot(screenshot_png):
            if on_screenshot:
                on_screenshot(screenshot_png)
            if not self._using_warcprox(site):
                return
            self.logger.info(
                "sending WARCPROX_WRITE_RECORD request to %s with "
                "screenshot for %s", self._proxy_for(site), page)
            full_jpeg, thumb_jpeg = self.full_and_thumb_jpegs(screenshot_png)
            # the full-size image first, then its thumbnail
            for url_template, jpeg in (("screenshot:%s", full_jpeg),
                                       ("thumbnail:%s", thumb_jpeg)):
                self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url=url_template % str(urlcanon.semantic(page.url)),
                    warc_type="resource",
                    content_type="image/jpeg",
                    payload=jpeg,
                    extra_headers=site.extra_headers())

        def _on_response(chrome_msg):
            if not ('params' in chrome_msg
                    and 'response' in chrome_msg['params']):
                return
            resp = chrome_msg['params']['response']
            looks_like_video = (
                'mimeType' in resp
                and resp.get('mimeType', '').startswith('video/'))
            if not (looks_like_video and resp.get('status') in (200, 206)):
                return

            video = {
                'blame': 'browser',
                'url': resp.get('url'),
                'response_code': resp['status'],
                'content-type': resp['mimeType'],
            }
            headers = CaseInsensitiveDict(resp['headers'])
            if 'content-length' in headers:
                video['content-length'] = int(headers['content-length'])
            if 'content-range' in headers:
                video['content-range'] = headers['content-range']
            logging.debug('embedded video %s', video)
            if 'videos' not in page:
                page.videos = []
            page.videos.append(video)

        if not browser.is_running():
            browser.start(
                proxy=self._proxy_for(site), cookie_db=site.get('cookie_db'))

        browse_result = browser.browse_page(
            page.url,
            extra_headers=site.extra_headers(),
            behavior_parameters=site.get('behavior_parameters'),
            username=site.get('username'),
            password=site.get('password'),
            user_agent=site.get('user_agent'),
            on_screenshot=_on_screenshot,
            on_response=_on_response,
            hashtags=page.hashtags,
            skip_extract_outlinks=self._skip_extract_outlinks,
            skip_visit_hashtags=self._skip_visit_hashtags)
        final_page_url, outlinks = browse_result

        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks