def brozzle_page(self, browser, site, page, on_screenshot=None):
    """
    Runs youtube-dl against `page`, then either browses it or fetches it.

    youtube-dl is tried first, inside a temporary directory that is removed
    afterwards. Its "spy" object is then handed to `self._needs_browsing()`
    to decide whether the page has to be loaded in the browser. If
    youtube-dl could not even be set up there is no spy to consult, and the
    page is browsed.

    Args:
        browser: browser to use; started here if it is not already running
        site: site the page belongs to (supplies proxy settings, cookie db,
            extra headers, behavior parameters and user agent)
        page: page to brozzle
        on_screenshot: optional callback receiving the screenshot png; when
            given, it replaces the default handling, which writes screenshot
            and thumbnail records to warcprox

    Returns:
        the outlinks reported by the browser, or an empty list if the page
        was not browsed

    Raises:
        brozzler.ReachedLimit, brozzler.ShutdownRequested: re-raised
            untouched when they come out of the youtube-dl step; any other
            exception from that step is logged and swallowed
    """
    def _on_screenshot(screenshot_png):
        if on_screenshot:
            on_screenshot(screenshot_png)
        elif self._proxy(site) and self._enable_warcprox_features(site):
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request "
                    "to warcprox with screenshot for %s", page)
            screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="screenshot:%s" % brozzler.fixup(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="thumbnail:%s" % brozzler.fixup(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

    self.logger.info("brozzling %s", page)
    # Stays None if youtube-dl setup fails before the spy can be captured;
    # without this default the code below would hit an UnboundLocalError.
    ydl_spy = None
    try:
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = self._youtube_dl(tempdir, site)
            ydl_spy = ydl.brozzler_spy  # remember for later
            self._try_youtube_dl(ydl, site, page)
    except (brozzler.ReachedLimit, brozzler.ShutdownRequested):
        raise
    except Exception as e:
        if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                and hasattr(e.exc_info[1], 'code')
                and e.exc_info[1].code == 430):
            # an underlying error carrying code 430 is only worth an info
            # line, not a stack trace
            self.logger.info(
                    'youtube-dl got %s %s processing %s',
                    e.exc_info[1].code, e.exc_info[1].msg, page.url)
        else:
            self.logger.error(
                    "youtube_dl raised exception on %s", page,
                    exc_info=True)

    # No spy means we know nothing about what youtube-dl fetched, so the
    # only safe choice is to browse the page.
    if ydl_spy is None or self._needs_browsing(page, ydl_spy):
        self.logger.info('needs browsing: %s', page)
        if not browser.is_running():
            browser.start(
                    proxy=self._proxy(site), cookie_db=site.cookie_db)
        final_page_url, outlinks = browser.browse_page(
                page.url, extra_headers=site.extra_headers(),
                behavior_parameters=site.behavior_parameters,
                user_agent=site.user_agent,
                on_screenshot=_on_screenshot)
        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks

    if not self._already_fetched(page, ydl_spy):
        self.logger.info('needs fetch: %s', page)
        self._fetch_url(site, page)
    else:
        self.logger.info('already fetched: %s', page)
    return []
def _brozzle_site(self, browser, ydl, site): start = time.time() page = None try: browser.start(proxy=site.proxy) while not self._shutdown_requested.is_set() and time.time() - start < 60: page = self._frontier.claim_page(site, self._id) outlinks = self.brozzle_page(browser, ydl, site, page) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks(site, page, outlinks) page = None except brozzler.NothingToClaim: self.logger.info("no pages left for site %s", site) except brozzler.ReachedLimit as e: self._frontier.reached_limit(site, e) except brozzler.browser.BrowsingAborted: self.logger.info("{} shut down".format(browser)) except: self.logger.critical("unexpected exception", exc_info=True) finally: self.logger.info("finished session brozzling site, stopping browser and disclaiming site") browser.stop() self._frontier.disclaim_site(site, page) self._browser_pool.release(browser)
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_jpeg): if on_screenshot: on_screenshot(screenshot_jpeg) if self._using_warcprox(site): self.logger.info( "sending WARCPROX_WRITE_RECORD request to %s with " "screenshot for %s", self._proxy_for(site), page) thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="screenshot:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, extra_headers=site.extra_headers()) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="thumbnail:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, extra_headers=site.extra_headers()) def _on_response(chrome_msg): if ('params' in chrome_msg and 'response' in chrome_msg['params'] and 'mimeType' in chrome_msg['params']['response'] and chrome_msg['params']['response'].get('mimeType', '').startswith('video/') # skip manifests of DASH segmented video - # see https://github.com/internetarchive/brozzler/pull/70 and chrome_msg['params']['response']['mimeType'] != 'video/vnd.mpeg.dash.mpd' and chrome_msg['params']['response'].get('status') in (200, 206)): video = { 'blame': 'browser', 'url': chrome_msg['params']['response'].get('url'), 'response_code': chrome_msg['params']['response']['status'], 'content-type': chrome_msg['params']['response']['mimeType'], } response_headers = CaseInsensitiveDict( chrome_msg['params']['response']['headers']) if 'content-length' in response_headers: video['content-length'] = int(response_headers['content-length']) if 'content-range' in response_headers: video['content-range'] = response_headers['content-range'] logging.debug('embedded video %s', video) if not 'videos' in page: page.videos = [] page.videos.append(video) sw_fetched = set() def _on_service_worker_version_updated(chrome_msg): # 
https://github.com/internetarchive/brozzler/issues/140 self.logger.trace('%r', chrome_msg) if chrome_msg.get('params', {}).get('versions'): url = chrome_msg.get('params', {}).get('versions')[0]\ .get('scriptURL') if url and url not in sw_fetched: self.logger.info('fetching service worker script %s', url) self._fetch_url(site, url) sw_fetched.add(url) if not browser.is_running(): browser.start( proxy=self._proxy_for(site), cookie_db=site.get('cookie_db')) final_page_url, outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), behavior_parameters=site.get('behavior_parameters'), username=site.get('username'), password=site.get('password'), user_agent=site.get('user_agent'), on_screenshot=_on_screenshot, on_response=_on_response, on_request=on_request, on_service_worker_version_updated=_on_service_worker_version_updated, hashtags=page.hashtags, skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, simpler404=self._simpler404, screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout, download_throughput=self._download_throughput) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_png): if on_screenshot: on_screenshot(screenshot_png) if self._using_warcprox(site): self.logger.info( "sending WARCPROX_WRITE_RECORD request to %s with " "screenshot for %s", self._proxy_for(site), page) screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( screenshot_png) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="screenshot:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, extra_headers=site.extra_headers()) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="thumbnail:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, extra_headers=site.extra_headers()) def _on_response(chrome_msg): if ('params' in chrome_msg and 'response' in chrome_msg['params'] and 'mimeType' in chrome_msg['params']['response'] and chrome_msg['params']['response'].get('mimeType', '').startswith('video/') # skip manifests of DASH segmented video - # see https://github.com/internetarchive/brozzler/pull/70 and chrome_msg['params']['response']['mimeType'] != 'video/vnd.mpeg.dash.mpd' and chrome_msg['params']['response'].get('status') in (200, 206)): video = { 'blame': 'browser', 'url': chrome_msg['params']['response'].get('url'), 'response_code': chrome_msg['params']['response']['status'], 'content-type': chrome_msg['params']['response']['mimeType'], } response_headers = CaseInsensitiveDict( chrome_msg['params']['response']['headers']) if 'content-length' in response_headers: video['content-length'] = int(response_headers['content-length']) if 'content-range' in response_headers: video['content-range'] = response_headers['content-range'] logging.debug('embedded video %s', video) if not 'videos' in page: page.videos = [] page.videos.append(video) sw_fetched = set() def 
_on_service_worker_version_updated(chrome_msg): # https://github.com/internetarchive/brozzler/issues/140 self.logger.trace('%r', chrome_msg) if chrome_msg.get('params', {}).get('versions'): url = chrome_msg.get('params', {}).get('versions')[0]\ .get('scriptURL') if url and url not in sw_fetched: self.logger.info('fetching service worker script %s', url) self._fetch_url(site, url) sw_fetched.add(url) if not browser.is_running(): browser.start( proxy=self._proxy_for(site), cookie_db=site.get('cookie_db')) final_page_url, outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), behavior_parameters=site.get('behavior_parameters'), username=site.get('username'), password=site.get('password'), user_agent=site.get('user_agent'), on_screenshot=_on_screenshot, on_response=_on_response, on_request=on_request, on_service_worker_version_updated=_on_service_worker_version_updated, hashtags=page.hashtags, skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks
def brozzle_page(self, browser, site, page, on_screenshot=None):
    """
    Runs youtube-dl against `page`, then either browses it or fetches it.

    youtube-dl runs first in a temporary directory. The "spy" object it
    exposes is passed to `self._needs_browsing()` to decide whether the
    page must be loaded in the browser. If youtube-dl could not be set up
    at all, there is no spy to ask and the page is browsed.

    Args:
        browser: browser to use; started here if it is not already running
        site: site the page belongs to (supplies proxy settings, cookie db
            and extra headers)
        page: page to brozzle; its `note_redirect` method is handed to the
            browser as the url-change callback
        on_screenshot: optional callback receiving the screenshot png; when
            given, it replaces the default handling, which writes screenshot
            and thumbnail records to warcprox

    Returns:
        the outlinks reported by the browser, or an empty list if the page
        was not browsed

    Raises:
        brozzler.ReachedLimit: re-raised untouched when it comes out of the
            youtube-dl step; any other exception from that step is logged
            and swallowed
    """
    def _on_screenshot(screenshot_png):
        if on_screenshot:
            on_screenshot(screenshot_png)
        elif self._proxy(site) and self._enable_warcprox_features(site):
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request "
                    "to warcprox with screenshot for %s", page)
            screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="screenshot:{}".format(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="thumbnail:{}".format(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

    self.logger.info("brozzling %s", page)
    # Stays None if youtube-dl setup fails before the spy can be captured;
    # without this default the code below would hit an UnboundLocalError.
    ydl_spy = None
    try:
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = self._youtube_dl(tempdir, site)
            ydl_spy = ydl.brozzler_spy  # remember for later
            self._try_youtube_dl(ydl, site, page)
    except brozzler.ReachedLimit:
        raise
    except Exception as e:
        if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                and hasattr(e.exc_info[1], 'code')
                and e.exc_info[1].code == 430):
            # an underlying error carrying code 430 is only worth an info
            # line, not a stack trace
            self.logger.info(
                    'youtube-dl got %s %s processing %s',
                    e.exc_info[1].code, e.exc_info[1].msg, page.url)
        else:
            self.logger.error(
                    "youtube_dl raised exception on %s", page,
                    exc_info=True)

    # Without a spy nothing is known about what youtube-dl fetched, so
    # fall back to browsing the page.
    if ydl_spy is None or self._needs_browsing(page, ydl_spy):
        self.logger.info('needs browsing: %s', page)
        if not browser.is_running():
            browser.start(
                    proxy=self._proxy(site), cookie_db=site.cookie_db)
        outlinks = browser.browse_page(
                page.url, extra_headers=site.extra_headers(),
                on_screenshot=_on_screenshot,
                on_url_change=page.note_redirect)
        return outlinks

    if not self._already_fetched(page, ydl_spy):
        self.logger.info('needs fetch: %s', page)
        self._fetch_url(site, page)
    else:
        self.logger.info('already fetched: %s', page)
    return []
def _browse_page(self, browser, site, page, on_screenshot=None): def _on_screenshot(screenshot_png): if on_screenshot: on_screenshot(screenshot_png) if self._using_warcprox(site): self.logger.info( "sending WARCPROX_WRITE_RECORD request to %s with " "screenshot for %s", self._proxy_for(site), page) screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( screenshot_png) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="screenshot:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, extra_headers=site.extra_headers()) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="thumbnail:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, extra_headers=site.extra_headers()) def _on_response(chrome_msg): if ('params' in chrome_msg and 'response' in chrome_msg['params'] and 'mimeType' in chrome_msg['params']['response'] and chrome_msg['params']['response'].get( 'mimeType', '').startswith('video/') and chrome_msg['params']['response'].get('status') in (200, 206)): video = { 'blame': 'browser', 'url': chrome_msg['params']['response'].get('url'), 'response_code': chrome_msg['params']['response']['status'], 'content-type': chrome_msg['params']['response']['mimeType'], } response_headers = CaseInsensitiveDict( chrome_msg['params']['response']['headers']) if 'content-length' in response_headers: video['content-length'] = int( response_headers['content-length']) if 'content-range' in response_headers: video['content-range'] = response_headers['content-range'] logging.debug('embedded video %s', video) if not 'videos' in page: page.videos = [] page.videos.append(video) if not browser.is_running(): browser.start(proxy=self._proxy_for(site), cookie_db=site.get('cookie_db')) final_page_url, outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), behavior_parameters=site.get('behavior_parameters'), 
username=site.get('username'), password=site.get('password'), user_agent=site.get('user_agent'), on_screenshot=_on_screenshot, on_response=_on_response, hashtags=page.hashtags, skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks