Beispiel #1
0
 def delay_context_exit():
     gate = brozzler.thread_accept_exceptions()
     orig_exit = type(gate).__exit__
     try:
         type(gate).__exit__ = lambda self, et, ev, t: (
                 brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
         with brozzler.thread_accept_exceptions() as gate:
             brozzler.sleep(2)
     except Exception as e:
         nonlocal thread_caught_exception
         thread_caught_exception = e
     finally:
         type(gate).__exit__ = orig_exit
Beispiel #2
0
 def delay_context_exit():
     gate = brozzler.thread_accept_exceptions()
     orig_exit = type(gate).__exit__
     try:
         type(gate).__exit__ = lambda self, et, ev, t: (
                 brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
         with brozzler.thread_accept_exceptions() as gate:
             brozzler.sleep(2)
     except Exception as e:
         nonlocal thread_caught_exception
         thread_caught_exception = e
     finally:
         type(gate).__exit__ = orig_exit
Beispiel #3
0
    def two_with_blocks():
        try:
            with brozzler.thread_accept_exceptions():
                time.sleep(2)
            return # test fails
        except Exception1 as e:
            pass
        except:
            return # fail test

        try:
            with brozzler.thread_accept_exceptions():
                brozzler.sleep(2)
        except Exception as e:
            nonlocal thread_caught_exception
            thread_caught_exception = e
Beispiel #4
0
 def accept_immediately():
     try:
         with brozzler.thread_accept_exceptions():
             brozzler.sleep(2)
     except Exception as e:
         nonlocal thread_caught_exception
         thread_caught_exception = e
Beispiel #5
0
    def two_with_blocks():
        try:
            with brozzler.thread_accept_exceptions():
                time.sleep(2)
            return # test fails
        except Exception1 as e:
            pass
        except:
            return # fail test

        try:
            with brozzler.thread_accept_exceptions():
                brozzler.sleep(2)
        except Exception as e:
            nonlocal thread_caught_exception
            thread_caught_exception = e
Beispiel #6
0
 def accept_immediately():
     try:
         with brozzler.thread_accept_exceptions():
             brozzler.sleep(2)
     except Exception as e:
         nonlocal thread_caught_exception
         thread_caught_exception = e
Beispiel #7
0
 def accept_eventually():
     try:
         brozzler.sleep(2)
         with brozzler.thread_accept_exceptions():
             pass
     except Exception as e:
         nonlocal thread_caught_exception
         thread_caught_exception = e
Beispiel #8
0
 def accept_eventually():
     try:
         brozzler.sleep(2)
         with brozzler.thread_accept_exceptions():
             pass
     except Exception as e:
         nonlocal thread_caught_exception
         thread_caught_exception = e
Beispiel #9
0
    def _try_youtube_dl(self, ydl, site, page):
        """
        Runs youtube-dl extraction on `page.url` and records the outcome.

        After extraction, videos are remembered on the page via
        `self._remember_videos`. If warcprox is in use for `site`, the
        extracted info is serialized to json and sent to warcprox as a
        "metadata" record.

        Args:
            ydl: youtube-dl object providing `extract_info()` and a
                `brozzler_spy` attribute
            site: site being crawled, used to look up the proxy and the extra
                headers
            page: page whose `url` is handed to youtube-dl

        Returns:
            None

        Raises:
            brozzler.ShutdownRequested: re-raised untouched
            brozzler.ReachedLimit: if the exception carries an `exc_info`
                whose type is `urllib.error.HTTPError` with code 420
            brozzler.ProxyError: if the exception carries an `exc_info` whose
                type is `urllib.error.URLError` and a proxy is configured for
                the site
            Any other exception is re-raised, except one whose `exc_info`
            type is `youtube_dl.utils.UnsupportedError`, which is swallowed.
        """
        try:
            self.logger.info("trying youtube-dl on {}".format(page))

            with brozzler.thread_accept_exceptions():
                # we do whatwg canonicalization here to avoid "<urlopen error
                # no host given>" resulting in ProxyError
                # needs automated test
                info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            self._remember_videos(page, ydl.brozzler_spy)
            if not self._using_warcprox(site):
                return
            info_json = json.dumps(info, sort_keys=True, indent=4)
            self.logger.info(
                "sending WARCPROX_WRITE_RECORD request to warcprox "
                "with youtube-dl json for %s", page)
            self._warcprox_write_record(
                warcprox_address=self._proxy_for(site),
                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                warc_type="metadata",
                content_type=(
                    "application/vnd.youtube-dl_formats+json;charset=utf-8"),
                payload=info_json.encode("utf-8"),
                extra_headers=site.extra_headers())
        except brozzler.ShutdownRequested:
            raise
        except BaseException as exc:
            # only exceptions that carry an exc_info get special treatment
            if not hasattr(exc, "exc_info"):
                raise
            wrapped_type = exc.exc_info[0]
            if wrapped_type == youtube_dl.utils.UnsupportedError:
                return
            if (wrapped_type == urllib.error.HTTPError
                    and hasattr(exc.exc_info[1], "code")
                    and exc.exc_info[1].code == 420):
                raise brozzler.ReachedLimit(exc.exc_info[1])
            if (wrapped_type == urllib.error.URLError
                    and self._proxy_for(site)):
                # connection problem when using a proxy == proxy error (XXX?)
                raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from exc
            raise
Beispiel #10
0
def _try_youtube_dl(worker, ydl, site, page):
    """
    Runs yt-dlp extraction on `page.url` and records the outcome.

    After extraction, videos are remembered on the page via
    `_remember_videos`. If warcprox is in use for `site`, the sanitized
    extraction result is serialized to json and sent to warcprox as a
    "metadata" record.

    Args:
        worker: object providing `_using_warcprox()`, `_proxy_for()` and
            `_warcprox_write_record()`
        ydl: yt-dlp object providing `extract_info()`, `sanitize_info()`,
            `fetch_spy.fetches` and `stitch_ups`
        site: site being crawled, used to look up the proxy and the extra
            headers
        page: page whose `url` is handed to yt-dlp

    Returns:
        The sanitized extraction result, or None if the exception raised
        carries an `exc_info` whose type is
        `youtube_dl.utils.UnsupportedError`.

    Raises:
        brozzler.ShutdownRequested: re-raised untouched
        brozzler.ReachedLimit: if the exception carries an `exc_info` whose
            type is `urllib.error.HTTPError` with code 420
        brozzler.ProxyError: if the exception carries an `exc_info` whose
            type is `urllib.error.URLError` and a proxy is configured for the
            site
        Any other exception is re-raised as is.
    """
    try:
        logging.info("trying yt-dlp on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(
                    ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if not worker._using_warcprox(site):
            return ie_result
        info_json = json.dumps(ie_result, sort_keys=True, indent=4)
        logging.info(
                "sending WARCPROX_WRITE_RECORD request to warcprox "
                "with yt-dlp json for %s", page)
        worker._warcprox_write_record(
                warcprox_address=worker._proxy_for(site),
                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                warc_type="metadata",
                content_type=(
                    "application/vnd.youtube-dl_formats+json;charset=utf-8"),
                payload=info_json.encode("utf-8"),
                extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested:
        raise
    except Exception as exc:
        # only exceptions that carry an exc_info get special treatment
        if not hasattr(exc, "exc_info"):
            raise
        wrapped_type = exc.exc_info[0]
        if wrapped_type == youtube_dl.utils.UnsupportedError:
            return None
        if (wrapped_type == urllib.error.HTTPError
                and hasattr(exc.exc_info[1], "code")
                and exc.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(exc.exc_info[1])
        if (wrapped_type == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from exc
        raise
Beispiel #11
0
def _try_youtube_dl(worker, ydl, site, page):
    """
    Runs youtube-dl extraction on `page.url` and records the outcome.

    After extraction, videos are remembered on the page via
    `_remember_videos`. If warcprox is in use for `site`, the extraction
    result is serialized to json and sent to warcprox as a "metadata"
    record.

    Args:
        worker: object providing `_using_warcprox()`, `_proxy_for()` and
            `_warcprox_write_record()`
        ydl: youtube-dl object providing `extract_info()`,
            `fetch_spy.fetches` and `stitch_ups`
        site: site being crawled, used to look up the proxy and the extra
            headers
        page: page whose `url` is handed to youtube-dl

    Returns:
        The extraction result, or None if the exception raised carries an
        `exc_info` whose type is `youtube_dl.utils.UnsupportedError`.

    Raises:
        brozzler.ShutdownRequested: re-raised untouched
        brozzler.ReachedLimit: if the exception carries an `exc_info` whose
            type is `urllib.error.HTTPError` with code 420
        brozzler.ProxyError: if the exception carries an `exc_info` whose
            type is `urllib.error.URLError` and a proxy is configured for the
            site
        Any other exception is re-raised as is.
    """
    try:
        logging.info("trying youtube-dl on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if not worker._using_warcprox(site):
            return ie_result
        info_json = json.dumps(ie_result, sort_keys=True, indent=4)
        logging.info(
                "sending WARCPROX_WRITE_RECORD request to warcprox "
                "with youtube-dl json for %s", page)
        worker._warcprox_write_record(
                warcprox_address=worker._proxy_for(site),
                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                warc_type="metadata",
                content_type=(
                    "application/vnd.youtube-dl_formats+json;charset=utf-8"),
                payload=info_json.encode("utf-8"),
                extra_headers=site.extra_headers())
        return ie_result
    except brozzler.ShutdownRequested:
        raise
    except Exception as exc:
        # only exceptions that carry an exc_info get special treatment
        if not hasattr(exc, "exc_info"):
            raise
        wrapped_type = exc.exc_info[0]
        if wrapped_type == youtube_dl.utils.UnsupportedError:
            return None
        if (wrapped_type == urllib.error.HTTPError
                and hasattr(exc.exc_info[1], "code")
                and exc.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(exc.exc_info[1])
        if (wrapped_type == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from exc
        raise
Beispiel #12
0
    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None, on_screenshot=None,
            username=None, password=None, hashtags=None,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
        '''
        Browses a single page in the already-running browser.

        start() should have been called beforehand. In order, this configures
        the browser, navigates to page_url, optionally tries to log in,
        optionally takes a screenshot, runs the behavior script, extracts
        outlinks and visits hashtags.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: directory containing behaviors.yaml and JS
                templates (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time the
                # screenshot was taken, and the raw jpeg bytes (default None)
            username: passed to try_login() together with password
                (default None)
            password: if truthy, a login is attempted after the first
                navigation (default None)
            hashtags: passed through to visit_hashtags() (default None)
            skip_extract_outlinks: if true, outlinks is an empty list and
                extract_outlinks() is not called (default False)
            skip_visit_hashtags: if true, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted but not used by this method
                (default False)
            page_timeout: timeout handed to navigate_to_page() and
                try_login() (default 300)
            behavior_timeout: timeout handed to run_behavior() (default 900)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: a list of navigational links extracted from the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            BrowsingException: if the browser is not running, is already
                browsing a page, or browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')

        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response

        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                        extra_headers=extra_headers, user_agent=user_agent)
                self.navigate_to_page(page_url, timeout=page_timeout)

                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    url_sans_fragment = self.url().split('#')[0]
                    if page_url != url_sans_fragment:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)

                if on_screenshot:
                    self._try_screenshot(on_screenshot)

                script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(script, timeout=behavior_timeout)

                outlinks = (
                        [] if skip_extract_outlinks
                        else self.extract_outlinks())
                if not skip_visit_hashtags:
                    self.visit_hashtags(self.url(), hashtags, outlinks)

                return self.url(), outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as exc:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(exc)
        finally:
            # leave the browser ready for the next browse_page() call
            self.is_browsing = False
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None
Beispiel #13
0
    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None,
            on_service_worker_version_updated=None, on_screenshot=None,
            username=None, password=None, hashtags=None,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_service_worker_version_updated: callback to invoke on every
                ServiceWorker.workerVersionUpdated event, takes one argument,
                the json-decoded message (default None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time the
                # screenshot was taken, and the raw jpeg bytes (default None)
            username: username string to use to try logging in if a login form
                is found in the page (default None)
            password: password string to use to try logging in if a login form
                is found in the page (default None)
            hashtags: passed through to visit_hashtags() (default None)
            skip_extract_outlinks: if true, outlinks is an empty list and
                extract_outlinks() is not called (default False)
            skip_visit_hashtags: if true, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted but not used by this method
                (default False)
            page_timeout: timeout handed to navigate_to_page() and
                try_login() (default 300)
            behavior_timeout: timeout handed to run_behavior() (default 900)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: a list of navigational links extracted from the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        if on_service_worker_version_updated:
            self.websock_thread.on_service_worker_version_updated = \
                    on_service_worker_version_updated
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                        extra_headers=extra_headers,
                        user_agent=user_agent)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                if on_screenshot:
                    self._try_screenshot(on_screenshot)
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(behavior_script, timeout=behavior_timeout)
                if skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks()
                if not skip_visit_hashtags:
                    self.visit_hashtags(self.url(), hashtags, outlinks)
                final_page_url = self.url()
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            # chain explicitly so the websocket error is the recorded cause
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False
            # clear every per-call callback installed above, so that none of
            # them outlives this call and fires for a later page
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None
            # NOTE(review): assumes websock_thread treats None as "no
            # callback" for this hook, as it does for the two above - confirm
            self.websock_thread.on_service_worker_version_updated = None
Beispiel #14
0
    def browse_page(self,
                    page_url,
                    extra_headers=None,
                    user_agent=None,
                    behavior_parameters=None,
                    behaviors_dir=None,
                    on_request=None,
                    on_response=None,
                    on_service_worker_version_updated=None,
                    on_screenshot=None,
                    username=None,
                    password=None,
                    hashtags=None,
                    screenshot_full_page=False,
                    skip_extract_outlinks=False,
                    skip_visit_hashtags=False,
                    skip_youtube_dl=False,
                    simpler404=False,
                    page_timeout=300,
                    behavior_timeout=900,
                    extract_outlinks_timeout=60,
                    download_throughput=-1,
                    stealth=False):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_service_worker_version_updated: callback to invoke on every
                ServiceWorker.workerVersionUpdated event, takes one argument,
                the json-decoded message (default None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time the
                # screenshot was taken, and the raw jpeg bytes (default None)
            username: username string to use to try logging in if a login form
                is found in the page (default None)
            password: password string to use to try logging in if a login form
                is found in the page (default None)
            hashtags: passed through to visit_hashtags() (default None)
            screenshot_full_page: passed through to _try_screenshot()
                (default False)
            skip_extract_outlinks: if true, outlinks is an empty list and
                extract_outlinks() is not called (default False)
            skip_visit_hashtags: if true, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted but not used by this method
                (default False)
            simpler404: if true, and the page status recorded by the websock
                thread is None or >= 400, behaviors, outlink extraction and
                hashtag visiting are skipped (default False)
            page_timeout: timeout handed to navigate_to_page() and
                try_login() (default 300)
            behavior_timeout: timeout handed to run_behavior(); behaviors are
                only run if this is greater than 0 (default 900)
            extract_outlinks_timeout: timeout handed to extract_outlinks()
                (default 60)
            download_throughput: passed through to configure_browser()
                (default -1)
            stealth: passed through to configure_browser() (default False)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar after behaviors have
                run, which could be different from the original page url if
                the page redirects, javascript has changed the url in the
                location bar, etc
            outlinks: a list of navigational links extracted from the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        if on_service_worker_version_updated:
            self.websock_thread.on_service_worker_version_updated = \
                    on_service_worker_version_updated
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(extra_headers=extra_headers,
                                       user_agent=user_agent,
                                       download_throughput=download_throughput,
                                       stealth=stealth)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                # If the target page HTTP status is 4xx/5xx, there is no point
                # in running behaviors, outlink and hashtag extraction as we
                # didn't get a valid page. Screenshot should run because it
                # may be useful to have a picture of the error page.
                # This is only enabled with option `simpler404`.
                run_behaviors = True
                if simpler404 and (self.websock_thread.page_status is None
                                   or self.websock_thread.page_status >= 400):
                    run_behaviors = False

                if run_behaviors and behavior_timeout > 0:
                    behavior_script = brozzler.behavior_script(
                        page_url,
                        behavior_parameters,
                        behaviors_dir=behaviors_dir)
                    self.run_behavior(behavior_script,
                                      timeout=behavior_timeout)
                final_page_url = self.url()
                if on_screenshot:
                    self._try_screenshot(on_screenshot, screenshot_full_page)
                if not run_behaviors or skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks(
                        timeout=extract_outlinks_timeout)
                if run_behaviors and not skip_visit_hashtags:
                    self.visit_hashtags(final_page_url, hashtags, outlinks)
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            # chain explicitly so the websocket error is the recorded cause
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False
            # clear every per-call callback installed above, so that none of
            # them outlives this call and fires for a later page
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None
            # NOTE(review): assumes websock_thread treats None as "no
            # callback" for this hook, as it does for the two above - confirm
            self.websock_thread.on_service_worker_version_updated = None