Ejemplo n.º 1
0
    async def goto(
        self, url: str, wait: str = "load", *args: Any, **kwargs: Any
    ) -> NavigationResult:
        """Navigate the main frame to the supplied URL and report what
        the crawler should do next.

        :param url: The URL of the page to navigate to
        :param wait: The wait condition forwarded to the main frame's
        goto as ``waitUntil``
        :param args: Accepted but not used by this method
        :param kwargs: Accepted but not used by this method
        :return: A NavigationResult indicating the next action of the crawler:
        EXIT_CRAWL_LOOP when the connection closed or an unknown error occurred,
        SKIP_URL when navigation failed without a timeout or a response,
        otherwise whatever ``_determine_navigation_result`` decides
        """
        self._url = url
        logged_method = "goto"
        try:
            response = await self.frames.mainFrame.goto(
                url, waitUntil=wait, timeout=self._navigation_timeout
            )
            self.set_timestamp_from_response(response)
            # only describe the response when the navigation produced one
            if response is None:
                info = Helper.json_string(url=url)
            else:
                info = Helper.json_string(
                    url=url,
                    responseURL=response.url,
                    status=response.status,
                    mime=response.mimeType,
                )
            self.logger.info(logged_method, f"we navigated to the page - {info}")
            self.frontier.crawling_new_page(self.main_frame.url)
            return self._determine_navigation_result(response)
        except NavigationError as nav_error:
            if nav_error.disconnected:
                # without a connection there is nothing left to crawl with
                self.logger.critical(
                    logged_method,
                    f"connection closed while navigating to {url}",
                    exc_info=nav_error,
                )
                return NavigationResult.EXIT_CRAWL_LOOP
            if not nav_error.timeout and nav_error.response is None:
                # neither a timeout nor a response to inspect: give up on this URL
                self.logger.exception(
                    logged_method, f"navigation failed for {url}", exc_info=nav_error
                )
                return NavigationResult.SKIP_URL
            return self._determine_navigation_result(nav_error.response)
        except Exception as unknown_error:
            self.logger.exception(
                logged_method,
                f"unknown error while navigating to {url}",
                exc_info=unknown_error,
            )
            return NavigationResult.EXIT_CRAWL_LOOP
Ejemplo n.º 2
0
    def __init__(
        self,
        behavior_js: str,
        tab: Tab,
        next_action_expression: str,
        loop: Optional[AbstractEventLoop] = None,
        collect_outlinks: bool = False,
        post_run_actions: bool = False,
        frame: Optional[Union[Frame, Callable[[], Frame]]] = None,
    ) -> None:
        """Set up a WRBehaviorRunner for a single behavior

        :param behavior_js: The JS of the behavior
        :param tab: The tab the behavior's JS will be run in
        :param next_action_expression: The JS expression used to initiate
        an action of the behavior
        :param loop: The event loop used by the automation
        :param collect_outlinks: Should outlinks be collected after actions
        :param post_run_actions: Should the post run actions (per the original
        documentation, a screenshot) be performed once the behavior is done
        :param frame: Optional frame, or callable returning the frame,
        that the behavior is to be run in
        """
        # what is run and where it is run
        self.tab: Tab = tab
        self.frame: Optional[Union[Frame, Callable[[], Frame]]] = frame
        self.behavior_js: str = behavior_js
        self.next_action_expression: str = next_action_expression

        # configuration switches
        self.post_run_actions: bool = post_run_actions
        self.collect_outlinks: bool = collect_outlinks

        # infrastructure
        self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
        self.logger: AutoLogger = create_autologger(
            "behaviorRunner", "WRBehaviorRunner"
        )

        # run state, nothing has happened yet
        self._num_actions_performed: int = 0
        self._running_task: Optional[Task] = None
        self._did_init: bool = False
        self._paused: bool = False
        self._done: bool = False
Ejemplo n.º 3
0
 def __init__(
     self,
     browser: Browser,
     tab_data: Dict[str, str],
     redis: Optional[Redis] = None,
     session: Optional[ClientSession] = None,
     *args: Any,
     **kwargs: Any,
 ) -> None:
     """Initialize the new tab instance

     :param browser: The browser this tab belongs to, its loop is handed
     to the super class
     :param tab_data: Information about the tab, must contain
     the "url" and "id" keys
     :param redis: Optional redis instance
     :param session: Optional HTTP client session
     :param args: Accepted but not used by this initializer
     :param kwargs: Accepted but not used by this initializer
     """
     super().__init__(loop=Helper.ensure_loop(browser.loop))
     self.browser: Browser = browser
     self.redis = redis
     self.session = session
     self.tab_data: Dict[str, str] = tab_data
     # no client exists until one is created later
     self.client: Optional[Client] = None
     self.logger: AutoLogger = create_autologger("tabs",
                                                 self.__class__.__name__)
     self._url: str = self.tab_data["url"]
     self._id: str = self.tab_data["id"]
     # starts out unset, hence Optional rather than plain str
     self._timestamp: Optional[str] = None
     self._behaviors_paused: bool = False
     self._connection_closed: bool = False
     self._running: bool = False
     self._reconnecting: bool = False
     self._graceful_shutdown: bool = False
     self._default_handling_of_dialogs: bool = True
     self._behavior_run_task: Optional[Task] = None
     self._reconnect_promise: Optional[Task] = None
     self._running_behavior: Optional[Behavior] = None
     self._close_reason: Optional[CloseReason] = None
     self._viewport: Optional[Dict] = None
Ejemplo n.º 4
0
    async def _extract_href_from_remote_node(
        self, node: Dict, outlink_accum: List[str]
    ) -> None:
        """Converts the supplied node to its runtime object and retrieves the value
        of calling the href property getter on the node, adding the value of the href
        to the supplied out link accumulator if it has a crawlable scheme e.g. http(s)

        The runtime object is released even when calling the getter fails, and
        a missing or empty href value is skipped rather than scheme-checked.

        :param node: A node dict returned by DOM.getFlattenedDocument,
        must contain the "nodeId" key
        :param outlink_accum: A list used to accumulate valid out links
        """
        # the supplied node dictionary represents the dom node as is
        # i.e. any attributes listed in that dictionary are not resolved
        # according to the browser's attribute resolution algorithm
        # hence the need to resolve (convert the node to a runtime DOM object)
        # and call the node's getter for the href attribute
        runtime_node = await self.client.DOM.resolveNode(nodeId=node["nodeId"])
        obj_id = runtime_node["object"]["objectId"]
        try:
            results = await self.client.Runtime.callFunctionOn(
                self.href_fn, objectId=obj_id
            )
        finally:
            # always hand the remote object back, a failed call must not leak it
            await self.client.Runtime.releaseObject(objectId=obj_id)
        # the url here is fully resolved against the origin it exists in
        # thus safe for usage in programmatic navigation
        url = results.get("result", {}).get("value")
        # the result may carry no value at all, nothing to check in that case
        if url and Helper.url_has_crawlable_scheme(url):
            outlink_accum.append(url)
Ejemplo n.º 5
0
    def __init__(self,
                 conf: AutomationConfig,
                 loop: Optional[AbstractEventLoop] = None) -> None:
        """Create a new driver

        :param conf: The automation configuration object
        :param loop: The event loop to be used
        """
        self.conf: AutomationConfig = conf
        self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
        self.did_init: bool = False
        self.shutdown_condition: ShutdownCondition = ShutdownCondition(
            loop=self.loop)
        # use the ensured loop, like every other collaborator created here,
        # rather than the raw argument which may be None
        self.session: ClientSession = Helper.create_aio_http_client_session(
            self.loop)
        self.behavior_manager: RemoteBehaviorManager = RemoteBehaviorManager(
            conf=self.conf, session=self.session, loop=self.loop)
        # no redis connection is made in the initializer
        self.redis: Optional[Redis] = None
        self.logger: AutoLogger = create_autologger("drivers",
                                                    self.__class__.__name__)
        self._browser_exit_infos: List[BrowserExitInfo] = []
Ejemplo n.º 6
0
    def __init__(self, loop: Optional[AbstractEventLoop] = None) -> None:
        """Set up the shutdown condition and register its signal handlers

        :param loop: The event loop used by the automation
        """
        self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
        self._shutdown_from_signal: bool = False
        self._shutdown_event: Event = Event(loop=self.loop)

        # both signals share one handler, SIGINT is here for local debugging
        for shutdown_signal in (SIGINT, SIGTERM):
            self.loop.add_signal_handler(
                shutdown_signal, self._initiate_shutdown_signal
            )
Ejemplo n.º 7
0
    async def close(self) -> None:
        """End the crawl loop, record that the crawl is done and close.

        When not shutting down gracefully the running behavior is ended and
        the crawl loop task is canceled (waiting at most 15 seconds),
        otherwise the crawl loop task is awaited until it ends on its own.
        Afterwards the end information is pushed to the ``auto_done`` redis
        key and the super class's close is awaited.
        """
        logged_method = "close"
        self.logger.info(
            logged_method, f"closing {'gracefully' if self._graceful_shutdown else ''}"
        )
        force_close = not self._graceful_shutdown

        if force_close and self._running_behavior is not None:
            self.logger.info(logged_method, "ending the running behavior")
            self._running_behavior.end()

        if self._crawl_loop_running():
            if force_close:
                status_message = "canceling the crawl loop task"
            else:
                status_message = "waiting for the crawl loop task to end gracefully"
            self.logger.info(logged_method, status_message)
            try:
                if not force_close:
                    await self.crawl_loop_task
                else:
                    await Helper.timed_future_completion(
                        self.crawl_loop_task,
                        timeout=15,
                        cancel=force_close,
                        loop=self.loop,
                    )
            except Exception as loop_error:
                # a failing crawl loop must not prevent the rest of the close
                self.logger.exception(
                    logged_method,
                    "the crawl loop threw an unexpected exception while waiting for it to end",
                    exc_info=loop_error,
                )

        # the timestamp is taken only after the crawl loop is dealt with
        end_info = Helper.json_string(id=self.reqid, time=int(time.time()))
        self.logger.info(logged_method, f"crawl loop task ended - {end_info}")

        # re-read the flag, it is checked again after the awaits above
        if self._graceful_shutdown:
            await self.frontier.remove_current_from_pending()

        await self.navigation_reset()
        self.crawl_loop_task = None

        frontier_is_empty = await self.frontier.exhausted()
        if frontier_is_empty and self._close_reason is None:
            # nobody gave a reason and nothing is left: the crawl simply ended
            self._close_reason = CloseReason.CRAWL_END

        await self.redis.lpush(self.config.redis_keys.auto_done, end_info)
        await super().close()
Ejemplo n.º 8
0
    def __init__(
        self,
        conf: AutomationConfig,
        session: ClientSession,
        loop: Optional[AbstractEventLoop] = None,
    ) -> None:
        """Set up a RemoteBehaviorManager

        :param conf: The config of the automation
        :param session: The HTTP session used for making the behavior requests
        :param loop: The event loop for the automation
        """
        self.session: ClientSession = session
        self.conf: AutomationConfig = conf
        self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
        self.logger: AutoLogger = create_autologger(
            "remoteBehaviorManager", "RemoteBehaviorManager"
        )
Ejemplo n.º 9
0
    async def _post_action(self) -> None:
        """Performs the work configured to happen after a behavior's action.

        Available post run actions:
         - Out link collection
        """
        logged_method = "post action"
        # the count is logged before this action is added to it
        action_count_info = Helper.json_string(
            action_count=self._num_actions_performed
        )
        self.logger.debug(logged_method, action_count_info)
        self._num_actions_performed += 1
        # Out links are only collected when configured to do so and only on
        # every 10th action. Collecting after every action (the previous
        # handling) degrades the performance of running a behavior on pages
        # with lots of out links (10k+).
        if not self.collect_outlinks or self._num_actions_performed % 10 != 0:
            return
        self.logger.debug(logged_method, "collecting outlinks")
        await self.tab.collect_outlinks()
Ejemplo n.º 10
0
    async def add(self, url: str, depth: int) -> bool:
        """Adds a URL to the frontier when it qualifies.

        A URL qualifies when it is in scope, is not an inner page link
        and has not been seen before. Inner page links are not queued,
        they are added to the inner page links set instead.

        :param url: The URL to maybe add to the frontier
        :param depth: The depth the URL is to be crawled at
        :return: T/F indicating if the URL @ depth was added to the frontier
        """
        logged_method = "add"
        url_info = Helper.json_string(url=url, depth=depth, page=self.scope.current_page)

        # 1. out of scope URLs are rejected outright
        if not self.scope.in_scope(url):
            self.logger.info(
                logged_method,
                f"Not adding URL to the frontier, not in scope - {url_info}",
            )
            return False

        # 2. inner page links are remembered but never queued
        if self.scope.is_inner_page_link(url):
            await self.redis.sadd(self.keys.inner_page_links, url)
            self.logger.info(
                logged_method,
                f"Not adding URL to the frontier, inner page link - {url_info}",
            )
            return False

        # 3. adding to the seen set reports 0 when the URL was already in it
        newly_seen_count = await self.redis.sadd(self.keys.seen, url)
        if newly_seen_count == 0:
            self.logger.info(
                logged_method, f"Not adding URL to the frontier, seen - {url_info}"
            )
            return False

        # 4. a fresh URL, queue its info for crawling
        await self.redis.rpush(self.keys.queue, url_info)
        self.logger.info(logged_method, f"Added URL to the frontier - {url_info}")
        return True
Ejemplo n.º 11
0
    def __init__(
        self,
        redis: Redis,
        config: AutomationConfig,
        loop: Optional[AbstractEventLoop] = None,
    ):
        """Set up a RedisFrontier

        :param redis: The redis instance to be used
        :param config: The automation config, its redis keys are used
        by the frontier and its scope
        :param loop: The event loop used by the automation
        """
        # supplied collaborators and what is derived from them
        self.redis: Redis = redis
        self.config: AutomationConfig = config
        self.keys: RedisKeys = self.config.redis_keys

        # crawl state, nothing is being crawled yet
        self.currently_crawling: Optional[Dict[str, Union[str, int]]] = None
        self.crawl_depth: int = -1
        self._did_wait: bool = False

        # infrastructure
        self.logger: AutoLogger = create_autologger("frontier", "RedisFrontier")
        self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
        self.scope: RedisScope = RedisScope(self.redis, self.keys)
Ejemplo n.º 12
0
 def __init__(
     self,
     config: AutomationConfig,
     behavior_manager: BehaviorManager,
     session: Optional[ClientSession] = None,
     redis: Optional[Redis] = None,
     loop: Optional[AbstractEventLoop] = None,
 ) -> None:
     """Initialize the new browser instance

     :param config: The configuration of this automation
     :param behavior_manager: The behavior manager to be used
     :param session: Optional HTTP client session to use
     :param redis: Optional instance of redis to use
     :param loop: Optional reference to the running event loop
     """
     super().__init__(loop=Helper.ensure_loop(loop))
     # starts out unset, hence Optional rather than a plain list
     self.tab_datas: Optional[List[Dict]] = None
     self.redis: Optional[Redis] = redis
     self.session: Optional[ClientSession] = session
     self.tabs: Dict[str, Tab] = {}
     self.tab_closed_reasons: Dict[str, TabClosedInfo] = {}
     self.running: bool = False
     self.logger: AutoLogger = create_autologger("chrome_browser", "Chrome")
     self._config: AutomationConfig = config
     self._behavior_manager: BehaviorManager = behavior_manager
Ejemplo n.º 13
0
    async def evaluate_in_page(self,
                               js_string: str,
                               contextId: Optional[Any] = None) -> Any:
        """Runs the supplied string of JavaScript in the tab

        :param js_string: The string of JavaScript to be evaluated
        :param contextId: Optional execution context id forwarded
        to Runtime.evaluate
        :return: The value of the evaluation if any, ``{"done": True}`` when
        the evaluation raised a python exception, or ``{}`` when the
        evaluated JavaScript threw
        """
        logged_method = "evaluate_in_page"
        self.logger.debug(logged_method, "evaluating js in page")
        try:
            evaluation = await self.client.Runtime.evaluate(
                js_string,
                contextId=contextId,
                userGesture=True,
                awaitPromise=True,
                includeCommandLineAPI=True,
                returnByValue=True,
            )
        except Exception as e:
            # cancellation is not worth logging, anything else is
            was_cancelled = isinstance(e, CancelledError)
            if not was_cancelled:
                self.logger.exception(
                    logged_method,
                    "evaluating js in page failed due to an python error",
                    exc_info=e,
                )
            return {"done": True}

        exception_details = evaluation.get("exceptionDetails")
        if not exception_details:
            # no JS exception, hand back the evaluated value
            return evaluation.get("result", {}).get("value")

        jse_dets = Helper.getExceptionMessage(exception_details)
        self.logger.critical(
            logged_method,
            f"evaluating js in page failed due to an JS error - {jse_dets}",
        )
        return {}