Example #1
0
    def navigate(self, url):
        """Open ``url`` in the first browser tab and validate the loaded page.

        Steps, in order:
          1. once more than 100 URLs have been processed, restart the driver
             executable (``stop_executable`` + ``start_executable``) and
             reset the counter;
          2. close every tab except the first and switch to it;
          3. load ``url`` with ``self.page_load_timeout`` as the page-load
             timeout;
          4. call ``self.check_http_code(url)`` on the result.

        :param url: address to load.
        :raises THttpRequester.RobotHttpException: with code 520 when the
            driver raises ``IndexError`` while loading; the original error is
            attached as ``__cause__``.
        :raises TimeoutException: when loading timed out and the page has no
            title.
        """
        # Restart the executable periodically to reduce memory usage.
        if self.driver_processed_urls_count > 100:
            self.stop_executable()
            self.start_executable()
            self.driver_processed_urls_count = 0
        self.driver_processed_urls_count += 1

        # Leave only one window tab, close the other tabs.
        self.close_not_first_tab()
        self.logger.debug("selenium navigate to {}, window tabs count={}".format(
            url, len(self.the_driver.window_handles)))
        self.the_driver.switch_to.window(self.the_driver.window_handles[0])

        # Navigation.
        try:
            self.the_driver.set_page_load_timeout(self.page_load_timeout)
            self.the_driver.get(url)
        except IndexError as exp:
            # Chain explicitly so the traceback shows the original failure
            # as the direct cause of the HTTP-level error.
            raise THttpRequester.RobotHttpException(
                "general IndexError inside urllib.request.urlopen",
                url, 520, "GET") from exp
        except TimeoutException:
            # A timed-out page that already exposes a title is accepted as
            # loaded; an empty (or missing) title re-raises the timeout.
            if not self.the_driver.title:
                raise
        self.check_http_code(url)
Example #2
0
 def check_http_code(self, url):
     """Raise an HTTP error when the loaded page looks like a bare error page.

     The check only applies to pages whose source is shorter than 700
     characters and whose stripped title starts with ``3``, ``4`` or ``5``.
     The title is split at the first space into a code and a message; the
     error is raised only when the code is a key of
     ``POPULAR_ERROR_HTTP_CODES`` and the message equals the value stored
     under that key.

     :param url: address that was loaded; passed through to the exception.
     :raises THttpRequester.RobotHttpException: when the title matches a
         known "<code> <message>" pair. The code is passed as an ``int``,
         like the other raise sites that pass numeric codes (520, 404).
     """
     if len(self.the_driver.page_source) >= 700:
         return
     title = self.the_driver.title.strip()
     if not title.startswith(('3', '4', '5')):
         return
     # partition() yields the first word and the untouched remainder, i.e.
     # the same pair as split(' ')[0] and ' '.join(split(' ')[1:]).
     code, _, message = title.partition(' ')
     if code in POPULAR_ERROR_HTTP_CODES and message == POPULAR_ERROR_HTTP_CODES[code]:
         # Keep the raw key as a fallback if it is unexpectedly non-numeric.
         http_code = int(code) if code.isdecimal() else code
         raise THttpRequester.RobotHttpException(message, url, http_code, "GET")
Example #3
0
 def recognize_protocol_and_www_selenium(self):
     """Find a reachable variant of ``self.input_site_url`` using selenium.

     Every URL produced by ``urllib_parse_pro.get_url_modifications`` is
     tried in order. For the first one that loads without a
     ``WebDriverException``, the method waits 3 seconds, reads the title,
     page source and current URL from the driver, and hands them to
     ``init_main_page_url_from_redirected_url``. A failed attempt is logged
     and followed by a 3 second pause before the next candidate.

     :raises THttpRequester.RobotHttpException: with code 404 when none of
         the candidates could be fetched.
     """
     candidates = urllib_parse_pro.get_url_modifications(self.input_site_url)
     for candidate in candidates:
         try:
             selenium = self.parent_project.selenium_driver
             selenium.navigate(candidate)
             # Pause before reading the page state from the driver.
             time.sleep(3)
             page_title = selenium.the_driver.title
             page_html = selenium.the_driver.page_source
             landed_url = selenium.the_driver.current_url
             self.init_main_page_url_from_redirected_url(
                 landed_url, page_title, page_html)
         except WebDriverException:
             failure = "cannot fetch {}  with selenium, sleep 3 sec".format(candidate)
             self.logger.error(failure)
             time.sleep(3)
         else:
             return

     # Every candidate failed.
     site = self.input_site_url
     raise THttpRequester.RobotHttpException(
         "there is no way to access {}".format(site), site, 404, "GET")