def search(self, search_term, right_click=True):
    """
    Search search_term in the browser and collect result links.

    @param search_term: the words to search for
    @param right_click: whether to right-click on each search result
    @return: a tuple (ad_set, search_set) of advertisement links and
        organic search result links
    """
    # start the browser
    self.browser = start_browser(self.crawl_config.browser_type,
                                 incognito=False,
                                 user_agent=self.crawl_config.user_agent)
    self.browser.set_page_load_timeout(15)
    # paginate through the search result pages
    start = 0
    ad_set = set()
    search_set = set()
    while start < self.crawl_config.count:
        try:
            # Google search: collect advertisements and organic results.
            url = 'https://www.google.com/?gws_rd=ssl#q='
            url += '+'.join(search_term.split(' '))
            # Append the start offset for every page after the first.
            if start > 0:
                url += '&start={0}'.format(start)
            self.browser.get(url)
            # Wait until the result container ('ires') has loaded.
            elem = wait_find_element(self.browser, 'id', 'ires')
            if elem is None:
                raise Exception("Page load failed.")
            time.sleep(random.randint(1, 3))
            ad_set = ad_set | self.ad_links()
            if right_click:
                search_set = search_set | self.search_results()
            start += 10
        except Exception:
            # For robustness, log the error and restart instead of raising.
            safe_quit(self.browser)
            logger = logging.getLogger("global")
            logger.error("error in search")
            logger.error(sys.exc_info()[0])
            if switch_vpn_state(self.connected):
                self.connected = not self.connected
            self.browser = restart_browser(self.crawl_config.browser_type,
                                           incognito=False,
                                           user_agent=self.crawl_config.user_agent,
                                           browser=self.browser)
    safe_quit(self.browser)
    return ad_set, search_set
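
# Hedged usage sketch for search(). The class name SearchCrawler and the way the
# CrawlConfig is populated are assumptions for illustration only; the method above
# only requires that crawl_config carries browser_type, user_agent, and count.
#
#     config = CD.CrawlConfig()
#     config.browser_type = CD.CrawlConfig.CHROME
#     config.user_agent = 'Mozilla/5.0'
#     config.count = 30                        # crawl result pages at start = 0, 10, 20
#     crawler = SearchCrawler(config)          # hypothetical owning class
#     ad_set, search_set = crawler.search('free movie download', right_click=True)
#     # ad_set holds advertisement links, search_set the organic result links.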
def fetch_url(self, url):
    # Block until a browser becomes available in the shared queue.
    while True:
        self.lock.acquire()
        if self.browser_queue.empty():
            self.lock.release()
            time.sleep(5)
        else:
            browser = self.browser_queue.get()
            self.lock.release()
            break
    result = CD.CrawlResult()
    # result.success records whether the URL loaded successfully.
    result.url = url
    result.url_md5 = hex_md5(url)
    result.success = True
    try:
        browser.get(result.url)
        # Suppress the unload alert: <stay on this page> <leave this page>.
        browser.execute_script("window.onbeforeunload = function() {};")
        time.sleep(1)
        # Titles that indicate the page failed to load.
        error_titles = ['404 Not Found', '403', 'Forbidden', 'not available',
                        'Problem loading page', 'Page not found', 'Error',
                        'Access denied']
        title_error = any(t in browser.title for t in error_titles)
        if self.crawl_config.browser_type == CD.CrawlConfig.CHROME and \
                (title_error or
                 browser.current_url == 'data:text/html,chromewebdata'):
            # Chrome also signals failure via the chromewebdata placeholder URL.
            result.landing_url = browser.current_url
            result.landing_url_md5 = hex_md5(result.landing_url)
            result.success = False
        elif self.crawl_config.browser_type == CD.CrawlConfig.FIREFOX and title_error:
            result.landing_url = browser.current_url
            result.landing_url_md5 = hex_md5(result.landing_url)
            result.success = False
        else:
            # Save the page under a directory named by the MD5 of the original URL.
            url_md5_dir = self.crawl_config.user_agent_md5_dir + result.url_md5 + '/'
            mkdir_if_not_exist(url_md5_dir)
            # Record the landing URL reached after redirects.
            result.landing_url = browser.current_url
            result.landing_url_md5 = hex_md5(result.landing_url)
            # Dump the whole page source.
            response = browser.execute_script("return document.documentElement.innerHTML;")
            result.file_path = url_md5_dir + 'index.html'
            f = open(result.file_path, 'w')
            f.write(response.encode('utf-8'))
            f.close()
            browser.delete_all_cookies()
            if len(browser.window_handles) > 1:
                # Close every window the page opened except the current one.
                current_window_handle = browser.current_window_handle
                for handle in browser.window_handles:
                    if handle != current_window_handle:
                        browser.switch_to_window(handle)
                        browser.close()
                # Switch back to the original window.
                browser.switch_to_window(current_window_handle)
    except Exception:
        result.success = False
        browser = restart_browser(self.crawl_config.browser_type,
                                  incognito=False,
                                  user_agent=self.crawl_config.user_agent,
                                  browser=browser)
    self.browser_queue.put(browser)
    logger = logging.getLogger("global")
    logger.info("the length of the browser_queue: %d", self.browser_queue.qsize())
    return result
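
# Hedged usage sketch for fetch_url(). It assumes the owning object exposes a
# browser_queue (a Queue.Queue of Selenium drivers) and a threading.Lock, which is
# the protocol the method above relies on; CrawlerWorker is a hypothetical name
# for that class. Only CD.CrawlResult and start_browser come from this module.
#
#     import Queue, threading                  # Python 2 era, matching this module
#     worker = CrawlerWorker(crawl_config)     # hypothetical owning class
#     worker.lock = threading.Lock()
#     worker.browser_queue = Queue.Queue()
#     for _ in range(4):
#         worker.browser_queue.put(start_browser(crawl_config.browser_type,
#                                                incognito=False,
#                                                user_agent=crawl_config.user_agent))
#     result = worker.fetch_url('http://example.com/')
#     if result.success:
#         print result.landing_url, result.file_path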