def run(self):
    while True:
        self.get_requests()

        if not self.requests:
            break

        self.results = self.loop.run_until_complete(
            asyncio.wait([r()() for r in self.requests]))

        # asyncio.wait() returns a (done, pending) pair; all tasks are
        # finished here, so only the first element is of interest.
        for task in self.results[0]:
            scrape = task.result()

            if scrape:
                cache_results(scrape.parser, scrape.query, scrape.search_engine_name,
                              scrape.scrape_method, scrape.page_number)

                if scrape.parser:
                    serp = parse_serp(parser=scrape.parser, scraper=scrape, query=scrape.query)
                    self.scraper_search.serps.append(serp)
                    self.session.add(serp)
                    self.session.commit()
                    store_serp_result(serp)
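# A minimal, self-contained illustration of the asyncio pattern used in
# run() above: asyncio.wait() returns a (done, pending) pair, which is why
# run() indexes self.results[0] before calling task.result(). This mirrors
# the older asyncio API the surrounding code targets (before Python 3.11,
# asyncio.wait() accepted bare coroutines); fake_request() is a stand-in
# for the r()() callables, not project code.
import asyncio

async def fake_request(n):
    await asyncio.sleep(0.01)
    return 'scrape-{}'.format(n)

loop = asyncio.get_event_loop()
done, pending = loop.run_until_complete(
    asyncio.wait([fake_request(n) for n in range(3)]))
for task in done:
    print(task.result())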
def cache_results(self):
    """Caches the html for the current request."""
    cache_results(self.parser, self.query, self.search_engine_name,
                  self.scrape_method, self.page_number, db_lock=self.db_lock)
def _search(self, searchtype='normal'):
    """The actual search and parsing of the results.

    Private, internal method. Parsing is done with lxml and cssselect.
    The html structure of the Google Search results may change over
    time. Effective: February 2014.

    There are several parts of a SERP results page the average user is
    most likely interested in (probably in this order):

    - Non-advertisement links, as well as their little snippet and title
    - The message that indicates how many results were found. For
      example: "About 834,000,000 results (0.39 seconds)"
    - Advertisement search results (links, titles, snippets like above)

    Problem: This data comes in a wide range of different formats,
    depending on the parameters set in the search. Investigations over
    the different formats are done in the directory tests/serp_formats.
    """
    self._build_query(searchtype)

    # After building the query, all parameters are set, so we know what
    # we're requesting.
    logger.debug("Created new GoogleScrape object with searchparams={}".format(
        pprint.pformat(self.search_params)))

    html = get_cached(self.search_query, Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)
    self.search_results['cache_file'] = os.path.join(
        Config['GLOBAL'].get('cachedir'),
        cached_file_name(self.search_query,
                         Config['GLOBAL'].get('base_search_url'),
                         self.search_params))

    if not html:
        try:
            r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                  headers=self._HEADERS,
                                  params=self.search_params,
                                  timeout=3.0)
            logger.debug("Scraped with url: {} and User-Agent: {}".format(
                r.url, self._HEADERS['User-Agent']))
        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not r.ok:
            logger.error('HTTP Error: {}'.format(r.status_code))
            if str(r.status_code)[0] == '5':
                print('Maybe Google recognizes you as a sneaky spammer after '
                      'you requested their services too inexhaustibly :D')
            return False

        html = r.text

        if Config['HTTP'].getboolean('view', False):
            self.browserview(html)

        # cache fresh results
        cache_results(html, self.search_query,
                      url=Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)
        self.search_results['cache_file'] = os.path.join(
            Config['GLOBAL'].get('cachedir'),
            cached_file_name(self.search_query,
                             Config['GLOBAL'].get('base_search_url'),
                             self.search_params))

    self.parser = GoogleParser(html, searchtype=self.searchtype)
    self.search_results.update(self.parser.all_results)
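# _search() above leans on two cache helpers, get_cached() and
# cached_file_name(), whose implementations are not part of this section.
# As a rough sketch only (hypothetical, under the assumption that a cache
# file is identified by the query, the base url and the sorted parameters):
import hashlib

def cached_file_name(search_query, url, params):
    # sort the params so the same search always maps to the same file
    key = '{}{}{}'.format(search_query, url, sorted(params.items()))
    return hashlib.sha256(key.encode()).hexdigest() + '.cache'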
def search(self):
    """Search with webdriver.

    Fills out the search form of the search engine for each keyword.
    Clicks the next link while num_pages_per_keyword is not reached.
    """
    n = 0
    for self.current_keyword in self.keywords:
        super().next_keyword_info(n)

        self.search_input = self._wait_until_search_input_field_appears()

        if self.search_input is False:
            self.search_input = self.handle_request_denied()

        if self.search_input:
            self.search_input.clear()
            time.sleep(.25)
            self.search_input.send_keys(self.current_keyword + Keys.ENTER)
        else:
            raise GoogleSearchError('Cannot get a handle to the input form!')

        for self.current_page in range(1, self.num_pages_per_keyword + 1):
            # Waiting until the keyword appears in the title may not be
            # enough. The content may still be from the old page.
            try:
                WebDriverWait(self.webdriver, 5).until(
                    EC.title_contains(self.current_keyword))
            except TimeoutException as e:
                logger.error(SeleniumSearchError('Keyword "{}" not found in title: {}'.format(
                    self.current_keyword, self.webdriver.title)))
                break

            # sleep for a random time drawn from the largest sleep range
            sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))
            time.sleep(sleep_time)

            html = self.webdriver.page_source

            self.parser.parse(html)
            self.store()
            out(str(self.parser), lvl=2)

            # Lock in case two threads write to the same file (not probable).
            with self.cache_lock:
                cache_results(html, self.current_keyword, self.search_engine,
                              self.scrapemethod)

            self.search_number += 1

            # Don't click the next page link when leaving the loop.
            if self.current_page < self.num_pages_per_keyword:
                self.next_url = self._goto_next_page()
                if not self.next_url:
                    break

        n += 1
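# Several variants here draw their sleep time via
# random.randrange(*self._largest_sleep_range(self.search_number)), so the
# helper must return a (lower, upper) pair. The project's actual bounds
# live in its config; this is only a plausible sketch in which the backoff
# grows with the number of requests already made:
def _largest_sleep_range(self, search_number):
    # be increasingly polite the more often we have hit the engine
    if search_number < 10:
        return 2, 5
    elif search_number < 50:
        return 5, 10
    return 10, 20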
def search(self):
    """Search with webdriver."""
    # kw, page_num, write_kw and next_url were originally supplied by an
    # enclosing keyword/page loop; a minimal reconstruction of that loop
    # is assumed here so the body below is runnable.
    for i, kw in enumerate(self.keywords):
        write_kw = True
        next_url = self.url

        for page_num in range(self.num_pages_per_keyword):
            self.webdriver.get(next_url)

            # sleep for a random time drawn from the largest sleep range
            j = random.randrange(*self._largest_sleep_range(i))

            if self.proxy:
                logger.info('[i] Page number={}, ScraperThread({url}) ({ip}:{port}) {} is sleeping for {} seconds...Next keyword: ["{kw}"]'.format(
                    page_num, self._ident, j,
                    url=next_url, ip=self.proxy.host, port=self.proxy.port, kw=kw))
            else:
                logger.info('[i] Page number={}, ScraperThread({url}) ({}) is sleeping for {} seconds...Next keyword: ["{}"]'.format(
                    page_num, self._ident, j, kw, url=next_url))

            time.sleep(j)

            try:
                self.element = WebDriverWait(self.webdriver, 10).until(
                    EC.presence_of_element_located((By.NAME, "q")))
            except TimeoutException as e:
                if not self.handle_request_denied():
                    open('/tmp/out.png', 'wb').write(self.webdriver.get_screenshot_as_png())
                    raise GoogleSearchError('`q` search input cannot be found.')

            # only type the keyword into the input field once per keyword
            if write_kw:
                self.element.clear()
                time.sleep(.25)
                self.element.send_keys(kw + Keys.ENTER)
                write_kw = False

            # Waiting until the keyword appears in the title may not be
            # enough. The content may still be from the old page.
            try:
                WebDriverWait(self.webdriver, 10).until(EC.title_contains(kw))
            except TimeoutException as e:
                logger.debug('Keyword not found in title: {}'.format(e))

            try:
                # wait until the next page link emerges
                WebDriverWait(self.webdriver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#pnnext')))
                next_url = self.webdriver.find_element_by_css_selector('#pnnext').get_attribute('href')
            except TimeoutException as te:
                logger.debug('Cannot locate next page html id #pnnext')
            except WebDriverException as e:
                # leave if no next results page is available
                pass

            # Sleep explicitly so the site and whatever JavaScript loads
            # content dynamically has time to update the DOM.
            time.sleep(1.5)

            html = self._maybe_crop(self.webdriver.page_source)

            if self.rlock or self.queue:
                # Lock in case two threads write to the same file (not probable).
                self.rlock.acquire()
                cache_results(html, kw, self.url)
                self.rlock.release()
                # commit in intervals specified in the config
                self.queue.put(self._get_parse_links(html, kw, page_num=page_num + 1, ip=self.ip))

            self._results.append(self._get_parse_links(html, kw, only_results=True).all_results)
def search(self):
    """Search with webdriver.

    Called within the blocking_search search loop.
    """
    for self.current_keyword in self.keywords:
        for self.current_page in range(1, self.num_pages_per_keyword + 1):
            # sleep for a random time drawn from the largest sleep range
            sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))
            time.sleep(sleep_time)

            # Waiting until the keyword appears in the title may not be
            # enough. The content may still be from the old page.
            try:
                WebDriverWait(self.webdriver, 5).until(
                    EC.title_contains(self.current_keyword))
            except TimeoutException as e:
                logger.error(SeleniumSearchError('Keyword "{}" not found in title: {}'.format(
                    self.current_keyword, self.webdriver.title)))

            html = self.webdriver.page_source

            self.parser.parse(html)
            self.store()
            out(str(self.parser), lvl=2)

            # Lock in case two threads write to the same file (not probable).
            with self.cache_lock:
                cache_results(html, self.current_keyword, self.search_engine,
                              self.scrapemethod)

            self.search_number += 1

            if self.current_page > 1:
                self.next_url = self._goto_next_page()

        try:
            self.search_input = WebDriverWait(self.webdriver, 5).until(
                EC.presence_of_element_located(self._get_search_input_field()))
        except TimeoutException as e:
            logger.error(e)
            if not self.handle_request_denied():
                open('/tmp/out.png', 'wb').write(self.webdriver.get_screenshot_as_png())
                raise GoogleSearchError('search input field cannot be found.')

        if self.search_input:
            self.search_input.clear()
            time.sleep(.25)
            self.search_input.send_keys(self.current_keyword + Keys.ENTER)
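# _get_search_input_field() above must return a Selenium locator tuple,
# since EC.presence_of_element_located() expects a (strategy, value) pair.
# A minimal sketch for a Google-style engine (an assumption; the concrete
# selector depends on the search engine):
from selenium.webdriver.common.by import By

def _get_search_input_field(self):
    return By.NAME, 'q'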
def search(self):
    """Search with webdriver."""
    next_url = None

    # sleep for a random time drawn from the largest sleep range
    sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))

    # log stuff if verbosity is set accordingly
    if Config['GLOBAL'].getint('verbosity', 1) > 1:
        if self.proxy:
            logger.info('[i] Page number={}, ScraperThread({url}) ({ip}:{port}) {} is sleeping for {} seconds...Next keyword: ["{kw}"]'.format(
                self.current_page, self._ident, sleep_time,
                url=next_url, ip=self.proxy.host, port=self.proxy.port,
                kw=self.current_keyword))
        else:
            logger.info('[i] Page number={}, ScraperThread({url}) ({}) is sleeping for {} seconds...Next keyword: ["{}"]'.format(
                self.current_page, self._ident, sleep_time,
                self.current_keyword, url=next_url))

    time.sleep(sleep_time)

    try:
        self.search_input = WebDriverWait(self.webdriver, 5).until(
            EC.presence_of_element_located(self._get_search_input_field()))
    except TimeoutException as e:
        logger.error(e)
        if not self.handle_request_denied():
            open('/tmp/out.png', 'wb').write(self.webdriver.get_screenshot_as_png())
            raise GoogleSearchError('search input field cannot be found.')

    # Waiting until the keyword appears in the title may not be
    # enough. The content may still be from the old page.
    try:
        WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.current_keyword))
    except TimeoutException as e:
        raise SeleniumSearchError('Keyword not found in title: {}'.format(e))

    next_url = self._get_next_page_url()

    # Sleep explicitly so the site and whatever JavaScript loads content
    # dynamically has time to update the DOM.
    time.sleep(1.5)

    html = self.webdriver.page_source

    self.parser.parse(html)
    self.store()
    print(self.parser)

    # Lock in case two coroutines write to the same file (not probable).
    # cache_lock is an asyncio.Lock here, which makes search() a coroutine.
    with (yield from self.cache_lock):
        cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

    self.search_number += 1
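# The `with (yield from self.cache_lock):` line is the pre-await asyncio
# idiom (Python 3.4 through 3.8) for acquiring an asyncio.Lock inside a
# generator-based coroutine; on modern Python the equivalent is
# `async with`. Standalone illustration of the modern form:
import asyncio

async def critical_section():
    lock = asyncio.Lock()
    async with lock:
        # only one coroutine at a time runs this block
        await asyncio.sleep(0.01)

asyncio.run(critical_section())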
def search(self, *args, rand=False, **kwargs):
    """The actual search for the search engine."""
    self.build_search()

    if rand:
        self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

    html = get_cached(self.current_keyword, self.base_search_url, params=self.search_params)

    if not html:
        try:
            if Config['GLOBAL'].getint('verbosity', 0) > 1:
                logger.info('[HTTP] Base_url: {base_url}, headers={headers}, params={params}'.format(
                    base_url=self.base_search_url,
                    headers=self.headers,
                    params=self.search_params))

            r = self.requests.get(self.base_search_url, headers=self.headers,
                                  params=self.search_params, timeout=3.0)
        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not r.ok:
            logger.error('HTTP Error: {}'.format(r.status_code))
            self.handle_request_denied(r.status_code)
            return False

        html = r.text

        # cache fresh results
        cache_results(html, self.current_keyword, url=self.base_search_url,
                      params=self.search_params)

    self.parser.parse(html)
    self.store()
    print(self.parser)
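# What rand=True does above, as a standalone requests sketch: pick one
# User-Agent at random per request so consecutive requests look less
# uniform. The URL, parameter names and agent strings here are
# placeholders, not the scraper's real configuration.
import random
import requests

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

headers = {'User-Agent': random.choice(USER_AGENTS)}
r = requests.get('https://www.example.com/search',
                 headers=headers, params={'q': 'test keyword'}, timeout=3.0)
print(r.status_code)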
def search(self, *args, rand=False, **kwargs):
    """The actual search for the search engine."""
    self.build_search()

    if rand:
        self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

    try:
        out('[HTTP - {proxy}] Base_url: {base_url}, headers={headers}, params={params}'.format(
            proxy=self.proxy,
            base_url=self.base_search_url,
            headers=self.headers,
            params=self.search_params), lvl=3)

        super().next_keyword_info(self.n)

        request = self.requests.get(self.base_search_url, headers=self.headers,
                                    params=self.search_params, timeout=3.0)
    except self.requests.ConnectionError as ce:
        logger.error('Network problem occurred {}'.format(ce))
        raise ce
    except self.requests.Timeout as te:
        logger.error('Connection timeout {}'.format(te))
        raise te

    if not request.ok:
        logger.error('HTTP Error: {}'.format(request.status_code))
        self.handle_request_denied(request.status_code)
        return False

    html = request.text

    # cache fresh results
    with self.cache_lock:
        cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

    self.parser.parse(html)
    self.store()
    out(str(self.parser), lvl=2)

    self.n += 1
def search(self, *args, rand=False, **kwargs):
    """The actual search for the search engine."""
    self.build_search()

    if rand:
        self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

    html = get_cached(self.current_keyword, self.base_search_url, params=self.search_params)

    if not html:
        try:
            if Config['GLOBAL'].getint('verbosity', 0) > 1:
                logger.info('[HTTP] Base_url: {base_url}, headers={headers}, params={params}'.format(
                    base_url=self.base_search_url,
                    headers=self.headers,
                    params=self.search_params))

            r = self.requests.get(self.base_search_url, headers=self.headers,
                                  params=self.search_params, timeout=3.0)
        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not r.ok:
            logger.error('HTTP Error: {}'.format(r.status_code))
            self.handle_request_denied(r.status_code)
            return False

        html = r.text

        # cache fresh results
        cache_results(html, self.current_keyword, url=self.base_search_url,
                      params=self.search_params)

    self.parser.parse(html)

    # TODO: remove this and save the results to a data storage instead
    print(self.parser)
def cache_results(self):
    """Caches the html for the current request."""
    if Config['GLOBAL'].getboolean('do_caching', False):
        with self.cache_lock:
            cache_results(self.parser.cleaned_html, self.current_keyword,
                          self.search_engine, self.scrapemethod)
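# Both cache_results() wrappers in this section delegate to a module-level
# cache_results() helper that is defined elsewhere in the project. Purely
# as a plausible sketch (hypothetical names and layout, assuming a cache
# directory and a file name derived from the request identity):
import hashlib
import os

def cache_results(html, keyword, search_engine, scrapemethod, cachedir='.scrapecache'):
    os.makedirs(cachedir, exist_ok=True)
    # a stable file name derived from what identifies the request
    digest = hashlib.md5('{}{}{}'.format(
        keyword, search_engine, scrapemethod).encode()).hexdigest()
    with open(os.path.join(cachedir, digest + '.html'), 'w') as f:
        f.write(html)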
def search(self):
    """Search with webdriver."""
    next_url = None

    # sleep for a random time drawn from the largest sleep range
    sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))

    # log stuff if verbosity is set accordingly
    if Config['GLOBAL'].getint('verbosity', 1) > 1:
        if self.proxy:
            logger.info('[i] Page number={}, ScraperThread({url}) ({ip}:{port}) {} is sleeping for {} seconds...Next keyword: ["{kw}"]'.format(
                self.current_page, self._ident, sleep_time,
                url=next_url, ip=self.proxy.host, port=self.proxy.port,
                kw=self.current_keyword))
        else:
            logger.info('[i] Page number={}, ScraperThread({url}) ({}) is sleeping for {} seconds...Next keyword: ["{}"]'.format(
                self.current_page, self._ident, sleep_time,
                self.current_keyword, url=next_url))

    time.sleep(sleep_time)

    try:
        self.search_input = WebDriverWait(self.webdriver, 5).until(
            EC.presence_of_element_located(self._get_search_input_field()))
    except TimeoutException as e:
        logger.error(e)
        if not self.handle_request_denied():
            open('/tmp/out.png', 'wb').write(self.webdriver.get_screenshot_as_png())
            raise GoogleSearchError('search input field cannot be found.')

    # Waiting until the keyword appears in the title may not be
    # enough. The content may still be from the old page.
    try:
        WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.current_keyword))
    except TimeoutException as e:
        raise SeleniumSearchError('Keyword not found in title: {}'.format(e))

    next_url = self._get_next_page_url()

    # Sleep explicitly so the site and whatever JavaScript loads content
    # dynamically has time to update the DOM.
    time.sleep(1.5)

    html = self.webdriver.page_source

    self.parser.parse(html)
    self.store()
    print(self.parser)

    if self.rlock:
        # Lock in case two threads write to the same file (not probable).
        self.rlock.acquire()
        cache_results(html, self.current_keyword,
                      next_url if next_url else self.starting_point)
        self.rlock.release()

    self.search_number += 1
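# _get_next_page_url() itself is not shown in any of the variants above.
# Judging from the #pnnext handling in the threaded variant, a sketch could
# look like this (an assumption, using the same old-style Selenium API as
# the surrounding code):
from selenium.common.exceptions import NoSuchElementException

def _get_next_page_url(self):
    try:
        return self.webdriver.find_element_by_css_selector('#pnnext').get_attribute('href')
    except NoSuchElementException:
        return None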