def parse_all_cached_files(scrape_jobs, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and
    parse all cached files.

    Args:
        scrape_jobs: The scrape jobs to look up in the cache.
        session: An sqlalchemy session to add the entities.
        scraper_search: Abstract object representing the current search.

    Returns:
        The scrape jobs that couldn't be parsed from the cache directory.
    """
    files = _get_all_cache_files()
    num_cached = num_total = 0
    mapping = {}
    for job in scrape_jobs:
        cache_name = cached_file_name(
            job['query'],
            job['search_engine'],
            job['scrape_method'],
            job['page_number']
        )
        mapping[cache_name] = job
        num_total += 1

    for path in files:
        # strip off the extension of the file name if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                # str.rstrip() strips characters, not a suffix, so cut
                # the extension off explicitly.
                clean_filename = fname[:-(len(ext) + 1)]

        job = mapping.get(clean_filename, None)

        if job:
            # We found a file that contains the keyword, search engine name and
            # search mode that fits our description. Let's see if there already
            # is a record in the database and link it to our new ScraperSearch object.
            serp = get_serp_from_database(session, job['query'], job['search_engine'],
                                          job['scrape_method'], job['page_number'])

            if not serp:
                serp = parse_again(fname, job['search_engine'], job['scrape_method'], job['query'])

            serp.scraper_searches.append(scraper_search)
            session.add(serp)

            if num_cached % 200 == 0:
                session.commit()

            store_serp_result(serp)
            num_cached += 1
            scrape_jobs.remove(job)

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=2)
    out('{}/{} objects have been read from the cache. {} remain to get scraped.'.format(
        num_cached, num_total, num_total - num_cached), lvl=2)

    session.add(scraper_search)
    session.commit()

    return scrape_jobs
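# For orientation, a minimal sketch of how the cache key above is presumably
# derived. Assumption: cached_file_name() builds a stable digest over the four
# job attributes; the real implementation may use a different hash or layout.
import hashlib

def cached_file_name_sketch(query, search_engine, scrape_method, page_number):
    # Concatenate the job attributes and hash them, so every
    # (query, engine, method, page) combination maps to a unique file name.
    unique = [query, search_engine, scrape_method, page_number]
    sha = hashlib.sha256()
    sha.update(b''.join(str(part).encode() for part in unique))
    return '{}.cache'.format(sha.hexdigest())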
def request(self):
    url = self.base_search_url + urlencode(self.params)

    response = yield from aiohttp.request('GET', url, params=self.params, headers=self.headers)

    if response.status != 200:
        # response.status is an int; don't concatenate it to a str directly.
        self.status = 'not successful: {}'.format(response.status)

    self.requested_at = datetime.datetime.utcnow()

    out('[+] {} requested keyword \'{}\' on {}. Response status: {}'.format(
        self.requested_by, self.query, self.search_engine_name, response.status), lvl=2)
    out('[i] URL: {} HEADERS: {}'.format(url, self.headers), lvl=3)

    if response.status == 200:
        body = yield from response.read_and_close(decode=False)
        self.parser = self.parser(body)
        return self

    return None
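# A hedged usage sketch for the generator-based coroutine above. It assumes
# the pre-async/await asyncio API that matches the 'yield from' style used
# here; 'scrapes' is a hypothetical list of objects exposing request().
import asyncio

def run_async_requests(scrapes):
    loop = asyncio.get_event_loop()
    # Schedule all request() coroutines concurrently and collect the ones
    # that returned a parsed result (request() returns None on failure).
    results = loop.run_until_complete(asyncio.gather(*(s.request() for s in scrapes)))
    return [r for r in results if r is not None]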
def instance_creation_info(self, scraper_name):
    """Debug message whenever a scraping worker is created."""
    out('[+] {}[{}][search-type:{}][{}] using search engine "{}". Num keywords={}, num pages for keyword={}'.format(
        scraper_name, self.requested_by, self.search_type, self.base_search_url,
        self.search_engine_name, len(self.jobs), self.pages_per_keyword), lvl=1)
def search(self): """Search with webdriver. Fills out the search form of the search engine for each keyword. Clicks the next link while pages_per_keyword is not reached. """ for self.query, self.pages_per_keyword in self.jobs.items(): self.search_input = self._wait_until_search_input_field_appears() if self.search_input is False and Config['PROXY_POLICY'].getboolean('stop_on_detection'): self.status = 'Malicious request detected' super().after_search() return if self.search_input is False: self.search_input = self.handle_request_denied() if self.search_input: self.search_input.clear() time.sleep(.25) try: self.search_input.send_keys(self.query + Keys.ENTER) except ElementNotVisibleException as e: time.sleep(2) self.search_input.send_keys(self.query + Keys.ENTER) self.requested_at = datetime.datetime.utcnow() else: out('{}: Cannot get handle to the input form for keyword {}.'.format(self.name, self.query), lvl=4) continue super().detection_prevention_sleep() super().keyword_info() for self.page_number in self.pages_per_keyword: self.wait_until_serp_loaded() try: self.html = self.webdriver.execute_script('return document.body.innerHTML;') except WebDriverException as e: self.html = self.webdriver.page_source super().after_search() # Click the next page link not when leaving the loop # in the next iteration. if self.page_number in self.pages_per_keyword: self.next_url = self._goto_next_page() self.requested_at = datetime.datetime.utcnow() if not self.next_url: break
def keyword_info(self):
    """Print a short summary of where we are in the scrape and what the next keyword is."""
    out('[{thread_name}][{ip}] Keyword: "{keyword}" with {num_pages} pages, slept {delay} seconds before scraping. {done}/{all} already scraped.'.format(
        thread_name=self.name,
        ip=self.requested_by,
        keyword=self.query,
        num_pages=self.pages_per_keyword,
        delay=self.current_delay,
        done=self.search_number,
        all=self.num_keywords
    ), lvl=2)
def search(self, rand=False, timeout=15):
    """The actual search for the search engine.

    When raising StopScrapingException, the scraper will stop.
    When returning False, the scraper tries to continue with the next keyword.
    """
    success = True

    self.build_search()

    if rand:
        self.headers['User-Agent'] = random.choice(user_agents)

    try:
        super().detection_prevention_sleep()
        super().keyword_info()

        request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                    headers=self.headers, timeout=timeout)

        self.requested_at = datetime.datetime.utcnow()
        self.html = request.text

        out('[HTTP] {url}, headers={headers}, params={params}'.format(
            url=request.url,
            headers=self.headers,
            params=self.search_params), lvl=3)

    except self.requests.ConnectionError as ce:
        self.status = 'Network problem occurred {}'.format(ce)
        success = False
    except self.requests.Timeout as te:
        self.status = 'Connection timeout {}'.format(te)
        success = False
    except self.requests.exceptions.RequestException as e:
        # In case of any http networking exception that wasn't caught
        # in the actual request, just end the worker.
        self.status = 'Stopping scraping because {}'.format(e)
        success = False
    else:
        # Only inspect the response when the request didn't raise;
        # otherwise `request` would be unbound here.
        if not request.ok:
            self.handle_request_denied(request.status_code)
            success = False

    super().after_search()

    return success
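# A sketch of the contract in the docstring above: the caller skips to the
# next keyword when search() returns False and aborts the whole scrape on
# StopScrapingException. The driver loop below is hypothetical, not the
# worker loop the library actually uses.
def run_keywords(scraper, keywords):
    for keyword in keywords:
        scraper.query = keyword
        try:
            if not scraper.search():
                continue  # request failed; try the next keyword
        except StopScrapingException:
            break  # fatal condition; stop scraping entirely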
def after_search(self):
    """Store the results and parse them. Notify the progress queue if necessary."""
    self.search_number += 1

    if not self.store():
        out('No results to store for keyword: "{}" in search engine: {}'.format(
            self.query, self.search_engine_name), lvl=4)

    if self.progress_queue:
        self.progress_queue.put(1)
    self.cache_results()
def handle_request_denied(self):
    """Checks whether Google detected a potentially harmful request.

    Whenever such potential abuse is detected, Google shows a captcha.
    This method blocks for as long as it takes someone to enter the captcha
    in the browser window. When the window is not visible (for example when
    using PhantomJS), this method takes a screenshot of the page and shows
    it to the user, who should enter the solution on the command line.

    Returns:
        The search input field.

    Raises:
        MaliciousRequestDetected when there was no way to stop Google
        from denying our requests.
    """
    # selenium webdriver objects have no status code :/
    super().handle_request_denied('400')

    needles = self.malicious_request_needles[self.search_engine_name]

    if needles and needles['inurl'] in self.webdriver.current_url \
            and needles['inhtml'] in self.webdriver.page_source:

        if Config['SELENIUM'].getboolean('manual_captcha_solving', False):
            with self.captcha_lock:
                import tempfile
                tf = tempfile.NamedTemporaryFile('wb')
                tf.write(self.webdriver.get_screenshot_as_png())
                tf.flush()  # make sure the browser sees the complete image
                import webbrowser
                webbrowser.open('file://{}'.format(tf.name))
                solution = input('enter the captcha please...')
                self.webdriver.find_element_by_name('submit').send_keys(solution + Keys.ENTER)
                try:
                    self.search_input = WebDriverWait(self.webdriver, 5).until(
                        EC.visibility_of_element_located(self._get_search_input_field()))
                except TimeoutException:
                    raise MaliciousRequestDetected('Requesting with this ip is not possible at the moment.')
                tf.close()
        else:
            # Just wait until the user solves the captcha in the browser window,
            # 10 hours if needed :D
            out('Waiting for user to solve captcha', lvl=1)
            return self._wait_until_search_input_field_appears(10 * 60 * 60)
def run(self):
    """Run the SelScraper."""
    if not self._get_webdriver():
        raise_or_log('{}: Aborting due to no available selenium webdriver.'.format(self.name),
                     exception_obj=SeleniumMisconfigurationError)

    try:
        self.webdriver.set_window_size(400, 400)
        # Tile the browser windows in a 4-column grid.
        self.webdriver.set_window_position(400 * (self.browser_num % 4),
                                           400 * (self.browser_num // 4))
    except WebDriverException as e:
        out('Cannot set window size: {}'.format(e), lvl=4)

    super().before_search()

    if self.startable:
        self.build_search()
        self.search()

    if self.webdriver:
        self.webdriver.close()
def fix_broken_cache_names(url, search_engine, scrapemode, page_number):
    """Fix broken cache names.

    Renames every cache file whose name doesn't match the hash of the
    search query found in its title element.

    Args:
        url: Unused here; kept for interface compatibility.
        search_engine: The search engine the cached pages belong to.
        scrapemode: The scrape method that was used.
        page_number: The page number of the cached SERP pages.
    """
    files = _get_all_cache_files()
    logger.debug('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir', '.scrapecache')))
    r = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    num_renamed = 0
    for path in files:
        fname = os.path.split(path)[1].strip()
        data = read_cached_file(path)
        match = r.search(data)
        if not match:
            # No title element found; we cannot recover the keyword.
            continue
        infilekws = match.group('kw')
        realname = cached_file_name(infilekws, search_engine, scrapemode, page_number)
        if fname != realname:
            out('The search query in the title element of file {} differs from the hash in its name. Fixing...'.format(path), lvl=3)
            src = os.path.abspath(path)
            dst = os.path.abspath(os.path.join(os.path.split(path)[0], realname))
            logger.debug('Renamed from {} => {}'.format(src, dst))
            os.rename(src, dst)
            num_renamed += 1

    logger.debug('Renamed {} files.'.format(num_renamed))
def _find_next_page_element(self):
    """Finds the element that locates the next page for any search engine.

    Returns:
        The element that needs to be clicked to get to the next page.
    """
    if self.search_type == 'normal':
        selector = self.next_page_selectors[self.search_engine_name]
        try:
            # wait until the next page link emerges
            WebDriverWait(self.webdriver, 5).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, selector)))
            return self.webdriver.find_element_by_css_selector(selector)
        except TimeoutException as te:
            out('{}: Cannot locate next page element: {}'.format(self.name, te), lvl=4)
            return False
        except WebDriverException as e:
            out('{}: Cannot locate next page element: {}'.format(self.name, e), lvl=4)
            return False

    elif self.search_type == 'image':
        self.page_down()
        return True
def main(return_results=False, parse_cmd_line=True):
    """Runs the googlescraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of googlescraper. But it
    shouldn't be the main() function's job to check the validity of the provided
    configuration.

    Args:
        return_results: When GoogleScrape is used from within another program, don't print
            results to stdout, store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.

    Returns:
        The ScraperSearch object holding the results when return_results is True.
    """
    if parse_cmd_line:
        parse_cmd_args()

    # If the configuration file to use is explicitly specified, update the current
    # configuration with it.
    if Config['GLOBAL'].get('config_file', None):
        update_config_with_file(Config['GLOBAL'].get('config_file', None))

    if Config['GLOBAL'].getboolean('view_config'):
        from googlescraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        return

    if Config['GLOBAL'].getboolean('version'):
        from googlescraper.version import __version__
        print(__version__)
        return

    if Config['GLOBAL'].getboolean('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(Config['GLOBAL'].get('cachedir')))
        except OSError:
            pass
        return

    init_outfile(force_reload=True)

    kwfile = Config['SCRAPING'].get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = Config['SCRAPING'].get('keyword')
    keywords = {keyword for keyword in set(Config['SCRAPING'].get('keywords', '').split('\n')) if keyword}
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    se = Config['SCRAPING'].get('search_engines', 'google')
    if se.strip() == '*':
        se = Config['SCRAPING'].get('supported_search_engines', 'google')

    search_engines = list({search_engine.strip() for search_engine in se.split(',') if search_engine.strip()})
    assert search_engines, 'No search engine specified'
    num_search_engines = len(search_engines)
    num_workers = Config['SCRAPING'].getint('num_workers')
    scrape_method = Config['SCRAPING'].get('scrape_method')
    pages = Config['SCRAPING'].getint('num_pages_for_keyword', 1)
    method = Config['SCRAPING'].get('scrape_method', 'http')

    if Config['GLOBAL'].getboolean('shell', False):
        namespace = {}
        Session = get_session(scoped=False)
        namespace['session'] = Session()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = googlescraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        raise_or_log('No keywords to scrape for. Please provide either a keyword file (option: --keyword-file) or specify a keyword with --keyword.')
        # Just print the help.
        get_command_line(True)
        return

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('Renaming done. Restart for normal use.')
        return

    keywords = [keyword, ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise_or_log('The keyword file {} does not exist.'.format(kwfile),
                         exception_obj=InvalidConfigurationException)
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    # str.rstrip('.py') would strip characters, not the suffix,
                    # so split the extension off instead.
                    modname = os.path.splitext(os.path.basename(kwfile))[0]
                    scrape_jobs = getattr(__import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n') if line.strip()])

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if Config['GLOBAL'].getboolean('clean_cache_files', False):
        clean_cachefiles()
        return

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise_or_log('No more than 100 results per page are available for searches.',
                     exception_obj=InvalidConfigurationException)

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if Config['SCRAPING'].getboolean('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise InvalidConfigurationException('No proxies available and using own IP is prohibited by configuration. Turning down.')

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise_or_log('Invalid search type! Select one of {}'.format(repr(valid_search_types)),
                     exception_obj=InvalidConfigurationException)

    if Config['GLOBAL'].getboolean('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If googlescraper had been run without the --simulate flag, it would have:')
        logger.info('Scraped for {} keywords, with {} results per page, in total {} pages for each keyword'.format(
            len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_pages_for_keyword')))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([proxy.host + ':' + proxy.port for proxy in proxies if proxy])))
        logger.info('By using {} mode with {} worker instances'.format(
            Config['SCRAPING'].get('scrape_method'), Config['SCRAPING'].getint('num_workers')))
        return

    # get a scoped sqlalchemy session
    Session = get_session(scoped=False)
    session = Session()

    # add fixtures
    fixtures(session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # Ask the user whether to continue the last scrape. We detect a continuation of a
    # previously established scrape if the keyword file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and Config['GLOBAL'].getboolean('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=os.path.abspath(kwfile),
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines)
        )

    # First of all, let's see how many requests remain to issue after searching the cache.
    if Config['GLOBAL'].getboolean('do_caching'):
        scrape_jobs = parse_all_cached_files(scrape_jobs, session, scraper_search)

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captchas, used in selenium instances.
        captcha_lock = threading.Lock()

        out('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'.format(
            num_keywords=len(list(scrape_jobs)),
            num_proxies=len(proxies),
            num_threads=num_search_engines), lvl=1)

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(
                                mode=method,
                                proxy=proxy,
                                search_engine=search_engine,
                                session=session,
                                db_lock=db_lock,
                                cache_lock=cache_lock,
                                scraper_search=scraper_search,
                                captcha_lock=captcha_lock,
                                progress_queue=q,
                                browser_num=num_worker
                            )
                        )

            # Distribute the jobs round-robin over the suitable workers.
            for job in scrape_jobs:

                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after the threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(scrape_jobs, session=session,
                                             scraper_search=scraper_search, db_lock=db_lock)
            scheduler.run()

        else:
            raise InvalidConfigurationException('No such scrape_method {}'.format(
                Config['SCRAPING'].get('scrape_method')))

    # in the end, close the json file.
    from googlescraper.output_converter import outfile, output_format
    if output_format == 'json':
        outfile.end()

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search
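# Hedged usage sketch for embedding main() in another program, per the
# return_results docstring above. Assumption: ScraperSearch exposes a
# 'serps' relationship and SERP a 'links' relationship, as suggested by
# serp.scraper_searches in parse_all_cached_files(); check the ORM classes
# before relying on these names.
if __name__ == '__main__':
    search = main(return_results=True, parse_cmd_line=False)
    if search:
        for serp in search.serps:
            print(serp.query, len(serp.links))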
def _parse(self, cleaner=None):
    """Internal parse of the dom according to the provided css selectors.

    Raises:
        InvalidSearchTypeException if no css selectors for the searchtype could be found.
    """
    self._parse_lxml(cleaner)

    # try to parse the number of results.
    attr_name = self.searchtype + '_search_selectors'
    selector_dict = getattr(self, attr_name, None)

    # get the appropriate css selectors for the num_results for the keyword
    num_results_selector = getattr(self, 'num_results_search_selectors', None)

    self.num_results_for_query = self.first_match(num_results_selector, self.dom)
    if not self.num_results_for_query:
        out('{}: Cannot parse num_results from serp page with selectors {}'.format(
            self.__class__.__name__, num_results_selector), lvl=4)

    # get the current page we are at. Sometimes search engines don't show this.
    try:
        self.page_number = int(self.first_match(self.page_number_selectors, self.dom))
    except (ValueError, TypeError):
        # first_match() may return None or a non-numeric string.
        self.page_number = -1

    # Check whether the search engine rewrote the query because the
    # original one produced no hits.
    self.effective_query = self.first_match(self.effective_query_selector, self.dom)
    if self.effective_query:
        out('{}: There was no search hit for the search query. Search engine used {} instead.'.format(
            self.__class__.__name__, self.effective_query), lvl=4)

    # the element that notifies the user about no results.
    self.no_results_text = self.first_match(self.no_results_selector, self.dom)

    # get the stuff that is of interest in SERP pages.
    if selector_dict is None:
        raise InvalidSearchTypeException('There is no such attribute: {}. No selectors found'.format(attr_name))

    for result_type, selector_class in selector_dict.items():

        self.search_results[result_type] = []

        for selector_specific, selectors in selector_class.items():

            if 'result_container' in selectors and selectors['result_container']:
                css = '{container} {result_container}'.format(**selectors)
            else:
                css = selectors['container']

            results = self.dom.xpath(
                self.css_to_xpath(css)
            )

            to_extract = set(selectors.keys()) - {'container', 'result_container'}
            selectors_to_use = {key: selectors[key] for key in to_extract if key in selectors.keys()}

            for index, result in enumerate(results):
                # Let's add primitive support for CSS3 pseudo selectors.
                # We just need two of them:
                # ::text
                # ::attr(attribute)
                # You say we should use xpath expressions instead?
                # Maybe you're right, but they are complicated when it comes to classes,
                # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                serp_result = {}
                # keys are for example 'link', 'snippet', 'visible-url', ...
                # selector is the selector used to grab these items
                for key, selector in selectors_to_use.items():
                    serp_result[key] = self.advanced_css(selector, result)

                serp_result['rank'] = index + 1

                # Only add items with non-None links.
                # Avoid duplicates; detect them by the link.
                # If statement below: lazy evaluation, the more probable case first.
                if 'link' in serp_result and serp_result['link'] and \
                        not [e for e in self.search_results[result_type] if e['link'] == serp_result['link']]:
                    self.search_results[result_type].append(serp_result)
                    self.num_results += 1
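# For orientation, a hedged sketch of the selector structure _parse() walks:
# one entry per result type, each with variant-specific selector groups that
# carry 'container', an optional 'result_container', and per-field selectors
# using the ::text / ::attr() pseudo-selectors resolved by advanced_css().
# Selector values below are illustrative, not the library's real selectors.
normal_search_selectors_sketch = {
    'results': {
        'default': {
            'container': '#center_col',
            'result_container': 'div.g',
            'link': 'h3 > a:first-child::attr(href)',
            'title': 'h3 > a:first-child::text',
            'snippet': 'span.st::text',
            'visible_link': 'cite::text',
        },
    },
}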
def wait_until_title_contains_keyword(self):
    try:
        WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.query))
    except TimeoutException:
        out(SeleniumSearchError('{}: Keyword "{}" not found in title: {}'.format(
            self.name, self.query, self.webdriver.title)), lvl=4)