def main(return_results=False, parse_cmd_line=True, config_from_dict=None): """Runs the GoogleScraper application as determined by the various configuration points. The main() function encompasses the core functionality of GoogleScraper. But it shouldn't be the main() functions job to check the validity of the provided configuration. Args: return_results: When GoogleScrape is used from within another program, don't print results to stdout, store them in a database instead. parse_cmd_line: Whether to get options from the command line or not. config_from_dict: Configuration that is passed when GoogleScraper is called as library. Returns: A database session to the results when return_results is True. Else, nothing. """ external_config_file_path = cmd_line_args = None if parse_cmd_line: cmd_line_args = get_command_line() if cmd_line_args.get('config_file', None): external_config_file_path = os.path.abspath(cmd_line_args.get('config_file')) config = get_config(cmd_line_args, external_config_file_path, config_from_dict) if isinstance(config['log_level'], int): config['log_level'] = logging.getLevelName(config['log_level']) setup_logger(level=config.get('log_level').upper()) if config.get('view_config', False): print(open(os.path.join(get_base_path(), 'scrape_config.py')).read()) return if config.get('version'): from GoogleScraper.version import __version__ print(__version__) return if config.get('clean', False): try: os.remove('google_scraper.db') if sys.platform == 'linux': os.system('rm {}/*'.format(config.get('cachedir'))) except: pass return init_outfile(config, force_reload=True) kwfile = config.get('keyword_file', '') if kwfile: kwfile = os.path.abspath(kwfile) keyword = config.get('keyword') keywords = set(config.get('keywords', [])) proxy_file = config.get('proxy_file', '') proxy_db = config.get('mysql_proxy_db', '') proxy_list = config.get('proxy_list', []) # when no search engine is specified, use google search_engines = config.get('search_engines', ['google',]) if not isinstance(search_engines, list): if search_engines == '*': search_engines = config.get('supported_search_engines') else: search_engines = search_engines.split(',') assert isinstance(search_engines, list), 'Search engines must be a list like data type!' search_engines = set(search_engines) num_search_engines = len(search_engines) num_workers = int(config.get('num_workers')) scrape_method = config.get('scrape_method') pages = int(config.get('num_pages_for_keyword', 1)) method = config.get('scrape_method', 'http') if config.get('shell', False): namespace = {} session_cls = get_session(config, scoped=False) namespace['session'] = session_cls() namespace['ScraperSearch'] = ScraperSearch namespace['SERP'] = SERP namespace['Link'] = Link namespace['Proxy'] = GoogleScraper.database.Proxy print('Available objects:') print('session - A sqlalchemy session of the results database') print('ScraperSearch - Search/Scrape job instances') print('SERP - A search engine results page') print('Link - A single link belonging to a SERP') print('Proxy - Proxies stored for scraping projects.') start_python_console(namespace) return if not (keyword or keywords) and not kwfile: # Just print the help. get_command_line(True) print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and ' 'keyword with --keyword.') return cache_manager = CacheManager(config) if config.get('fix_cache_names'): cache_manager.fix_broken_cache_names() logger.info('renaming done. 
restart for normal use.') return keywords = [keyword, ] if keyword else keywords scrape_jobs = {} if kwfile: if not os.path.exists(kwfile): raise WrongConfigurationError('The keyword file {} does not exist.'.format(kwfile)) else: if kwfile.endswith('.py'): # we need to import the variable "scrape_jobs" from the module. sys.path.append(os.path.dirname(kwfile)) try: modname = os.path.split(kwfile)[-1].rstrip('.py') scrape_jobs = getattr(__import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs') except ImportError as e: logger.warning(e) else: # Clean the keywords of duplicates right in the beginning keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n') if line.strip()]) if not scrape_jobs: scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages) scrape_jobs = list(scrape_jobs) if config.get('clean_cache_files', False): cache_manager.clean_cachefiles() return if config.get('check_oto', False): cache_manager._caching_is_one_to_one(keyword) if config.get('num_results_per_page') > 100: raise WrongConfigurationError('Not more that 100 results per page available for searches.') proxies = [] if proxy_list: proxies = proxy_list elif proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) if config.get('use_own_ip'): proxies.append(None) if not proxies: raise Exception('No proxies available and using own IP is prohibited by configuration. Turning down.') valid_search_types = ('normal', 'video', 'news', 'image') if config.get('search_type') not in valid_search_types: raise WrongConfigurationError('Invalid search type! Select one of {}'.format(repr(valid_search_types))) if config.get('simulate', False): print('*' * 60 + 'SIMULATION' + '*' * 60) logger.info('If GoogleScraper would have been run without the --simulate flag, it would have:') logger.info('Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'.format( len(keywords), int(config.get('num_results_per_page', 0)), int(config.get('num_pages_for_keyword')))) if None in proxies: logger.info('Also using own ip address to scrape.') else: logger.info('Not scraping with own ip address.') logger.info('Used {} unique ip addresses in total'.format(len(proxies))) if proxies: logger.info('The following proxies are used: \n\t\t{}'.format( '\n\t\t'.join([proxy.host + ':' + proxy.port for proxy in proxies if proxy]))) logger.info('By using {} mode with {} worker instances'.format(config.get('scrape_method'), int(config.get('num_workers')))) return # get a scoped sqlalchemy session session_cls = get_session(config, scoped=False) session = session_cls() # add fixtures fixtures(config, session) # add proxies to the database add_proxies_to_db(proxies, session) # ask the user to continue the last scrape. We detect a continuation of a # previously established scrape, if the keyword-file is the same and unmodified since # the beginning of the last scrape. scraper_search = None if kwfile and config.get('continue_last_scrape', False): searches = session.query(ScraperSearch). \ filter(ScraperSearch.keyword_file == kwfile). \ order_by(ScraperSearch.started_searching). 
\ all() if searches: last_search = searches[-1] last_modified = datetime.datetime.utcfromtimestamp(os.path.getmtime(last_search.keyword_file)) # if the last modification is older then the starting of the search if last_modified < last_search.started_searching: scraper_search = last_search logger.info('Continuing last scrape.') if not scraper_search: scraper_search = ScraperSearch( keyword_file=kwfile, number_search_engines_used=num_search_engines, number_proxies_used=len(proxies), number_search_queries=len(keywords), started_searching=datetime.datetime.utcnow(), used_search_engines=','.join(search_engines) ) # First of all, lets see how many requests remain to issue after searching the cache. if config.get('do_caching'): scrape_jobs = cache_manager.parse_all_cached_files(scrape_jobs, session, scraper_search) if scrape_jobs: # Create a lock to synchronize database access in the sqlalchemy session db_lock = threading.Lock() # create a lock to cache results cache_lock = threading.Lock() # A lock to prevent multiple threads from solving captcha, used in selenium instances. captcha_lock = threading.Lock() logger.info('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'.format( num_keywords=len(list(scrape_jobs)), num_proxies=len(proxies), num_threads=num_search_engines)) progress_thread = None # Let the games begin if method in ('selenium', 'http'): # Show the progress of the scraping q = queue.Queue() progress_thread = ShowProgressQueue(config, q, len(scrape_jobs)) progress_thread.start() workers = queue.Queue() num_worker = 0 for search_engine in search_engines: for proxy in proxies: for worker in range(num_workers): num_worker += 1 workers.put( ScrapeWorkerFactory( config, cache_manager=cache_manager, mode=method, proxy=proxy, search_engine=search_engine, session=session, db_lock=db_lock, cache_lock=cache_lock, scraper_search=scraper_search, captcha_lock=captcha_lock, progress_queue=q, browser_num=num_worker ) ) # here we look for suitable workers # for all jobs created. for job in scrape_jobs: while True: worker = workers.get() workers.put(worker) if worker.is_suitabe(job): worker.add_job(job) break threads = [] while not workers.empty(): worker = workers.get() thread = worker.get_worker() if thread: threads.append(thread) for t in threads: t.start() for t in threads: t.join() # after threads are done, stop the progress queue. q.put('done') progress_thread.join() elif method == 'http-async': scheduler = AsyncScrapeScheduler(config, scrape_jobs, cache_manager=cache_manager, session=session, scraper_search=scraper_search, db_lock=db_lock) scheduler.run() else: raise Exception('No such scrape_method {}'.format(config.get('scrape_method'))) from GoogleScraper.output_converter import close_outfile close_outfile() scraper_search.stopped_searching = datetime.datetime.utcnow() session.add(scraper_search) session.commit() if return_results: return scraper_search
def main(): """Runs the GoogleScraper application as determined by the various configuration points.""" global Config Config = get_config(True, True) if Config['GLOBAL'].getboolean('view_config'): from GoogleScraper.config import CONFIG_FILE print(open(CONFIG_FILE).read()) sys.exit(0) if Config['GLOBAL'].getboolean('do_caching'): d = Config['GLOBAL'].get('cachedir') if not os.path.exists(d): os.mkdir(d, 0o744) else: maybe_clean_cache() kwfile = Config['SCRAPING'].get('keyword_file') keyword = Config['SCRAPING'].get('keyword') keywords = set(Config['SCRAPING'].get('keywords', '').split('\n')) proxy_file = Config['GLOBAL'].get('proxy_file', '') proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '') if not (keyword or keywords) and not kwfile: raise ValueError('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}~') if Config['GLOBAL'].getboolean('fix_cache_names'): fix_broken_cache_names() sys.exit('renaming done. restart for normal use.') keywords = [keyword,] if keyword else keywords if kwfile: if not os.path.exists(kwfile): raise ValueError('The keyword file {} does not exist.'.format(kwfile)) else: # Clean the keywords of duplicates right in the beginning keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')]) if Config['GLOBAL'].getboolean('check_oto', False): _caching_is_one_to_one(keyword) if Config['SCRAPING'].getint('num_results_per_page') > 100: raise ValueError('Not more that 100 results per page available for Google searches.') if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) else: proxies = [] valid_search_types = ('normal', 'video', 'news', 'image') if Config['SCRAPING'].get('search_type') not in valid_search_types: ValueError('Invalid search type! Select one of {}'.format(repr(valid_search_types))) # Let the games begin if Config['SCRAPING'].get('scrapemethod', '') == 'sel': conn = maybe_create_db() # First of all, lets see how many keywords remain to scrape after parsing the cache if Config['GLOBAL'].getboolean('do_caching'): remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate')) else: remaining = keywords if Config['GLOBAL'].getboolean('simulate'): # TODO: implement simulation raise NotImplementedError('Simulating is not implemented yet!') # Create a lock to sync file access rlock = threading.RLock() # A lock to prevent multiple threads from solving captcha. 
lock = threading.Lock() max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances') if len(remaining) > max_sel_browsers: kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None) else: # thats a little special there :) kwgroups = [[kw, ] for kw in remaining] # Distribute the proxies evenly on the kws to search scrapejobs = [] Q = queue.Queue() proxies.append(None) if Config['SCRAPING'].getboolean('use_own_ip') else None if not proxies: logger.info("No ip's available for scanning.") chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies)) for i, chunk in enumerate(kwgroups): scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy])) for t in scrapejobs: t.start() handler = ResultsHandler(Q, conn) handler.start() for t in scrapejobs: t.join() # All scrape jobs done, signal the db handler to stop Q.put(Config['GLOBAL'].get('all_processed_sig')) handler.join() conn.commit() conn.close() elif Config['SCRAPING'].get('scrapemethod') == 'http': if Config['SCRAPING'].getboolean('deep_scrape', False): # TODO: implement deep scrape raise NotImplementedError('Sorry. Currently deep_scrape is not implemented.') else: results = [] for kw in keywords: r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10), num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http') results.append(r) if Config['GLOBAL'].get('print'): print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0), view=Config['HTTP'].get('view', False)) else: raise ValueError('No such scrapemethod. Use "http" or "sel"')
def main(return_results=False, parse_cmd_line=True, config_from_dict=None): """Runs the GoogleScraper application as determined by the various configuration points. The main() function encompasses the core functionality of GoogleScraper. But it shouldn't be the main() functions job to check the validity of the provided configuration. Args: return_results: When GoogleScrape is used from within another program, don't print results to stdout, store them in a database instead. parse_cmd_line: Whether to get options from the command line or not. config_from_dict: Configuration that is passed when GoogleScraper is called as library. Returns: A database session to the results when return_results is True. Else, nothing. """ external_config_file_path = cmd_line_args = None if parse_cmd_line: cmd_line_args = get_command_line() if cmd_line_args.get('config_file', None): external_config_file_path = os.path.abspath( cmd_line_args.get('config_file')) config = get_config(cmd_line_args, external_config_file_path, config_from_dict) if isinstance(config['log_level'], int): config['log_level'] = logging.getLevelName(config['log_level']) setup_logger(level=config.get('log_level').upper()) if config.get('view_config', False): print(open(os.path.join(get_base_path(), 'scrape_config.py')).read()) return if config.get('version'): from GoogleScraper.version import __version__ print(__version__) return if config.get('clean', False): try: os.remove('google_scraper.db') if sys.platform == 'linux': os.system('rm {}/*'.format(config.get('cachedir'))) except: pass return init_outfile(config, force_reload=True) kwfile = config.get('keyword_file', '') if kwfile: kwfile = os.path.abspath(kwfile) keyword = config.get('keyword') keywords = set(config.get('keywords', [])) proxy_file = config.get('proxy_file', '') proxy_db = config.get('mysql_proxy_db', '') # when no search engine is specified, use google search_engines = config.get('search_engines', [ 'google', ]) if not isinstance(search_engines, list): if search_engines == '*': search_engines = config.get('supported_search_engines') else: search_engines = search_engines.split(',') assert isinstance(search_engines, list), 'Search engines must be a list like data type!' search_engines = set(search_engines) num_search_engines = len(search_engines) num_workers = int(config.get('num_workers')) scrape_method = config.get('scrape_method') pages = int(config.get('num_pages_for_keyword', 1)) method = config.get('scrape_method', 'http') if config.get('shell', False): namespace = {} session_cls = get_session(config, scoped=False) namespace['session'] = session_cls() namespace['ScraperSearch'] = ScraperSearch namespace['SERP'] = SERP namespace['Link'] = Link namespace['Proxy'] = GoogleScraper.database.Proxy print('Available objects:') print('session - A sqlalchemy session of the results database') print('ScraperSearch - Search/Scrape job instances') print('SERP - A search engine results page') print('Link - A single link belonging to a SERP') print('Proxy - Proxies stored for scraping projects.') start_python_console(namespace) return if not (keyword or keywords) and not kwfile: # Just print the help. get_command_line(True) print( 'No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and ' 'keyword with --keyword.') return cache_manager = CacheManager(config) if config.get('fix_cache_names'): cache_manager.fix_broken_cache_names() logger.info('renaming done. 
restart for normal use.') return keywords = [ keyword, ] if keyword else keywords scrape_jobs = {} if kwfile: if not os.path.exists(kwfile): raise WrongConfigurationError( 'The keyword file {} does not exist.'.format(kwfile)) else: if kwfile.endswith('.py'): # we need to import the variable "scrape_jobs" from the module. sys.path.append(os.path.dirname(kwfile)) try: modname = os.path.split(kwfile)[-1].rstrip('.py') scrape_jobs = getattr( __import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs') except ImportError as e: logger.warning(e) else: # Clean the keywords of duplicates right in the beginning keywords = set([ line.strip() for line in open(kwfile, 'r').read().split('\n') if line.strip() ]) if not scrape_jobs: scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages) scrape_jobs = list(scrape_jobs) if config.get('clean_cache_files', False): cache_manager.clean_cachefiles() return if config.get('check_oto', False): cache_manager._caching_is_one_to_one(keyword) if config.get('num_results_per_page') > 100: raise WrongConfigurationError( 'Not more that 100 results per page available for searches.') proxies = [] if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) if config.get('use_own_ip'): proxies.append(None) if not proxies: raise Exception( 'No proxies available and using own IP is prohibited by configuration. Turning down.' ) valid_search_types = ('normal', 'video', 'news', 'image') if config.get('search_type') not in valid_search_types: raise WrongConfigurationError( 'Invalid search type! Select one of {}'.format( repr(valid_search_types))) if config.get('simulate', False): print('*' * 60 + 'SIMULATION' + '*' * 60) logger.info( 'If GoogleScraper would have been run without the --simulate flag, it would have:' ) logger.info( 'Scraped for {} keywords, with {} results a page, in total {} pages for each keyword' .format(len(keywords), int(config.get('num_results_per_page', 0)), int(config.get('num_pages_for_keyword')))) if None in proxies: logger.info('Also using own ip address to scrape.') else: logger.info('Not scraping with own ip address.') logger.info('Used {} unique ip addresses in total'.format( len(proxies))) if proxies: logger.info('The following proxies are used: \n\t\t{}'.format( '\n\t\t'.join([ proxy.host + ':' + proxy.port for proxy in proxies if proxy ]))) logger.info('By using {} mode with {} worker instances'.format( config.get('scrape_method'), int(config.get('num_workers')))) return # get a scoped sqlalchemy session session_cls = get_session(config, scoped=False) session = session_cls() # add fixtures fixtures(config, session) # add proxies to the database add_proxies_to_db(proxies, session) # ask the user to continue the last scrape. We detect a continuation of a # previously established scrape, if the keyword-file is the same and unmodified since # the beginning of the last scrape. scraper_search = None if kwfile and config.get('continue_last_scrape', False): searches = session.query(ScraperSearch). \ filter(ScraperSearch.keyword_file == kwfile). \ order_by(ScraperSearch.started_searching). 
\ all() if searches: last_search = searches[-1] last_modified = datetime.datetime.utcfromtimestamp( os.path.getmtime(last_search.keyword_file)) # if the last modification is older then the starting of the search if last_modified < last_search.started_searching: scraper_search = last_search logger.info('Continuing last scrape.') if not scraper_search: scraper_search = ScraperSearch( keyword_file=kwfile, number_search_engines_used=num_search_engines, number_proxies_used=len(proxies), number_search_queries=len(keywords), started_searching=datetime.datetime.utcnow(), used_search_engines=','.join(search_engines)) # First of all, lets see how many requests remain to issue after searching the cache. if config.get('do_caching'): scrape_jobs = cache_manager.parse_all_cached_files( scrape_jobs, session, scraper_search) if scrape_jobs: # Create a lock to synchronize database access in the sqlalchemy session db_lock = threading.Lock() # create a lock to cache results cache_lock = threading.Lock() # A lock to prevent multiple threads from solving captcha, used in selenium instances. captcha_lock = threading.Lock() logger.info( 'Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.' .format(num_keywords=len(list(scrape_jobs)), num_proxies=len(proxies), num_threads=num_search_engines)) progress_thread = None # Let the games begin if method in ('selenium', 'http'): # Show the progress of the scraping q = queue.Queue() progress_thread = ShowProgressQueue(config, q, len(scrape_jobs)) progress_thread.start() workers = queue.Queue() num_worker = 0 for search_engine in search_engines: for proxy in proxies: for worker in range(num_workers): num_worker += 1 workers.put( ScrapeWorkerFactory(config, cache_manager=cache_manager, mode=method, proxy=proxy, search_engine=search_engine, session=session, db_lock=db_lock, cache_lock=cache_lock, scraper_search=scraper_search, captcha_lock=captcha_lock, progress_queue=q, browser_num=num_worker)) # here we look for suitable workers # for all jobs created. for job in scrape_jobs: while True: worker = workers.get() workers.put(worker) if worker.is_suitabe(job): worker.add_job(job) break threads = [] while not workers.empty(): worker = workers.get() thread = worker.get_worker() if thread: threads.append(thread) for t in threads: t.start() for t in threads: t.join() # after threads are done, stop the progress queue. q.put('done') progress_thread.join() elif method == 'http-async': scheduler = AsyncScrapeScheduler(config, scrape_jobs, cache_manager=cache_manager, session=session, scraper_search=scraper_search, db_lock=db_lock) scheduler.run() else: raise Exception('No such scrape_method {}'.format( config.get('scrape_method'))) from GoogleScraper.output_converter import close_outfile close_outfile() scraper_search.stopped_searching = datetime.datetime.utcnow() session.add(scraper_search) session.commit() if return_results: return scraper_search
def main(return_results=False, parse_cmd_line=True): """Runs the GoogleScraper application as determined by the various configuration points. The main() function encompasses the core functionality of GoogleScraper. But it shouldn't be the main() functions job to check the validity of the provided configuration. Args: return_results: When GoogleScrape is used from within another program, don't print results to stdout, store them in a database instead. parse_cmd_line: Whether to get options from the command line or not. Returns: A database session to the results when return_results is True """ if parse_cmd_line: parse_cmd_args() # If the configuration file to use is explicitly specified, update the current configuration # with it. if Config['GLOBAL'].get('config_file', None): update_config_with_file(Config['GLOBAL'].get('config_file', None)) if Config['GLOBAL'].getboolean('view_config'): from GoogleScraper.config import CONFIG_FILE print(open(CONFIG_FILE).read()) return if Config['GLOBAL'].getboolean('version'): from GoogleScraper.version import __version__ print(__version__) return kwfile = Config['SCRAPING'].get('keyword_file', '') keyword = Config['SCRAPING'].get('keyword') keywords = { keyword for keyword in set(Config['SCRAPING'].get('keywords', []).split('\n')) if keyword } proxy_file = Config['GLOBAL'].get('proxy_file', '') proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '') if Config['GLOBAL'].getboolean('shell', False): namespace = {} Session = get_session(scoped=False, create=False) namespace['session'] = Session() namespace['ScraperSearch'] = ScraperSearch namespace['SERP'] = SERP namespace['Link'] = Link print('Available objects:') print('session - A sqlalchemy session of the results database') print('ScraperSearch - Search/Scrape job instances') print('SERP - A search engine results page') print('Link - A single link belonging to a SERP') start_python_console(namespace) return if not (keyword or keywords) and not kwfile: logger.error( 'No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and keyword with --keyword.' ) get_command_line(False, True) return if Config['GLOBAL'].getboolean('fix_cache_names'): fix_broken_cache_names() logger.info('renaming done. restart for normal use.') return keywords = [ keyword, ] if keyword else keywords if kwfile: if not os.path.exists(kwfile): raise InvalidConfigurationException( 'The keyword file {} does not exist.'.format(kwfile)) else: # Clean the keywords of duplicates right in the beginning keywords = set([ line.strip() for line in open(kwfile, 'r').read().split('\n') ]) search_engines = list({ search_engine for search_engine in Config['SCRAPING'].get( 'search_engines', 'google').split(',') if search_engine }) assert search_engines, 'No search engine specified' if Config['GLOBAL'].getboolean('clean_cache_files', False): clean_cachefiles() return if Config['GLOBAL'].getboolean('check_oto', False): _caching_is_one_to_one(keyword) if Config['SCRAPING'].getint('num_results_per_page') > 100: raise InvalidConfigurationException( 'Not more that 100 results per page available for searches.') proxies = [] if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) if Config['SCRAPING'].getboolean('use_own_ip'): proxies.append(None) if not proxies: raise InvalidConfigurationException( "No proxies available and using own IP is prohibited by configuration. Turning down." 
) valid_search_types = ('normal', 'video', 'news', 'image') if Config['SCRAPING'].get('search_type') not in valid_search_types: InvalidConfigurationException( 'Invalid search type! Select one of {}'.format( repr(valid_search_types))) if Config['GLOBAL'].getboolean('simulate', False): print('*' * 60 + 'SIMULATION' + '*' * 60) logger.info( 'If GoogleScraper would have been run without the --simulate flag, it would have:' ) logger.info( 'Scraped for {} keywords, with {} results a page, in total {} pages for each keyword' .format(len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0), Config['SCRAPING'].getint('num_pages_for_keyword'))) if None in proxies: logger.info('Also using own ip address to scrape.') else: logger.info('Not scraping with own ip address.') logger.info('Used {} unique ip addresses in total'.format( len(proxies))) if proxies: logger.info('The following proxies are used: \n\t\t{}'.format( '\n\t\t'.join([ proxy.host + ':' + proxy.port for proxy in proxies if proxy ]))) logger.info('By using {} mode with {} worker instances'.format( Config['SCRAPING'].get('scrapemethod'), Config['SCRAPING'].getint('num_workers'))) return # get a scoped sqlalchemy session Session = get_session(scoped=False, create=True) session = Session() # ask the user to continue the last scrape. We detect a continuation of a # previously established scrape, if the keyword-file is the same and unmodified since # the beginning of the last scrape. scraper_search = None if kwfile: searches = session.query(ScraperSearch).\ filter(ScraperSearch.keyword_file == kwfile).\ order_by(ScraperSearch.started_searching).\ all() if searches: last_search = searches[-1] last_modified = datetime.datetime.fromtimestamp( os.path.getmtime(last_search.keyword_file)) # if the last modification is older then the starting of the search if last_modified < last_search.started_searching: scraper_search = last_search logger.info('Continuing last scrape.') if not scraper_search: scraper_search = ScraperSearch( keyword_file=kwfile, number_search_engines_used=1, number_proxies_used=len(proxies), number_search_queries=len(keywords), started_searching=datetime.datetime.utcnow(), used_search_engines=','.join(search_engines)) # First of all, lets see how many keywords remain to scrape after parsing the cache if Config['GLOBAL'].getboolean('do_caching'): remaining = parse_all_cached_files(keywords, search_engines, session, scraper_search) else: remaining = keywords # remove duplicates and empty keywords remaining = [keyword for keyword in set(remaining) if keyword] if remaining: kwgroups = assign_keywords_to_scrapers(remaining) # Create a lock to synchronize database access in the sqlalchemy session db_lock = threading.Lock() # create a lock to cache results cache_lock = threading.Lock() # final check before going into the loop num_workers_to_allocate = len(kwgroups) * len( search_engines) > Config['SCRAPING'].getint('maximum_workers') if (len(kwgroups) * len(search_engines) ) > Config['SCRAPING'].getint('maximum_workers'): logger.error('Too many workers: {} , might crash the app'.format( num_workers_to_allocate)) out('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.' 
.format(num_keywords=len(remaining), num_proxies=len(proxies), num_threads=Config['SCRAPING'].getint('num_workers', 1)), lvl=1) # Show the progress of the scraping q = queue.Queue() progress_thread = ShowProgressQueue(q, len(remaining)) progress_thread.start() # Let the games begin if Config['SCRAPING'].get('scrapemethod') in ('selenium', 'http'): # A lock to prevent multiple threads from solving captcha. captcha_lock = threading.Lock() # Distribute the proxies evenly on the keywords to search for scrapejobs = [] for k, search_engine in enumerate(search_engines): for i, keyword_group in enumerate(kwgroups): proxy_to_use = proxies[i % len(proxies)] if Config['SCRAPING'].get('scrapemethod', 'http') == 'selenium': scrapejobs.append( SelScrape( search_engine=search_engine, session=session, keywords=keyword_group, db_lock=db_lock, cache_lock=cache_lock, scraper_search=scraper_search, captcha_lock=captcha_lock, browser_num=i, proxy=proxy_to_use, progress_queue=q, )) elif Config['SCRAPING'].get('scrapemethod') == 'http': scrapejobs.append( HttpScrape( search_engine=search_engine, keywords=keyword_group, session=session, scraper_search=scraper_search, cache_lock=cache_lock, db_lock=db_lock, proxy=proxy_to_use, progress_queue=q, )) for t in scrapejobs: t.start() for t in scrapejobs: t.join() elif Config['SCRAPING'].get('scrapemethod') == 'http-async': raise NotImplemented('soon my dear friends :)') else: raise InvalidConfigurationException( 'No such scrapemethod. Use "http" or "sel"') scraper_search.stopped_searching = datetime.datetime.utcnow() session.add(scraper_search) session.commit() progress_thread.join() if return_results: return session
def main(return_results=False, parse_cmd_line=True, config_from_dict=None): """Runs the GoogleScraper application as determined by the various configuration points. The main() function encompasses the core functionality of GoogleScraper. But it shouldn't be the main() functions job to check the validity of the provided configuration. Args: return_results: When GoogleScrape is used from within another program, don't print results to stdout, store them in a database instead. parse_cmd_line: Whether to get options from the command line or not. config_from_dict: Configuration that is passed when GoogleScraper is called as library. Returns: A database session to the results when return_results is True. Else, nothing. """ external_config_file_path = cmd_line_args = None if parse_cmd_line: cmd_line_args = get_command_line() if cmd_line_args.get('config_file', None): external_config_file_path = os.path.abspath( cmd_line_args.get('config_file')) config = get_config(cmd_line_args, external_config_file_path, config_from_dict) keywords = config.get('keywords') kwfile = config.get('keyword_file', None) if isinstance(config['log_level'], int): config['log_level'] = logging.getLevelName(config['log_level']) setup_logger(level=config.get('log_level').upper()) if config.get('view_config', False): print(open(os.path.join(get_base_path(), 'scrape_config.py')).read()) return if config.get('version'): from GoogleScraper.version import __version__ print(__version__) return if config.get('clean', False): try: os.remove('google_scraper.db') if sys.platform == 'linux': os.system('rm {}/*'.format(config.get('cachedir'))) except: pass return init_outfile(config, force_reload=True) # in output_converter.py proxy_file = config.get('proxy_file', '') proxy_db = config.get('mysql_proxy_db', '') setup_shell_config(config) search_engines = get_search_engines( config.get('search_engines', ['google']), config.get('supported_search_engines')) num_search_engines = len(search_engines) num_workers = int(config.get('num_workers')) scrape_method = config.get('scrape_method') pages = int(config.get('num_pages_for_keyword', 1)) method = config.get('scrape_method', 'http') cache_manager = CacheManager(config) if config.get('fix_cache_names'): cache_manager.fix_broken_cache_names() logger.info('renaming done. restart for normal use.') return scrape_jobs = {} if not scrape_jobs: scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages) scrape_jobs = list(scrape_jobs) if config.get('clean_cache_files', False): cache_manager.clean_cachefiles() return if config.get('check_oto', False): cache_manager._caching_is_one_to_one(keyword) if config.get('num_results_per_page') > 100: raise WrongConfigurationError( 'Not more that 100 results per page available for searches.') proxies = [] if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) if config.get('use_own_ip'): proxies.append(None) if not proxies: raise Exception( 'No proxies available and using own IP is prohibited by configuration. Turning down.' ) if config.get('search_type') not in VALID_SEARCH_TYPES: raise WrongConfigurationError( 'Invalid search type! 
Select one of {}'.format( repr(VALID_SEARCH_TYPES))) if config.get('simulate', False): run_simulation(config.get('num_results_per_page', 0), config.get('num_pages_for_keyword'), config.get('num_workers')) # get a scoped sqlalchemy session session_cls = get_session(config, scoped=False) session = session_cls() # add fixtures fixtures(config, session) # add proxies to the database add_proxies_to_db(proxies, session) # ask the user to continue the last scrape. We detect a continuation of a # previously established scrape, if the keyword-file is the same and unmodified since # the beginning of the last scrape. scraper_search = None if kwfile and config.get('continue_last_scrape', False): searches = session.query(ScraperSearch). \ filter(ScraperSearch.keyword_file == kwfile). \ order_by(ScraperSearch.started_searching). \ all() if searches: last_search = searches[-1] last_modified = datetime.datetime.utcfromtimestamp( os.path.getmtime(last_search.keyword_file)) # if the last modification is older then the starting of the search if last_modified < last_search.started_searching: scraper_search = last_search logger.info('Continuing last scrape.') if not scraper_search: scraper_search = ScraperSearch( keyword_file=kwfile, number_search_engines_used=num_search_engines, number_proxies_used=len(proxies), number_search_queries=len(keywords), started_searching=datetime.datetime.utcnow(), used_search_engines=','.join(search_engines)) # First of all, lets see how many requests remain to issue after searching the cache. if config.get('do_caching'): scrape_jobs = cache_manager.parse_all_cached_files( scrape_jobs, session, scraper_search) if scrape_jobs: # Create a lock to synchronize database access in the sqlalchemy session db_lock = threading.Lock() # create a lock to cache results cache_lock = threading.Lock() # A lock to prevent multiple threads from solving captcha, used in selenium instances. captcha_lock = threading.Lock() logger.info( 'Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.' .format(num_keywords=len(list(scrape_jobs)), num_proxies=len(proxies), num_threads=num_search_engines)) progress_thread = None # Let the games begin if method in ('selenium', 'http'): # Show the progress of the scraping q = queue.Queue() progress_thread = ShowProgressQueue(config, q, len(scrape_jobs)) progress_thread.start() workers = queue.Queue() num_worker = 0 for search_engine in search_engines: for proxy in proxies: for worker in range(num_workers): num_worker += 1 workers.put( ScrapeWorkerFactory(config, cache_manager=cache_manager, mode=method, proxy=proxy, search_engine=search_engine, session=session, db_lock=db_lock, cache_lock=cache_lock, scraper_search=scraper_search, captcha_lock=captcha_lock, progress_queue=q, browser_num=num_worker)) # here we look for suitable workers # for all jobs created. for job in scrape_jobs: while True: worker = workers.get() workers.put(worker) if worker.is_suitabe(job): worker.add_job(job) break threads = [] while not workers.empty(): worker = workers.get() thread = worker.get_worker() if thread: threads.append(thread) for t in threads: t.start() for t in threads: t.join() # after threads are done, stop the progress queue. 
progress_thread.join() elif method == 'http-async': scheduler = AsyncScrapeScheduler(config, scrape_jobs, cache_manager=cache_manager, session=session, scraper_search=scraper_search, db_lock=db_lock) scheduler.run() else: raise Exception('No such scrape_method {}'.format( config.get('scrape_method'))) from GoogleScraper.output_converter import close_outfile close_outfile() scraper_search.stopped_searching = datetime.datetime.utcnow() session.add(scraper_search) session.commit() if return_results: return scraper_search
def main(return_results=True): """Runs the GoogleScraper application as determined by the various configuration points. The main() function encompasses the core functionality of GoogleScraper. But it shouldn't be the main() functions job to check the validity of the provided configuration. Args: return_results: When GoogleScrape is used from within another program, don't print results to stdout, store them in a database instead. Returns: A database connection to the results when return_results is True """ parse_cmd_args() if Config['GLOBAL'].getboolean('view_config'): from GoogleScraper.config import CONFIG_FILE print(open(CONFIG_FILE).read()) return if Config['GLOBAL'].getboolean('do_caching'): d = Config['GLOBAL'].get('cachedir') if not os.path.exists(d): os.mkdir(d, 0o744) else: maybe_clean_cache() kwfile = Config['SCRAPING'].get('keyword_file') keyword = Config['SCRAPING'].get('keyword') keywords = set(Config['SCRAPING'].get('keywords', '').split('\n')) proxy_file = Config['GLOBAL'].get('proxy_file', '') proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '') if not (keyword or keywords) and not kwfile: raise InvalidConfigurationException( 'You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}~' ) if Config['GLOBAL'].getboolean('fix_cache_names'): fix_broken_cache_names() logger.info('renaming done. restart for normal use.') return keywords = [ keyword, ] if keyword else keywords if kwfile: if not os.path.exists(kwfile): raise InvalidConfigurationException( 'The keyword file {} does not exist.'.format(kwfile)) else: # Clean the keywords of duplicates right in the beginning keywords = set([ line.strip() for line in open(kwfile, 'r').read().split('\n') ]) if Config['GLOBAL'].getboolean('check_oto', False): _caching_is_one_to_one(keyword) if Config['SCRAPING'].getint('num_results_per_page') > 100: raise InvalidConfigurationException( 'Not more that 100 results per page available for Google searches.' ) proxies = [] if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) valid_search_types = ('normal', 'video', 'news', 'image') if Config['SCRAPING'].get('search_type') not in valid_search_types: InvalidConfigurationException( 'Invalid search type! 
Select one of {}'.format( repr(valid_search_types))) # Create a sqlite3 database to store the results conn = maybe_create_db() if Config['GLOBAL'].getboolean('simulate'): print('*' * 60 + 'SIMULATION' + '*' * 60) logger.info( 'If GoogleScraper would have been run without the --simulate flag, it would have' ) logger.info( 'Scraped for {} keywords (before caching), with {} results a page, in total {} pages for each keyword' .format(len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0), Config['SCRAPING'].getint('num_pages_for_keyword'))) logger.info( 'Used {} distinct proxies in total, with the following proxies: {}' .format(len(proxies), '\t\t\n'.join(proxies))) if Config['SCRAPING'].get('scrapemethod') == 'sel': mode = 'selenium mode with {} browser instances'.format( Config['SELENIUM'].getint('num_browser_instances')) else: mode = 'http mode' logger.info('By using scrapemethod: {}'.format(mode)) return # First of all, lets see how many keywords remain to scrape after parsing the cache if Config['GLOBAL'].getboolean('do_caching'): remaining = parse_all_cached_files( keywords, conn, url=Config['SELENIUM'].get('sel_scraper_base_url')) else: remaining = keywords kwgroups = assign_keywords_to_scrapers(remaining) # Let the games begin if Config['SCRAPING'].get('scrapemethod', 'http') == 'sel': # Create a lock to sync file access rlock = threading.RLock() # A lock to prevent multiple threads from solving captcha. lock = threading.Lock() # Distribute the proxies evenly on the keywords to search for scrapejobs = [] Q = queue.Queue() if Config['SCRAPING'].getboolean('use_own_ip'): proxies.append(None) elif not proxies: raise InvalidConfigurationException( "No proxies available and using own IP is prohibited by configuration. Turning down." ) chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies)) for i, chunk in enumerate(kwgroups): scrapejobs.append( SelScrape(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i // chunks_per_proxy])) for t in scrapejobs: t.start() handler = ResultsHandler(Q, conn) handler.start() for t in scrapejobs: t.join() # All scrape jobs done, signal the db handler to stop Q.put(Config['GLOBAL'].get('all_processed_sig')) handler.join() conn.commit() if return_results: return conn else: conn.close() elif Config['SCRAPING'].get('scrapemethod') == 'http': threads = [] for group in kwgroups: threads.append(HttpScrape(keywords=group)) for thread in threads: thread.start() for thread in threads: thread.join() elif Config['SCRAPING'].get('scrapemethod') == 'http_async': pass else: raise InvalidConfigurationException( 'No such scrapemethod. Use "http" or "sel"')
def main(return_results=True, force_reload=False, proxies=[]): """Runs the GoogleScraper application as determined by the various configuration points. Keyword arguments: return_results -- Whether the GoogleScraper application is run programmatically. Will return all scraped results. """ parse_cmd_args() if Config['GLOBAL'].getboolean('view_config'): from GoogleScraper.config import CONFIG_FILE print(open(CONFIG_FILE).read()) sys.exit(0) if Config['GLOBAL'].getboolean('do_caching'): d = Config['GLOBAL'].get('cachedir') if not os.path.exists(d): os.mkdir(d, 0o744) else: maybe_clean_cache() kwfile = Config['SCRAPING'].get('keyword_file') keyword = Config['SCRAPING'].get('keyword') keywords = set(Config['SCRAPING'].get('keywords', '').split('\n')) proxy_file = Config['GLOBAL'].get('proxy_file', '') proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '') if not (keyword or keywords) and not kwfile: raise InvalidConfigurationException('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}~') if Config['GLOBAL'].getboolean('fix_cache_names'): fix_broken_cache_names() sys.exit('renaming done. restart for normal use.') keywords = [keyword, ] if keyword else keywords if kwfile: if not os.path.exists(kwfile): raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile)) else: # Clean the keywords of duplicates right in the beginning keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')]) if Config['GLOBAL'].getboolean('check_oto', False): _caching_is_one_to_one(keyword) if Config['SCRAPING'].getint('num_results_per_page') > 100: raise InvalidConfigurationException('Not more that 100 results per page available for Google searches.') if not proxies: # look for proxies in mysql database or a proxy file if not given as keyword argument if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) valid_search_types = ('normal', 'video', 'news', 'image') if Config['SCRAPING'].get('search_type') not in valid_search_types: InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types))) # Create a sqlite database to store the results conn = maybe_create_db() if Config['GLOBAL'].getboolean('simulate'): print('*' * 60 + 'SIMULATION' + '*' * 60) logger.info('If GoogleScraper would have been run without the --simulate flag, it would have') logger.info('Scraped for {} keywords (before caching), with {} results a page, in total {} pages for each keyword'.format( len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0), Config['SCRAPING'].getint('num_of_pages'))) logger.info('Used {} distinct proxies in total, with the following ip addresses: {}'.format( len(proxies), '\t\t\n'.join(proxies) )) if Config['SCRAPING'].get('scrapemethod') == 'sel': mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances')) else: mode = 'http mode' logger.info('By using {}'.format(mode)) sys.exit(0) # Let the games begin if Config['SCRAPING'].get('scrapemethod', '') == 'sel': # First of all, lets see how many keywords remain to scrape after parsing the cache if Config['GLOBAL'].getboolean('do_caching'): remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate')) else: remaining = keywords # Create a lock to sync file access rlock = threading.RLock() # A lock to prevent multiple threads from solving captcha. 
lock = threading.Lock() max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances') if len(remaining) > max_sel_browsers: kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None) else: # thats a little special there :) kwgroups = [[kw, ] for kw in remaining] # Distribute the proxies evenly on the keywords to search for scrapejobs = [] Q = queue.Queue() if Config['SCRAPING'].getboolean('use_own_ip'): proxies.append(None) elif not proxies: raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.") chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies)) for i, chunk in enumerate(kwgroups): scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy])) for t in scrapejobs: t.start() handler = ResultsHandler(Q, conn) handler.start() for t in scrapejobs: t.join() # All scrape jobs done, signal the db handler to stop Q.put(Config['GLOBAL'].get('all_processed_sig')) handler.join() conn.commit() if return_results: return conn else: conn.close() elif Config['SCRAPING'].get('scrapemethod') == 'http': results = [] cursor = conn.cursor() if Config['SCRAPING'].getboolean('deep_scrape', False): # TODO: implement deep scrape raise NotImplementedError('Sorry. Currently deep scrape is not implemented.') else: for i, kw in enumerate(keywords): r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10), num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http') if r: cursor.execute('INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)', (i, datetime.datetime.utcnow(), 0, 0, kw)) serp_id = cursor.lastrowid for result in r: for result_set in ('results', 'ads_main', 'ads_aside'): if result_set in result.keys(): for title, snippet, url, pos in result[result_set]: cursor.execute('INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)', (title, snippet, url.geturl(), url.netloc, pos, serp_id)) results.append(r) cursor.close() if Config['GLOBAL'].get('print'): print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0)) return conn else: raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')
def main(return_results=True): """Runs the GoogleScraper application as determined by the various configuration points. The main() function encompasses the core functionality of GoogleScraper. But it shouldn't be the main() functions job to check the validity of the provided configuration. Args: return_results: When GoogleScrape is used from within another program, don't print results to stdout, store them in a database instead. Returns: A database connection to the results when return_results is True """ parse_cmd_args() if Config['GLOBAL'].getboolean('view_config'): from GoogleScraper.config import CONFIG_FILE print(open(CONFIG_FILE).read()) return maybe_clean_cache() kwfile = Config['SCRAPING'].get('keyword_file') keyword = Config['SCRAPING'].get('keyword') keywords = set(Config['SCRAPING'].get('keywords', '').split('\n')) proxy_file = Config['GLOBAL'].get('proxy_file', '') proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '') if not (keyword or keywords) and not kwfile: raise InvalidConfigurationException('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}~') if Config['GLOBAL'].getboolean('fix_cache_names'): fix_broken_cache_names() logger.info('renaming done. restart for normal use.') return keywords = [keyword, ] if keyword else keywords if kwfile: if not os.path.exists(kwfile): raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile)) else: # Clean the keywords of duplicates right in the beginning keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')]) if Config['GLOBAL'].getboolean('check_oto', False): _caching_is_one_to_one(keyword) if Config['SCRAPING'].getint('num_results_per_page') > 100: raise InvalidConfigurationException('Not more that 100 results per page available for searches.') proxies = [] if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) valid_search_types = ('normal', 'video', 'news', 'image') if Config['SCRAPING'].get('search_type') not in valid_search_types: InvalidConfigurationException('Invalid search type! 
Select one of {}'.format(repr(valid_search_types))) if Config['GLOBAL'].getboolean('simulate', False): print('*' * 60 + 'SIMULATION' + '*' * 60) logger.info('If GoogleScraper would have been run without the --simulate flag, it would have:') logger.info('Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'.format( len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0), Config['SCRAPING'].getint('num_pages_for_keyword'))) parse_all_cached_files(keywords, None) logger.info('Used {} distinct proxies in total'.format(len(proxies))) if proxies: logger.info('The following proxies are used: {}'.format('\t\t\n'.join(proxies))) if Config['SCRAPING'].get('scrapemethod') == 'sel': mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances')) else: mode = 'http mode' logger.info('By using scrapemethod: {}'.format(mode)) return if Config['GLOBAL'].getboolean('shell', False): namespace = {} namespace['session'] = get_session(scoped=False, create=False) namespace['ScraperSearch'] = ScraperSearch namespace['SERP'] = SERP namespace['Link'] = Link print('Available objects:') print('session - A sqlalchemy session of the results database') print('ScraperSearch - Search/Scrape job instances') print('SERP - A search engine results page') print('Link - A single link belonging to a SERP') start_python_console(namespace) return # get a scoped sqlalchemy session session = get_session(scoped=True, create=True) scraper_search = ScraperSearch( number_search_engines_used=1, number_proxies_used=len(proxies), number_search_queries=len(keywords), started_searching=datetime.datetime.utcnow() ) # First of all, lets see how many keywords remain to scrape after parsing the cache if Config['GLOBAL'].getboolean('do_caching'): remaining = parse_all_cached_files(keywords, session, scraper_search) else: remaining = keywords kwgroups = assign_keywords_to_scrapers(remaining) # Create a lock to synchronize database access in the sqlalchemy session db_lock = Lock() # create a lock to cache results cache_lock = Lock() # Let the games begin if Config['SCRAPING'].get('scrapemethod', 'http') == 'sel': # A lock to prevent multiple threads from solving captcha. lock = threading.Lock() # Distribute the proxies evenly on the keywords to search for scrapejobs = [] if Config['SCRAPING'].getboolean('use_own_ip'): proxies.append(None) elif not proxies: raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.") chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies)) for i, keyword_group in enumerate(kwgroups): scrapejobs.append( SelScrape( keywords=keyword_group, db_lock=db_lock, cache_lock=cache_lock, session=session, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy] ) ) for t in scrapejobs: t.start() for t in scrapejobs: t.join() elif Config['SCRAPING'].get('scrapemethod') == 'http': threads = [] for group in kwgroups: threads.append( HttpScrape( keywords=group, session=session, cache_lock=cache_lock, db_lock=db_lock ) ) for thread in threads: thread.start() for thread in threads: thread.join() elif Config['SCRAPING'].get('scrapemethod') == 'http_async': raise NotImplemented('soon my dead friends :)') else: raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"') scraper_search.stopped_searching = datetime.datetime.utcnow() session.add(scraper_search) session.commit()
def main(return_results=False, parse_cmd_line=True):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it shouldn't
    be the main() function's job to check the validity of the provided configuration.

    Args:
        return_results: When GoogleScrape is used from within another program, don't print results to stdout,
            store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.

    Returns:
        A database session to the results when return_results is True
    """
    if parse_cmd_line:
        parse_cmd_args()

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        return

    if Config['GLOBAL'].getboolean('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = {keyword for keyword in set(Config['SCRAPING'].get('keywords', '').split('\n')) if keyword}
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if Config['GLOBAL'].getboolean('shell', False):
        namespace = {}
        Session = get_session(scoped=False, create=False)
        namespace['session'] = Session()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        logger.error('No keywords to scrape for. Please provide either a keyword file (Option: --keyword-file) or specify a keyword with --keyword.')
        return

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [keyword, ] if keyword else keywords

    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    search_engines = list({search_engine for search_engine in Config['SCRAPING'].get('search_engines', 'google').split(',') if search_engine})
    assert search_engines, 'No search engine specified'

    if Config['GLOBAL'].getboolean('clean_cache_files', False):
        clean_cachefiles()
        return

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for searches.')

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if Config['SCRAPING'].getboolean('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.")

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    if Config['GLOBAL'].getboolean('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper would have been run without the --simulate flag, it would have:')
        logger.info('Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_pages_for_keyword')))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([proxy.host + ':' + proxy.port for proxy in proxies if proxy])))
        logger.info('By using {} mode with {} worker instances'.format(
            Config['SCRAPING'].get('scrapemethod'),
            Config['SCRAPING'].getint('num_workers')))
        return

    # get a scoped sqlalchemy session
    Session = get_session(scoped=False, create=True)
    session = Session()

    scraper_search = ScraperSearch(
        number_search_engines_used=1,
        number_proxies_used=len(proxies),
        number_search_queries=len(keywords),
        started_searching=datetime.datetime.utcnow()
    )

    # First of all, let's see how many keywords remain to scrape after parsing the cache
    if Config['GLOBAL'].getboolean('do_caching'):
        remaining = parse_all_cached_files(keywords, search_engines, session, scraper_search)
    else:
        remaining = keywords

    # remove duplicates and empty keywords
    remaining = [keyword for keyword in set(remaining) if keyword]

    kwgroups = assign_keywords_to_scrapers(remaining)

    # Create a lock to synchronize database access in the sqlalchemy session
    db_lock = threading.Lock()

    # Create a lock to cache results
    cache_lock = threading.Lock()

    # Final check before going into the loop: warn if more workers would be
    # allocated than the configured maximum.
    num_workers_to_allocate = len(kwgroups) * len(search_engines)
    if num_workers_to_allocate > Config['SCRAPING'].getint('maximum_workers'):
        logger.error('Too many workers: {}, might crash the app'.format(num_workers_to_allocate))

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod') in ('selenium', 'http'):
        # A lock to prevent multiple threads from solving captcha.
        captcha_lock = threading.Lock()

        # Distribute the proxies evenly over the keywords to search for
        scrapejobs = []

        for search_engine in search_engines:
            for i, keyword_group in enumerate(kwgroups):
                proxy_to_use = proxies[i % len(proxies)]
                if Config['SCRAPING'].get('scrapemethod', 'http') == 'selenium':
                    scrapejobs.append(
                        SelScrape(
                            search_engine=search_engine,
                            session=session,
                            keywords=keyword_group,
                            db_lock=db_lock,
                            cache_lock=cache_lock,
                            scraper_search=scraper_search,
                            captcha_lock=captcha_lock,
                            browser_num=i,
                            proxy=proxy_to_use
                        )
                    )
                elif Config['SCRAPING'].get('scrapemethod') == 'http':
                    scrapejobs.append(
                        HttpScrape(
                            search_engine=search_engine,
                            keywords=keyword_group,
                            session=session,
                            scraper_search=scraper_search,
                            cache_lock=cache_lock,
                            db_lock=db_lock,
                            proxy=proxy_to_use
                        )
                    )

        for t in scrapejobs:
            t.start()

        for t in scrapejobs:
            t.join()

    elif Config['SCRAPING'].get('scrapemethod') == 'http-async':
        # NotImplemented is not an exception; raise NotImplementedError instead.
        raise NotImplementedError('soon my dear friends :)')

    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "selenium"')

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return session

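
# Illustrative sketch, not part of the original module: one way the version of main()
# above could be driven from another program, as its docstring describes. With
# parse_cmd_line=False the global Config must already be populated elsewhere
# (keyword or keyword_file, scrapemethod, ...). run_scrape_and_report is a
# hypothetical helper name; the queried attributes are the ScraperSearch columns
# that main() itself sets above.
def run_scrape_and_report():
    session = main(return_results=True, parse_cmd_line=False)
    if session is None:
        return
    for search in session.query(ScraperSearch).all():
        print(search.started_searching, search.number_search_queries)
    print('SERPs stored:', session.query(SERP).count())
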
def main(return_results=False, parse_cmd_line=True):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it shouldn't
    be the main() function's job to check the validity of the provided configuration.

    Args:
        return_results: When GoogleScrape is used from within another program, don't print results to stdout,
            store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.

    Returns:
        A database session to the results when return_results is True
    """
    if parse_cmd_line:
        parse_cmd_args()

    # If the configuration file to use is explicitly specified, update the current
    # configuration with it.
    if Config['GLOBAL'].get('config_file', None):
        update_config_with_file(Config['GLOBAL'].get('config_file', None))

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        return

    if Config['GLOBAL'].getboolean('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    if Config['GLOBAL'].getboolean('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(Config['GLOBAL'].get('cachedir')))
        except:
            pass
        return

    init_outfile(force_reload=True)

    kwfile = Config['SCRAPING'].get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = Config['SCRAPING'].get('keyword')
    keywords = {re.sub(' +', ' ', re.sub('[^\x00-\x7F]+', ' ', keyword.lower())).strip()
                for keyword in set(Config['SCRAPING'].get('keywords', '').split('\n')) if keyword}
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    se = Config['SCRAPING'].get('search_engines', 'google')
    if se.strip() == '*':
        se = Config['SCRAPING'].get('supported_search_engines', 'google')

    search_engines = list({search_engine.strip() for search_engine in se.split(',') if search_engine.strip()})
    assert search_engines, 'No search engine specified'
    num_search_engines = len(search_engines)
    num_workers = Config['SCRAPING'].getint('num_workers')
    scrape_method = Config['SCRAPING'].get('scrape_method')
    pages = Config['SCRAPING'].getint('num_pages_for_keyword', 1)
    method = Config['SCRAPING'].get('scrape_method', 'http')

    if Config['GLOBAL'].getboolean('shell', False):
        namespace = {}
        session_cls = get_session(scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        raise_or_log(
            'No keywords to scrape for. Please provide either a keyword file (Option: --keyword-file) or specify a '
            'keyword with --keyword.')
        # Just print the help.
        get_command_line(True)
        return

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [keyword, ] if keyword else keywords
    scrape_jobs = {}

    if kwfile:
        if not os.path.exists(kwfile):
            raise_or_log('The keyword file {} does not exist.'.format(kwfile),
                         exception_obj=InvalidConfigurationException)
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    # strip the '.py' extension to get the module name
                    # (str.rstrip('.py') would also strip trailing 'p'/'y' characters)
                    modname = os.path.splitext(os.path.split(kwfile)[-1])[0]
                    scrape_jobs = getattr(__import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                keywords = set([re.sub(' +', ' ', re.sub('[^\x00-\x7F]+', ' ', line.lower())).strip()
                                for line in open(kwfile, 'r').read().split('\n') if line.strip()])

    # Arrange the keyword set to minimize the number of AdWords queries for traffic numbers.
    # (an isolated sketch of this batching follows this function)
    maxKeywordsPerQuery = 800
    keywords_list = list(keywords)
    numberOfQueries = math.ceil(len(keywords_list) / maxKeywordsPerQuery)
    keywords_adwords = [keywords_list[i * maxKeywordsPerQuery: (i + 1) * maxKeywordsPerQuery]
                        for i in range(numberOfQueries)]

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if Config['GLOBAL'].getboolean('clean_cache_files', False):
        clean_cachefiles()
        return

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise_or_log('No more than 100 results per page are available for searches.',
                     exception_obj=InvalidConfigurationException)

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if Config['SCRAPING'].getboolean('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise InvalidConfigurationException(
            'No proxies available and using own IP is prohibited by configuration. Turning down.')

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise_or_log('Invalid search type! Select one of {}'.format(repr(valid_search_types)),
                     exception_obj=InvalidConfigurationException)

    if Config['GLOBAL'].getboolean('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper would have been run without the --simulate flag, it would have:')
        logger.info('Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_pages_for_keyword')))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([proxy.host + ':' + proxy.port for proxy in proxies if proxy])))
        logger.info('By using {} mode with {} worker instances'.format(
            Config['SCRAPING'].get('scrape_method'),
            Config['SCRAPING'].getint('num_workers')))
        return

    # get a scoped sqlalchemy session
    session_cls = get_session(scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # Ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape if the keyword file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and Config['GLOBAL'].getboolean('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=os.path.abspath(kwfile),
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines)
        )

    # First of all, let's see how many requests remain to issue after searching the cache.
    if Config['GLOBAL'].getboolean('do_caching'):
        scrape_jobs = parse_all_cached_files(scrape_jobs, session, scraper_search)

    if scrape_jobs:
        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        out('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'.format(
            num_keywords=len(list(scrape_jobs)),
            num_proxies=len(proxies),
            num_threads=num_search_engines
        ), lvl=1)

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):
            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:
                for worker in range(num_workers):
                    num_worker += 1
                    proxy_to_use = proxies[worker % len(proxies)]
                    workers.put(
                        ScrapeWorkerFactory(
                            mode=method,
                            proxy=proxy_to_use,
                            search_engine=search_engine,
                            session=session,
                            db_lock=db_lock,
                            cache_lock=cache_lock,
                            scraper_search=scraper_search,
                            captcha_lock=captcha_lock,
                            progress_queue=q,
                            browser_num=num_worker
                        )
                    )

            # Rotate through the worker queue until a suitable worker accepts each job.
            for job in scrape_jobs:
                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []
            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()
            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(scrape_jobs, session=session, scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise InvalidConfigurationException('No such scrape_method {}'.format(Config['SCRAPING'].get('scrape_method')))

        # Once keywords have been scraped, query the AdWords API for traffic numbers
        keywords_traffic = {}
        for keyword_set in keywords_adwords:
            if not keywords_traffic:
                keywords_traffic = get_traffic(keyword_set).copy()
            else:
                keywords_traffic.update(get_traffic(keyword_set))
        set_values_from_adwords(session, keywords_traffic)
        if progress_thread:
            progress_thread.adwords_done = True

        if method in ('selenium', 'http'):
            # progress_thread can be None
            try:
                progress_thread.join()
            except AttributeError:
                pass

    # in the end, close the json file.
    from GoogleScraper.output_converter import outfile, output_format
    if output_format == 'json':
        outfile.end()

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return session

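
# Illustrative sketch, not part of the original module: the AdWords batching used
# inside the version of main() above, isolated so the arithmetic is easy to verify.
# Keywords are split into batches of at most 800 per traffic query;
# chunk_adwords_keywords is a hypothetical name.
import math

def chunk_adwords_keywords(keywords, max_keywords_per_query=800):
    keywords = list(keywords)
    number_of_queries = math.ceil(len(keywords) / max_keywords_per_query)
    return [keywords[i * max_keywords_per_query:(i + 1) * max_keywords_per_query]
            for i in range(number_of_queries)]

# For example, 1700 keywords yield three batches of 800, 800 and 100 keywords.
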
def main(return_results=True, force_reload=True, proxies=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically. Will return all scraped results.
    """
    # A mutable default argument would be shared between calls; use None instead.
    if proxies is None:
        proxies = []

    global Config
    Config = get_config(True, force_reload)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException(
            'You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    if not proxies:
        # look for proxies in a mysql database or a proxy file if not given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()

    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper would have been run without the --simulate flag, it would have')
        logger.info('Scraped for {} keywords (before caching), with {} results a page, in total {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_of_pages')))
        logger.info('Used {} distinct proxies in total, with the following ip addresses: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('By using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining) // max_sel_browsers, fillvalue=None)
        else:
            # that's a little special there :)
            kwgroups = [[kw, ] for kw in remaining]

        # Distribute the proxies evenly over the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.")

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(
                SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()
        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw,
                           num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1),
                           scrapemethod='http')
                if r:
                    cursor.execute(
                        'INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                        (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result.keys():
                                for title, snippet, url, pos in result[result_set]:
                                    cursor.execute(
                                        'INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(), url.netloc, pos, serp_id))
                results.append(r)
        cursor.close()
        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0))
        return conn

    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')

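
# Illustrative sketch, not part of the original module: reading results back from
# the sqlite connection returned by the version of main() above. The table and
# column names come from the INSERT statements in the http branch;
# print_all_links is a hypothetical helper name.
def print_all_links():
    conn = main(return_results=True)
    if conn is None:
        return
    for rank, title, url in conn.execute(
            'SELECT rank, title, url FROM link ORDER BY serp_id, rank'):
        print(rank, title, url)
    conn.close()
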
def main(return_results=True):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it shouldn't
    be the main() function's job to check the validity of the provided configuration.

    Args:
        return_results: When GoogleScrape is used from within another program, don't print results to stdout,
            store them in a database instead.

    Returns:
        A database connection to the results when return_results is True
    """
    parse_cmd_args()

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        return

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException(
            'You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite3 database to store the results
    conn = maybe_create_db()

    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper would have been run without the --simulate flag, it would have')
        logger.info('Scraped for {} keywords (before caching), with {} results a page, in total {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_pages_for_keyword')))
        logger.info('Used {} distinct proxies in total, with the following proxies: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('By using scrapemethod: {}'.format(mode))
        return

    # First of all, let's see how many keywords remain to scrape after parsing the cache
    if Config['GLOBAL'].getboolean('do_caching'):
        remaining = parse_all_cached_files(keywords, conn, url=Config['SELENIUM'].get('sel_scraper_base_url'))
    else:
        remaining = keywords

    kwgroups = assign_keywords_to_scrapers(remaining)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', 'http') == 'sel':
        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        # Distribute the proxies evenly over the keywords to search for
        # (an isolated sketch of this distribution follows this function)
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.")

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScrape(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        threads = []
        for group in kwgroups:
            threads.append(HttpScrape(keywords=group))
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

    elif Config['SCRAPING'].get('scrapemethod') == 'http_async':
        pass

    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')

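
# Illustrative sketch, not part of the original module: the proxy distribution used
# in the selenium branch above. With math.ceil(len(kwgroups) / len(proxies)) keyword
# groups assigned per proxy, consecutive groups share one proxy and the last proxy
# may serve fewer groups; assign_proxies is a hypothetical name.
import math

def assign_proxies(kwgroups, proxies):
    chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
    return [(group, proxies[i // chunks_per_proxy]) for i, group in enumerate(kwgroups)]

# For example, five keyword groups over two proxies give the first proxy three
# groups and the second proxy two.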