def assign_keywords_to_scrapers(all_keywords):
    """Scrapers are often threads or asynchronous objects.

    Splitting the keywords equally among the workers is crucial for maximal performance.

    Args:
        all_keywords: All keywords to scrape.

    Returns:
        A list of lists. Each inner list should be assigned to an individual scraper.
    """
    mode = Config['SCRAPING'].get('scrapemethod')

    if mode == 'sel':
        num_scrapers = Config['SELENIUM'].getint('num_browser_instances', 1)
    elif mode == 'http':
        num_scrapers = Config['HTTP'].getint('num_threads', 1)
    else:
        # Unknown scrape method: fall back to a single scraper so the
        # integer division below cannot divide by zero.
        num_scrapers = 1

    if len(all_keywords) > num_scrapers:
        kwgroups = grouper(all_keywords, len(all_keywords) // num_scrapers, fillvalue=None)
    else:
        # Fewer keywords than scrapers: give each scraper a single-keyword group.
        kwgroups = [[kw, ] for kw in all_keywords]

    return kwgroups
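# The grouper() helper called above is not defined in this section. Below is a minimal
# sketch of what it presumably looks like, following the standard itertools recipe.
# This is only an assumption that matches the call sites grouper(iterable, n, fillvalue=None);
# the real GoogleScraper implementation may differ.
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks: grouper('ABCDEFG', 3, 'x') -> ABC DEF Gxx."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)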
def main(): """Runs the GoogleScraper application as determined by the various configuration points.""" global Config Config = get_config(True, True) if Config['GLOBAL'].getboolean('view_config'): from GoogleScraper.config import CONFIG_FILE print(open(CONFIG_FILE).read()) sys.exit(0) if Config['GLOBAL'].getboolean('do_caching'): d = Config['GLOBAL'].get('cachedir') if not os.path.exists(d): os.mkdir(d, 0o744) else: maybe_clean_cache() kwfile = Config['SCRAPING'].get('keyword_file') keyword = Config['SCRAPING'].get('keyword') keywords = set(Config['SCRAPING'].get('keywords', '').split('\n')) proxy_file = Config['GLOBAL'].get('proxy_file', '') proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '') if not (keyword or keywords) and not kwfile: raise ValueError('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}~') if Config['GLOBAL'].getboolean('fix_cache_names'): fix_broken_cache_names() sys.exit('renaming done. restart for normal use.') keywords = [keyword,] if keyword else keywords if kwfile: if not os.path.exists(kwfile): raise ValueError('The keyword file {} does not exist.'.format(kwfile)) else: # Clean the keywords of duplicates right in the beginning keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')]) if Config['GLOBAL'].getboolean('check_oto', False): _caching_is_one_to_one(keyword) if Config['SCRAPING'].getint('num_results_per_page') > 100: raise ValueError('Not more that 100 results per page available for Google searches.') if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) else: proxies = [] valid_search_types = ('normal', 'video', 'news', 'image') if Config['SCRAPING'].get('search_type') not in valid_search_types: ValueError('Invalid search type! Select one of {}'.format(repr(valid_search_types))) # Let the games begin if Config['SCRAPING'].get('scrapemethod', '') == 'sel': conn = maybe_create_db() # First of all, lets see how many keywords remain to scrape after parsing the cache if Config['GLOBAL'].getboolean('do_caching'): remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate')) else: remaining = keywords if Config['GLOBAL'].getboolean('simulate'): # TODO: implement simulation raise NotImplementedError('Simulating is not implemented yet!') # Create a lock to sync file access rlock = threading.RLock() # A lock to prevent multiple threads from solving captcha. 
lock = threading.Lock() max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances') if len(remaining) > max_sel_browsers: kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None) else: # thats a little special there :) kwgroups = [[kw, ] for kw in remaining] # Distribute the proxies evenly on the kws to search scrapejobs = [] Q = queue.Queue() proxies.append(None) if Config['SCRAPING'].getboolean('use_own_ip') else None if not proxies: logger.info("No ip's available for scanning.") chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies)) for i, chunk in enumerate(kwgroups): scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy])) for t in scrapejobs: t.start() handler = ResultsHandler(Q, conn) handler.start() for t in scrapejobs: t.join() # All scrape jobs done, signal the db handler to stop Q.put(Config['GLOBAL'].get('all_processed_sig')) handler.join() conn.commit() conn.close() elif Config['SCRAPING'].get('scrapemethod') == 'http': if Config['SCRAPING'].getboolean('deep_scrape', False): # TODO: implement deep scrape raise NotImplementedError('Sorry. Currently deep_scrape is not implemented.') else: results = [] for kw in keywords: r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10), num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http') results.append(r) if Config['GLOBAL'].get('print'): print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0), view=Config['HTTP'].get('view', False)) else: raise ValueError('No such scrapemethod. Use "http" or "sel"')
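# Illustration (not part of GoogleScraper): how the proxy distribution above behaves.
# The names below are made up for the example. With 7 keyword groups and 3 proxies,
# chunks_per_proxy = ceil(7 / 3) = 3, so groups 0-2 use proxies[0], groups 3-5 use
# proxies[1], and group 6 uses proxies[2].
import math

example_kwgroups = [['kw{}'.format(i)] for i in range(7)]   # hypothetical keyword groups
example_proxies = ['proxy_a', 'proxy_b', 'proxy_c']         # hypothetical proxies
chunks_per_proxy = math.ceil(len(example_kwgroups) / len(example_proxies))
for i, group in enumerate(example_kwgroups):
    print(i, group, '->', example_proxies[i // chunks_per_proxy])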
def main(return_results=True, force_reload=False, proxies=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically.
                      Will return all scraped results.
    """
    # Avoid a shared mutable default argument; proxies is appended to below.
    proxies = proxies or []

    parse_cmd_args()

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right at the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    if not proxies:
        # Look for proxies in the MySQL database or a proxy file if none were given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()

    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper had been run without the --simulate flag, it would have')
        logger.info('scraped {} keywords (before caching), with {} results per page and {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_of_pages')))
        logger.info('used {} distinct proxies in total, with the following IP addresses: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()
        # A lock to prevent multiple threads from solving captchas at the same time.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining) // max_sel_browsers, fillvalue=None)
        else:
            # Fewer keywords than browsers: give each browser a single-keyword group.
            kwgroups = [[kw, ] for kw in remaining]

        # Distribute the proxies evenly over the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using your own IP is prohibited by the configuration. Shutting down.")

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()

        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw,
                           num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1),
                           scrapemethod='http')
                if r:
                    cursor.execute(
                        'INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                        (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result:
                                for title, snippet, url, pos in result[result_set]:
                                    cursor.execute(
                                        'INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(), url.netloc, pos, serp_id))
                results.append(r)
        cursor.close()

        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0))

        return conn
    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')
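# Minimal sketch (assumption): InvalidConfigurationException is raised throughout the
# functions above but not defined in this section. It is presumably a plain Exception
# subclass defined elsewhere in GoogleScraper; something along these lines would make
# the code above runnable on its own.
class InvalidConfigurationException(Exception):
    pass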
def main(return_results=True, force_reload=True, proxies=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically.
                      Will return all scraped results.
    """
    # Avoid a shared mutable default argument; proxies is appended to below.
    proxies = proxies or []

    global Config
    Config = get_config(True, force_reload)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right at the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    if not proxies:
        # Look for proxies in the MySQL database or a proxy file if none were given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()

    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper had been run without the --simulate flag, it would have')
        logger.info('scraped {} keywords (before caching), with {} results per page and {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_of_pages')))
        logger.info('used {} distinct proxies in total, with the following IP addresses: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()
        # A lock to prevent multiple threads from solving captchas at the same time.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining) // max_sel_browsers, fillvalue=None)
        else:
            # Fewer keywords than browsers: give each browser a single-keyword group.
            kwgroups = [[kw, ] for kw in remaining]

        # Distribute the proxies evenly over the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using your own IP is prohibited by the configuration. Shutting down.")

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()

        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw,
                           num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1),
                           scrapemethod='http')
                if r:
                    cursor.execute(
                        'INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                        (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result:
                                for title, snippet, url, pos in result[result_set]:
                                    cursor.execute(
                                        'INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(), url.netloc, pos, serp_id))
                results.append(r)
        cursor.close()

        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0))

        return conn
    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')
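# Hypothetical programmatic usage of the main() variant above (not part of GoogleScraper
# itself). It assumes the configuration already points at a keyword file and that the
# 'http' scrapemethod is selected, so results end up in the 'link' table created by
# maybe_create_db(); the column names below match the INSERT statements above.
if __name__ == '__main__':
    conn = main(return_results=True)
    if conn is not None:
        cursor = conn.cursor()
        for title, url in cursor.execute('SELECT title, url FROM link LIMIT 10'):
            print(title, url)
        conn.close()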