def save_artist_image(aname, size):
    """Find and save an image for the artist with name `aname`."""
    # todo: save both images with one call
    config = serpscrap.Config()
    config.set('search_engines', ['googleimg'])
    config.set('pages_per_keyword', 1)
    config.set('screenshot', False)
    config.set('search_type', 'image')
    config.set('sleeping_max', 10)
    config.set('image_type', 'any')  # required -- has no effect
    config.set('image_size', 'l')    # required -- has no effect
    keywords = [aname]
    scrap = serpscrap.SerpScrap()
    scrap.init(keywords=keywords, config=config.get())
    results = scrap.run()
    # limit to the first NUM_RES results
    if len(results) > NUM_RES:
        results = results[:NUM_RES]
    for result in results:
        url = urllib.parse.unquote(result['serp_url'])
        try:
            content = requests.get(url).content
            saved = save(content, size)
            if saved is not None:
                return saved
        except Exception:
            pass
    return None
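The `save(content, size)` helper called above is not part of this snippet. A minimal sketch of what it might do, assuming Pillow is available and that images are written under a hypothetical IMG_DIR:

# Hypothetical sketch of the `save` helper (not from the original code):
# decode the downloaded bytes, resize to `size`, write a JPEG, return its path.
import io
import os
from PIL import Image

IMG_DIR = '/tmp/artist_images'  # assumption -- not defined in the snippet

def save(content, size):
    try:
        img = Image.open(io.BytesIO(content)).convert('RGB')
    except OSError:
        return None  # the downloaded content was not a decodable image
    img.thumbnail((size, size))  # resize in place, preserving aspect ratio
    os.makedirs(IMG_DIR, exist_ok=True)
    path = os.path.join(IMG_DIR, 'artist_{}.jpg'.format(size))
    img.save(path, 'JPEG')
    return path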
def get_related(config, keywords, related):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    scrap.run()
    results = scrap.get_related()
    for keyword in results:
        if keyword['keyword'] not in related:
            related.append(keyword['keyword'])
    return related
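A possible call site for get_related, assuming a basic serpscrap.Config like the ones used in the other snippets:

# Assumed usage (not from the original code): collect related-search
# suggestions for a seed keyword, deduplicated into one list.
import serpscrap

config = serpscrap.Config()
config.set('scrape_urls', False)
config.set('num_pages_for_keyword', 1)

related = []
related = get_related(config, ['example keyword'], related)
print(related)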
def google_scraper():
    keywords = ['chat.whatsapp.com/',
                'chat.whatsapp.com/*',
                'inurl:chat.whatsapp.com/',
                'link:chat.whatsapp.com']
    config = serpscrap.Config()
    config.set('scrape_urls', True)
    config.set('num_pages_for_keyword', 100)  # 100 pages per keyword
    config.set('num_results_per_page', 20)    # 20 results per page
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    scrap.as_csv('raw_rez')
def scrape_google_snippets(self, query='', file='info.txt'):
    # note: the `file` argument is currently unused
    keyword = [query]
    config = serpscrap.Config()
    config.set('scrape_urls', False)
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keyword)
    results = scrap.run()
    return results
def scrap(self):
    config = serpscrap.Config()
    config_new = {
        'cachedir': '/tmp/.serpscrap/',
        'clean_cache_after': 100,
        'database_name': '/tmp/serpscrap',
        'do_caching': True,
        'num_pages_for_keyword': 1,
        'scrape_urls': True,
        'search_engines': ['google'],
        'google_search_url': 'https://www.google.com/?gl=us&hl=en&pws=0&gws_rd=cr',
        'executable_path': '/tools/chromedriver',
    }
    config.apply(config_new)
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=self.keywords)
    # note: storing the results on self.scrap shadows this method on the
    # instance after the first call
    self.scrap = scrap.run()
def get_scrapes(keyword):
    query = clean_bag_of_words_stop_words(keyword)
    query = ' '.join([item for sublist in query for item in sublist])
    keywords = query
    config = serpscrap.Config()
    config.set('scrape_urls', False)
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    results = scrap.run()

    def strip(obj):
        return obj if obj is not None else ' '

    return ' '.join([strip(result['serp_snippet']) + strip(result['serp_title'])
                     for result in results])
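The clean_bag_of_words_stop_words helper referenced above is external to this snippet; judging from how its result is flattened, it returns a list of token lists. A rough, hypothetical sketch under that assumption:

# Hypothetical sketch (not the original implementation): split into sentences,
# tokenize, lowercase, and drop a small stop-word set, returning token lists.
import re

STOP_WORDS = {'a', 'an', 'and', 'the', 'of', 'to', 'in', 'is', 'it'}  # assumption

def clean_bag_of_words_stop_words(text):
    sentences = re.split(r'[.!?]+', text)
    cleaned = []
    for sentence in sentences:
        tokens = [tok.lower() for tok in re.findall(r"[A-Za-z']+", sentence)]
        tokens = [tok for tok in tokens if tok not in STOP_WORDS]
        if tokens:
            cleaned.append(tokens)
    return cleaned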
def get_google_links_snippets(query):
    """Retrieve the top 10 results (for which snippets could be retrieved),
    along with their URLs and snippets."""
    config = serpscrap.Config()
    config.set('scrape_urls', False)
    config.set('do_caching', False)
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=[query])
    results = scrap.scrap_serps()
    i = 0
    urls = []
    google_snippets = []
    for result in results:
        if result['serp_snippet'] and i < 10:
            urls.append(result['serp_url'])
            google_snippets.append(
                re.sub(r'[^\x00-\x7F]+', ' ', result['serp_snippet']).replace('\n', ''))
            i += 1
    return urls, google_snippets
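A possible way to call the function above, assuming serpscrap and the function itself are importable:

# Assumed usage: fetch up to 10 URL/snippet pairs for a single query.
urls, snippets = get_google_links_snippets('example query')
for url, snippet in zip(urls, snippets):
    print(url)
    print(snippet)
    print()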
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pprint
import serpscrap

keywords = ['example']
config = serpscrap.Config()
config.set('scrape_urls', False)

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
results = scrap.run()

for result in results:
    pprint.pprint(result)
    print()
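Judging from the other snippets in this collection, each result dict exposes fields such as serp_title, serp_url, and serp_snippet; a small self-contained variant that reads only those fields (assuming these keys, which may vary by serpscrap version):

# Assumed field names; .get() avoids KeyError if a field is absent.
import serpscrap

config = serpscrap.Config()
config.set('scrape_urls', False)

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=['example'])
for result in scrap.run():
    print(result.get('serp_title'), '->', result.get('serp_url'))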
def scrape(config, keywords):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    return scrap.run()
def scrape_to_csv(config, keywords):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    return scrap.as_csv('/tmp/planet-earth')
def scrape_to_csv(config, keywords):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    return scrap.as_csv('/tmp/cryptocurrency')
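A possible driver for the two helpers above, assuming a minimal config like the other snippets use; the path given to as_csv is the output location serpscrap writes the CSV under:

# Assumed driver (not from the original snippets).
import serpscrap

config = serpscrap.Config()
config.set('scrape_urls', False)
config.set('num_pages_for_keyword', 1)

results = scrape(config, ['planet earth'])   # in-memory result dicts
scrape_to_csv(config, ['planet earth'])      # CSV written under /tmp/planet-earth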
def main(args):
    """main driver"""
    test = False
    dbname = './tmp/{}_{}_{}_{}'.format(
        NOW, args.comparison, args.num_locations, args.query_source)
    if args.query_source == 'trends':
        keyword_objs = from_trends_top_query_by_category()
    elif args.query_source == 'csv':
        keyword_objs = from_csv()
    elif args.query_source == 'test':
        test = True
        keyword_objs = [{
            'keyword': 'pizza',
            'category': args.query_source,
        }, {
            'keyword': 'coffee',
            'category': args.query_source,
        }, {
            'keyword': 'trump',
            'category': args.query_source,
        }, {
            'keyword': 'football',
            'category': args.query_source,
        }]
    elif args.query_source in ['all', 'all6', 'extra']:
        keyword_objs = []
        if args.query_source in ['all', 'all6']:
            for query_source in ['procon_popular', 'trending', ]:
                keywords = CURATED[query_source]
                keyword_objs += [
                    {
                        'keyword': keyword,
                        'category': query_source,
                    } for keyword in keywords
                ]
            keyword_objs += CURATED['popular']
        if args.query_source in ['all6', 'extra']:
            for query_source in ['top_insurance', 'top_loans', 'med_sample_first_20', ]:
                keywords = CURATED[query_source]
                keyword_objs += [
                    {
                        'keyword': keyword,
                        'category': query_source,
                    } for keyword in keywords
                ]
    elif args.query_source == 'expanded':
        keyword_objs = []
        keywords = CURATED['procon_a_to_z']
        keyword_objs += [
            {
                'keyword': keyword,
                'category': args.query_source,
            } for keyword in keywords
        ]
        keyword_objs += from_trends_top_query_by_category(15)
    else:
        keywords = CURATED[args.query_source]
        keyword_objs = [
            {
                'keyword': keyword,
                'category': args.query_source,
            } for keyword in keywords
        ]
    print(keyword_objs)

    config = serpscrap.Config()
    config.set('do_caching', False)
    if VERSION == 'chrome':
        config.set('sel_browser', 'chrome')
        config.set('executable_path', CHROME_PATH)
        config.set('chromedriver_log', CHROMEDRIVER_LOG)
    else:
        config.set('executable_path', PHANT_PATH)
    # config.set('use_own_ip', False)
    # config.set('proxy_file', 'proxy.txt')
    config.set('num_pages_for_keyword', 1)
    config.set('num_results_per_page', 30)  # overshoots actual number of results per page
    config.set('screenshot', False)
    # config.set('mobile_emulation', True)
    print(dbname)
    config.set('database_name', dbname)
    config.set('save_html', True)
    config.set('use_control', False)

    location_df = load_locations()
    locations = []
    if args.comparison == 'test':
        locations.append({
            'engine': 'google',
            'latitude': 34.063,
            'longitude': -118.44,
            'urban_rural_code': 1,
            'median_income': 0,
            'percent_dem': 0,
            'population_estimate': 0,
            'name': 'almaden',
        })
    else:
        if args.comparison == 'urban-rural':
            subsets = [
                location_df[location_df[URBAN_RURAL_COL] == 1],
                location_df[location_df[URBAN_RURAL_COL] == 6],
            ]
        elif args.comparison == 'income' or args.comparison == 'voting':
            if args.comparison == 'income':
                sort_col = MEDIAN_INCOME_COL
            else:
                sort_col = VOTING_COL
            print('Going to sort by {}'.format(sort_col))
            location_df = location_df.sort_values(by=[sort_col])
            print(location_df)
            lower_set = location_df.head(args.num_locations)
            upper_set = location_df.tail(args.num_locations)
            subsets = [lower_set, upper_set]
        else:
            subsets = [location_df]
        for subset in subsets:
            if args.comparison == 'population_weighted':
                sample = subset.sample(
                    n=args.num_locations, weights=subset.POP_ESTIMATE_2016)
            else:
                sample = subset.sample(n=args.num_locations)
            for _, row in sample.iterrows():
                locations.append({
                    'engine': 'google',
                    'latitude': row.INTPTLAT,
                    'longitude': row.INTPTLONG,
                    'urban_rural_code': row[URBAN_RURAL_COL],
                    'median_income': row[MEDIAN_INCOME_COL],
                    'percent_dem': row[VOTING_COL],
                    'population_estimate': row[POPULATION_COL],
                    'name': row.NAME,
                })
    pprint(locations)
    config.set('search_instances', locations)

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keyword_objs)

    a, b = len(keyword_objs), len(locations)
    estimated_time = round(a * b / 60, 2)
    if not test:
        yag = yagmail.SMTP('*****@*****.**', os.environ['MAILBOT_PASSWORD'])
        start_contents = """
        About to run! In total, {} keywords will be searched across {} locations.
        At a rate of ~1 SERP/min, this will take approximately {} hours.
        Keep in mind that going over 28 hours may result in a longer term IP ban.
        Arguments are {}.
        """.format(a, b, estimated_time, args)
        yag.send('*****@*****.**', 'Scrape starting', start_contents)
    try:
        scrap.run()
    except ValueError as err:
        new_dbname = 'take2' + dbname
        err_contents = [
            'Error: {}. Going to wait one hour and try again! Results will be in {}'.format(
                err, new_dbname)]
        if not test:
            yag = yagmail.SMTP('*****@*****.**', os.environ['MAILBOT_PASSWORD'])
            yag.send('*****@*****.**', 'Scrape starting', err_contents)
        time.sleep(3600)
        config.set('database_name', new_dbname)
        scrap2 = serpscrap.SerpScrap()
        scrap2.init(config=config.get(), keywords=keyword_objs)
        scrap2.run()
    if not test:
        end_contents = ['you-geo-see main.py finished running! Arguments were: {}'.format(args)]
        yag = yagmail.SMTP('*****@*****.**', os.environ['MAILBOT_PASSWORD'])
        yag.send('*****@*****.**', 'Scrape success', end_contents)