def test_scrape_selenium_mode(sel_browser, num_words=15, num_pages=2): """Run some none proxied normal selenium mode tests""" some_words = random_words(num_words, range(6, 17)) GoogleScraper.Config['SCRAPING'].update({ 'keywords': '\n'.join(some_words[0:num_words]), 'scrapemethod': 'sel', 'num_of_pages': str(num_pages) }) GoogleScraper.Config['GLOBAL'].update({ 'db': '{}_test.db'.format(sel_browser), 'do_caching': 'False' }) GoogleScraper.Config['SELENIUM'].update({'sel_browser': sel_browser}) GoogleScraper.run() con = sqlite3.connect('{}_test.db'.format(sel_browser)) con.row_factory = sqlite3.Row # check that we got a reasonable amount of urls cnt = con.execute( 'select count(*) as cnt from serp_page').fetchone()['cnt'] assert int(cnt) >= ( num_pages * num_words ), 'Scraped {} keywords, with {} pages each, got only {}'.format( num_words, num_pages, cnt) # lets see if the links are really links for result in con.execute('select url, domain from link').fetchall(): url, domain = result assert GoogleScraper.Google_SERP_Parser._REGEX_VALID_URL.match(url)
def test_scrape_selenium_mode(sel_browser, num_words=15, num_pages=2): """Run some none proxied normal selenium mode tests""" some_words = random_words(num_words, range(6, 17)) GoogleScraper.Config['SCRAPING'].update( { 'keywords': '\n'.join(some_words[0:num_words]), 'scrapemethod': 'sel', 'num_of_pages': str(num_pages) }) GoogleScraper.Config['GLOBAL'].update( { 'db': '{}_test.db'.format(sel_browser), 'do_caching': 'False' }) GoogleScraper.Config['SELENIUM'].update( { 'sel_browser': sel_browser } ) GoogleScraper.run() con = sqlite3.connect('{}_test.db'.format(sel_browser)) con.row_factory = sqlite3.Row # check that we got a reasonable amount of urls cnt = con.execute('select count(*) as cnt from serp_page').fetchone()['cnt'] assert int(cnt) >= (num_pages * num_words), 'Scraped {} keywords, with {} pages each, got only {}'.format( num_words, num_pages, cnt ) # lets see if the links are really links for result in con.execute('select url, domain from link').fetchall(): url, domain = result assert GoogleScraper.Google_SERP_Parser._REGEX_VALID_URL.match(url)
def test_config(self):
    GoogleScraper.parse_config(self.cmdargs)
    cfg = GoogleScraper.Config
    self.assertEqual(cfg['SELENIUM'].getint('num_browser_instances'), 10,
                     'num_browser_instances should be 10')
    self.assertFalse(cfg['GLOBAL'].getboolean('do_caching'),
                     'do_caching should be false')
    self.assertEqual(cfg['GLOBAL'].getint('clean_cache_after'), 14,
                     'clean_cache_after is expected to be 14')
def GetTextGoogle(self):
    data = []
    sub = []
    gsearch = self.search + ' reddit'
    try:
        results = GoogleScraper.scrape_google(gsearch, self.degree, "en")
        for result in results:
            data.append(result)
    except Exception as e:
        print(e)
    finally:
        time.sleep(10)

    for d in data:
        if urllib.parse.urlsplit(d).netloc == 'www.reddit.com':
            sub.append(reddit.submission(d))

    for subs in sub:
        self.Text['body'].append(subs.selftext)
        for top_c in subs.comments:
            if isinstance(top_c, MoreComments):
                continue
            self.Text['t_comments'].append(top_c.body)
            for sub_c in top_c:
                self.Text['s_comments'].append(sub_c.body)

    return self.Text
def main(subject='', results_filter='site', n=5):
    """
    Scrape google search up to the nth page and save the results
    to a MongoDB collection.

    :param n:
    """
    if not subject:
        subject = "+".join(random.sample(SUBJECTS, 5))
    q = "{}+RSS+site:br".format(subject)
    lang = ''
    if results_filter == 'lang':
        q = "{}+RSS".format(subject)
        lang = 'lang_pt'
    print("searching for {}.".format(subject))
    for o in range(0, n*10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o, language=lang)
        for url in urls:
            # You can access all parts of the search results like that:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path   => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query  => Query component
            # print(unquote(url.geturl()))
            try:
                # Strip the trailing "&sa=U&ei=..." googlebot cruft from the url.
                U = unquote(url.geturl()).split("&")[0]
                URLS.insert({'url': U, 'tags': [subject], 'fetched_on': datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)
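# Hedged usage sketch, not from the original script: one illustrative call to the
# main() helper above. The subject and page count are assumptions; it relies on the
# same module-level globals the original does (SUBJECTS, URLS as a MongoDB collection,
# GoogleScraper, unquote, datetime, time).
if __name__ == '__main__':
    main(subject='python', results_filter='lang', n=2)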
def scrape(keywords, search_engine: str, num_pages: int) -> object:
    scrape_results = []
    titles = []
    links = []
    snippets = []

    config = {
        'use_own_ip': True,
        'keywords': keywords,
        'search_engines': [search_engine],
        'num_pages_for_keyword': num_pages,
        'scrape_method': 'http',
        'sel_browser': 'chrome',
        'do_caching': False,
        'num_workers': 1
    }

    search = GoogleScraper.scrape_with_config(config)

    for serp in search.serps:
        print(serp)
        for link in serp.links:
            titles.append(link.title)
            links.append(link.link)
            snippets.append(str(link.snippet))
            # print(link.title + '\n')
            # print(link.link + '\n')
            # print(link.snippet + '\n')

    scrape_results.append(titles)
    scrape_results.append(links)
    scrape_results.append(snippets)

    return scrape_results
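# Minimal usage sketch, not part of the original snippet: drives the scrape() helper
# defined above. The query, search engine and page count are illustrative assumptions;
# GoogleScraper must be installed for scrape_with_config() to run.
if __name__ == '__main__':
    titles, links, snippets = scrape('open source web scraping', 'google', 1)
    for title, link in zip(titles, links):
        print('{} -> {}'.format(title, link))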
def run(self):
    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keywords': '\n'.join(self.searches.values()),
            'num_of_pages': "%s" % self.num_pages,
            'scrapemethod': self.scrapemethod
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
            'manual_captcha_solving': 'True',
            # 'sleeping_ranges': '5; 1, 2',  # more aggressive than the defaults
        },
        'GLOBAL': {
            'do_caching': 'True',
            # 'do_caching': 'False',
            # 'cachedir': 'dc
            'db': "results_{asctime}.db",
            # 'debug': 'WARNING',
            'debug': 'ERROR',
        },
        'GOOGLE_SEARCH_PARAMS': {
            'start': "0",
            'num': "30",
        }
    }
    if self.proxyfile:
        print("Using proxies from %s" % self.proxyfile)
        config['GLOBAL']['proxy_file'] = self.proxyfile

    # GoogleScraper.config.update_config(config)
    # hack: the GoogleScraper 'db' config path is broken the second time around
    db = GoogleScraper.scrape_with_config(config, usrcb_result=self.cb_results)
    urls = db.execute('SELECT * FROM link').fetchall()
    db.close()
    self.urls.extend(urls)
    return urls
def main(subject="", results_filter="site", n=5): """ Scrape google search up to the nth page and save the results to a MongoDB collection. :param n: """ if not subject: subject = "+".join(random.sample(SUBJECTS, 5)) q = "{}+RSS+site:br".format(subject) lang = "" if results_filter == "lang": q = "{}+RSS".format(subject) lang = "lang_pt" print "searching for {}.".format(subject) for o in range(0, n * 10, n): urls = GoogleScraper.scrape(q, number_pages=n, offset=o, language=lang) for url in urls: # You can access all parts of the search results like that # url.scheme => URL scheme specifier (Ex: 'http') # url.netloc => Network location part (Ex: 'www.python.org') # url.path => URL scheme specifier (Ex: ''help/Python.html'') # url.params => Parameters for last path element # url.query => Query component # print url # print(unquote(url.geturl())) try: U = unquote(url.geturl()).split("&")[0] # sa=U&ei=")[0] # Remove googlebot crap URLS.insert({"url": U, "tags": [subject], "fetched_on": datetime.datetime.now()}) except DuplicateKeyError: pass time.sleep(1)
def get_results(query):
    result = list()
    results = GoogleScraper.scrape(query, num_results_per_page=100, num_pages=10,
                                   offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # Reassemble the parts of the url and strip HTML entities
                # from the title and the snippet.
                result.append([
                    urllib.parse.unquote(link_url.geturl()).replace('&amp;', '&'),
                    link_title.replace('&amp;', '&').replace('&lt;', ' ').replace('&gt;', ' '),
                    link_snippet.replace('&amp;', '&').replace('&lt;', ' ').replace('&gt;', ' ')
                ])
            except:
                pass
    return result
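# Illustrative call (assumed query, not from the original source) to the get_results()
# helper above; each entry it returns is a [url, title, snippet] list.
for url, title, snippet in get_results('python sqlite tutorial'):
    print(url)
    print(title)
    print()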
def googleSearch(query):
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass
def main(subject='', n=5):
    """
    Scrape google search up to the nth page and save the results
    to a MongoDB collection.

    :param n:
    """
    q = "{}+RSS+site:br".format(subject)
    for o in range(0, n*10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o)
        for url in urls:
            # You can access all parts of the search results like that:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path   => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query  => Query component
            # print(unquote(url.geturl()))
            try:
                # Strip the trailing "&sa=U&ei=..." googlebot cruft from the url.
                U = unquote(url.geturl()).split("&")[0]
                URLS.insert({'url': U, 'tags': [subject], 'fetched_on': datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)
def get_proxies(n=5):
    """Read some notoriously known sites and extract some public proxies.

    Scrapes
        - http://www.samair.ru/proxy/

    The quality of these proxies probably isn't worth mentioning, but it's nice
    to test the lack of quality and the behaviour of GoogleScraper.
    """
    r = requests.get('http://www.samair.ru/proxy/')
    # Try to parse the HTML result using lxml.
    try:
        doc = UnicodeDammit(r.text, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
        dom = lxml.html.document_fromstring(r.text, parser=parser)
        dom.resolve_base_href()
    except Exception as e:
        print('Some error occurred while lxml tried to parse: {}'.format(e))

    table = dom.xpath('//table[@id=\'proxylist\']')[0]
    for row in table.findall('tr'):
        print(row.xpath('//td[1]')[0].text_content())

    return GoogleScraper.Proxy()
import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google found with our keyword (as shown on the first page)?
    print(results[0]['num_results_for_kw'])
if __name__ == '__main__':
    # See the config.cfg file for possible values.
    GoogleScraper.Config.update({
        'SCRAPING': {
            'use_own_ip': 'False',
            'keyword': 'HelloWorld'
        },
        'SELENIUM': {
            'sel_browser': 'chrome',  # change to 'phantomjs' if you want
            'manual_captcha_solving': 'True'
        }
    })

    # sample proxy
    proxy = GoogleScraper.Proxy(proto='socks5', host='localhost', port=9050, username='', password='')

    try:
        results = GoogleScraper.scrape('Best SEO tool', scrapemethod='sel')  # , proxy=proxy)
        for page in results:
            for link_title, link_snippet, link_url, *rest in page['results']:
                # You can access all parts of the search results like that:
                # link_url.scheme => URL scheme specifier (Ex: 'http')
                # link_url.netloc => Network location part (Ex: 'www.python.org')
                # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
                # link_url.params => Parameters for last path element
                # link_url.query  => Query component
                try:
                    # This reassembles the parts of the url to the whole thing.
                    print(urllib.parse.unquote(link_url.geturl()))
                except:
                    pass
    except GoogleScraper.GoogleSearchError as e:
        # Completes the truncated outer try block; GoogleSearchError is the
        # library's search exception.
        print(e)
#!/usr/bin/python

import GoogleScraper
import urllib.parse

GoogleScraper.setup_logger()

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO Tool', num_results_per_page=50, num_pages=3,
                                   offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url, *rest in page['results']:
            # link_url.scheme
            # link_url.netloc
            # link_url.path
            # link_url.params
            # link_url.query
            try:
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google reported for the keyword (as shown on the first page)?
    print(results[0]['num_results_per_kw'])
    if size_google is None:
        size_google = 'l'

    query_tag = inputs["tag"]
    google_state = "yes" if inputs["google"] is None else inputs["google"]
    insta_state = "yes" if inputs["insta"] is None else inputs["insta"]
    accepted_states = ['yes', 'no']

    # The scrapers are launched if their states are set to 'yes' or not specified (None).
    if (google_state.lower() in accepted_states) and (insta_state.lower() in accepted_states):
        if google_state.lower() == 'yes':
            print('\n----------------------- Load Google images ------------------------\n')
            google_scraper = GoogleScraper.GoogleScraper(query=query_tag, size=size_google)
            google_scraper.get_urls()
            google_scraper.parse_urls()
            google_scraper.load_images()
        else:
            print("\nGoogleScraper has been ignored.\n")

        if insta_state.lower() == 'yes':
            print('\n------------------------ Load Insta images ------------------------\n')
            insta_scraper = InstaScraper.InstaScraper(tag=query_tag)
            insta_scraper.get_urls()
            insta_scraper.load_images()
        else:
            print("\nInstaScraper has been ignored.\n")
    else:
        # The original message is cut off here; completed from the accepted states above.
        print("The scraping states aren't correct. " +
              "Accepted values are 'yes' and 'no'.")
# urls = GoogleScraper.scrape(title, results_per_page=100, number_pages=1)

con = lite.connect('name.db')
with con:
    cur = con.cursor()
    c = 0
    # TODO: select only titles which were not already downloaded
    cur.execute("SELECT id, title FROM articles;")
    articles = cur.fetchall()
    for article_id, title in articles:
        print("article_id=" + str(article_id) + " title=" + title)
        urls = GoogleScraper.scrape(title, results_per_page=100, number_pages=1)
        data = []
        for url in urls:
            # We use INSERT OR REPLACE to be sure an id will be returned.
            cur.execute("INSERT OR REPLACE INTO urls (url) VALUES(?)", [url])
            url_id = cur.lastrowid
            data.append([article_id, url_id])
        cur.executemany("INSERT OR IGNORE INTO article_url (article_id, url_id) VALUES(?,?)", data)
        con.commit()
        c += 1
        print("--downloaded " + str(c) + "/" + str(len(articles)))
        sleep(10)
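# A hypothetical schema for the tables the snippet above works against (the original
# source does not show it): 'urls.url' needs a UNIQUE constraint for INSERT OR REPLACE
# to deduplicate, and 'article_url' needs one for INSERT OR IGNORE to be meaningful.
import sqlite3 as lite

con = lite.connect('name.db')
with con:
    con.executescript("""
        CREATE TABLE IF NOT EXISTS articles (id INTEGER PRIMARY KEY, title TEXT);
        CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url TEXT UNIQUE);
        CREATE TABLE IF NOT EXISTS article_url (
            article_id INTEGER,
            url_id INTEGER,
            UNIQUE (article_id, url_id)
        );
    """)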
import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3,
                                   offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google found with our keyword (as shown on the first page)?
    print(results[0]['num_results_for_kw'])
import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('HOly shit', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search results like that:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query  => Query component
        try:
            # This reassembles the parts of the url to the whole thing.
            print(urllib.parse.unquote(link_url.geturl()))
        except:
            pass

    # How many urls did we get?
    print(len(results['results']))

    # How many hits has google found with our keyword?
    print(results['num_results_for_kw'])
import GoogleScraper as gs
import CombineSheets as cs
import pandas as pd

# IMPORT ALL FILES INTO THIS ONE SO THE COMPUTER CAN READ THE FILES QUICKLY.
# WHERE ARE ALL THE KEYWORDS STORED: ENTER THE SHEET NAME.
# THE KEYWORDS MUST BE IN THE FIRST COLUMN WITH A HEADER.
# !!!!!!!!!!! GOOGLE CAN REVOKE YOUR INTERNET SEARCH IF RUNNING THIS TOO OFTEN !!!!!!!!!
df = pd.read_excel('fitness-keywords-wordstream.xlsx', sheet_name='sheet1')
# df = pd.read_excel('EXCEL_WORKBOOK_NAME.xlsx', sheet_name='SHEET_NAME')

# THIS LOOP WILL RUN A2 IN THE EXCEL FILE, THEN THE NEXT ROW, AND SO ON UNTIL INDEX 'I' IS NULL.
for i in df.index:
    # df['Keyword'][i] IS THE DATAFRAME AND COLUMN NAME.
    # THE COLUMN NAME MUST MATCH EXACTLY.
    # SINGLE QUOTES IN THE COLUMN HEADER NAME ONLY.
    gs.GoogleSearch(df['Keyword'][i])

# MERGES ALL SHEETS FROM GoogleScrapingSEOKEYFINDER.xlsx FOUND IN THE ROOT DIRECTORY. THE PATH CAN BE
# CHANGED IN CombineSheets, BUT YOU MUST CHANGE IT IN GoogleScraper ALSO. !!!!!!!!!!!!! IMPORTANT !!!!!!!!!
cs.MergeAllDataSheets()

# THE EXCEL BOOK IS CLOSED; YOU SHOULD NOW HAVE AN EXCEL SHEET THAT CONTAINS ALL SHEETS IN ONE SHEET.
# INDEXING MUST BE CHANGED AND ORGANIZED THE WAY YOU SEE FIT.
# FINDING THE NUMBER OF KEYWORDS THAT MATCH THE DESCRIPTION WOULD BE A GOOD PLACE TO START.
# COMPARE KEYWORD PHRASING TO TITLE, DESC, AND URL SEPARATELY.
# AFTER RUNNING THIS MAIN CLASS, RUN COUNTMASTERKEYWORDSTOEXCEL.PY TO MERGE ALL KEYWORD SHEETS INTO ONE.
# THIS GIVES YOU ONE LARGE DOCUMENT THAT CONTAINS ALL SEARCH RESULTS, IN THIS CASE IN THE EXCEL FILE
# SOFTWAREENGINEERINGMASTERDATABASE.XLSX. IT CAN ALWAYS BE CHANGED.
# RUNNING THIS FILE WILL TAKE A WHILE; PLEASE BE CAREFUL AND PATIENT ABOUT HOW OFTEN YOU USE IT.
# IF YOU GET A RECOVERED OR BROKEN DOCUMENT, JUST ALLOW EXCEL TO REMOVE THE BROKEN DATA; IT IS OKAY!
# AFTER RUNNING THIS DOCUMENT YOU MUST CLEAN THE DATA BEFORE RUNNING THE COUNTMASTERKEYWORDSTOEXCEL.PY DOCUMENT!
import urllib.parse

import GoogleScraper

if __name__ == '__main__':
    results = GoogleScraper.scrape('There are', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search results like that:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query  => Query component
        try:
            # This reassembles the parts of the url to the whole thing.
            print(urllib.parse.unquote(link_url.geturl()))
        except:
            pass

    # How many urls did we get?
    print(len(results['results']))

    # How many hits has google found with our keyword?
    print(results['num_results_for_kw'])
def search_via_googler(self, query_paraments):
    """
    args['config']['last_update']

    Applications: tbm=app
    Blogs: tbm=blg
    Books: tbm=bks
    Discussions: tbm=dsc
    Images: tbm=isch
    News: tbm=nws
    Patents: tbm=pts
    Places: tbm=plcs
    Recipes: tbm=rcp
    Shopping: tbm=shop
    Video: tbm=vid
    """
    def fix_urls(url):
        url = url.replace('/amp/', '') if '/amp/' in url else url
        url = url.replace('/amp.html', '') if '/amp.html' in url else url
        url = urllib.parse.urljoin('http://', url) if 'http://' not in url else url
        return url

    google_search_url = 'https://www.google.com/search?tbs=qdr:%s&'
    dateRestrict = query_paraments.get('dateRestrict', 'd')

    config = {
        'use_own_ip': 'True',
        'keywords': [query_paraments['q']],
        'google_search_url': google_search_url % dateRestrict,
        'num_results_per_page': query_paraments.get('results_per_page', 25),
        'num_pages_for_keyword': query_paraments.get('num_pages', 4),
        'num_workers': 2,
        'search_engines': ['google'],
        'search_type': 'normal',
        'scrape_method': 'http',
        'do_caching': False,
        'print_results': None,
    }

    logger.debug('Making search with Googler lib with configuration')

    try:
        google_search = GoogleScraper.scrape_with_config(config)
        urls_without_fix = []
        urls = []
        for serp in google_search.serps:
            urls_without_fix = [r.link for r in serp.links]
            urls = [fix_urls(r.link) for r in serp.links]

        logger.debug(
            ('Google Search fixed links successfully extracted with '
             'query "{}": {:d} links extracted').format(query_paraments['q'], len(urls)))
        logger.debug(
            ('Google Search links without fix successfully extracted '
             'with query "{}":\n{}').format(query_paraments['q'], urls_without_fix))
        logger.debug(
            ('List of link extracted from Google Search with the '
             'query "{}":\n{}').format(query_paraments['q'], urls))

        return urls
    except GoogleScraper.GoogleSearchError as e:
        logger.error(str(e))