def main(subject='', results_filter='site', n=5):
    """
    Scrape google search up to the nth page and save the results
    to a MongoDB collection.

    :param subject: search subject; a random sample of SUBJECTS is used if empty.
    :param results_filter: 'site' restricts results to .br sites, 'lang' to Portuguese pages.
    :param n: number of result pages to scrape per batch.
    """
    if not subject:
        subject = "+".join(random.sample(SUBJECTS, 5))
    q = "{}+RSS+site:br".format(subject)
    lang = ''
    if results_filter == 'lang':
        q = "{}+RSS".format(subject)
        lang = 'lang_pt'
    print "searching for {}.".format(subject)
    for o in range(0, n * 10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o, language=lang)
        for url in urls:
            # You can access all parts of the search results like that:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path   => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query  => Query component
            try:
                # Strip Google's redirect/tracking parameters from the URL.
                U = unquote(url.geturl()).split("&")[0]
                URLS.insert({'url': U,
                             'tags': [subject],
                             'fetched_on': datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)

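# The function above relies on module-level setup that the snippet does not show.
# A minimal sketch of what that setup might look like; the database, collection
# and SUBJECTS values are assumptions for illustration, not taken from the original:
import datetime
import random
import time
from urllib import unquote  # Python 2; on Python 3 use urllib.parse.unquote

import GoogleScraper
import pymongo
from pymongo.errors import DuplicateKeyError

SUBJECTS = ['python', 'django', 'flask', 'pandas', 'scrapy']  # hypothetical example values
URLS = pymongo.MongoClient().scraping.urls  # hypothetical database/collection names
# A unique index on 'url' is what makes the DuplicateKeyError handling above work:
URLS.create_index('url', unique=True)
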
def main(subject="", results_filter="site", n=5): """ Scrape google search up to the nth page and save the results to a MongoDB collection. :param n: """ if not subject: subject = "+".join(random.sample(SUBJECTS, 5)) q = "{}+RSS+site:br".format(subject) lang = "" if results_filter == "lang": q = "{}+RSS".format(subject) lang = "lang_pt" print "searching for {}.".format(subject) for o in range(0, n * 10, n): urls = GoogleScraper.scrape(q, number_pages=n, offset=o, language=lang) for url in urls: # You can access all parts of the search results like that # url.scheme => URL scheme specifier (Ex: 'http') # url.netloc => Network location part (Ex: 'www.python.org') # url.path => URL scheme specifier (Ex: ''help/Python.html'') # url.params => Parameters for last path element # url.query => Query component # print url # print(unquote(url.geturl())) try: U = unquote(url.geturl()).split("&")[0] # sa=U&ei=")[0] # Remove googlebot crap URLS.insert({"url": U, "tags": [subject], "fetched_on": datetime.datetime.now()}) except DuplicateKeyError: pass time.sleep(1)
import urllib.parse

import GoogleScraper

def get_results(query):
    """Return [url, title, snippet] triples for up to 10 pages of results."""
    result = list()
    results = GoogleScraper.scrape(query, num_results_per_page=100,
                                   num_pages=10, offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing and
                # undoes the HTML escaping in the title and snippet.
                result.append([urllib.parse.unquote(link_url.geturl()).replace('&amp;', '&'),
                               link_title.replace('&amp;', '&').replace('&lt;', ' ').replace('&gt;', ' '),
                               link_snippet.replace('&amp;', '&').replace('&lt;', ' ').replace('&gt;', ' ')])
            except Exception:
                pass
    return result

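# The manual .replace() chain above only covers a few HTML entities. A possible
# alternative (a sketch, not part of the original code) is the standard library's
# html.unescape plus a small regex that blanks out angle brackets the same way:
import html
import re

def clean_text(text):
    """Unescape HTML entities and replace angle brackets with spaces."""
    return re.sub(r'[<>]', ' ', html.unescape(text))
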
import urllib.parse

import GoogleScraper

def googleSearch(query):
    # Search for the given query (the original hardcoded 'Best SEO tool' and
    # left the parameter unused).
    results = GoogleScraper.scrape(query, num_results_per_page=50, num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except Exception:
                pass

def main(subject='', n=5):
    """
    Scrape google search up to the nth page and save the results
    to a MongoDB collection.

    :param subject: search subject.
    :param n: number of result pages to scrape per batch.
    """
    q = "{}+RSS+site:br".format(subject)
    for o in range(0, n * 10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o)
        for url in urls:
            # You can access all parts of the search results like that:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path   => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query  => Query component
            try:
                # Strip Google's redirect/tracking parameters from the URL.
                U = unquote(url.geturl()).split("&")[0]
                URLS.insert({'url': U,
                             'tags': [subject],
                             'fetched_on': datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)

import urllib.parse

import GoogleScraper

GoogleScraper.Config.update({
    'SCRAPING': {
        'use_own_ip': 'False',
        'keyword': 'HelloWorld'
    },
    'SELENIUM': {
        'sel_browser': 'chrome',  # change to 'phantomjs' if you prefer
        'manual_captcha_solving': 'True'
    }
})

# sample proxy
proxy = GoogleScraper.Proxy(proto='socks5', host='localhost', port=9050,
                            username='', password='')

try:
    results = GoogleScraper.scrape('Best SEO tool', scrapemethod='sel')  # , proxy=proxy
    for page in results:
        for link_title, link_snippet, link_url, *rest in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except Exception:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))
except Exception as e:
    print(e)

import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50,
                                   num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except Exception:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google found with our keyword (as shown on the first page)?
    print(results[0]['num_results_for_kw'])

#!/usr/bin/python
import GoogleScraper
import urllib.parse

GoogleScraper.setup_logger()

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO Tool', num_results_per_page=50,
                                   num_pages=3, offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url, *rest in page['results']:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                print(urllib.parse.unquote(link_url.geturl()))
            except Exception:
                pass

    # print the total number of returned urls across all pages
    print(sum(len(page['results']) for page in results))

    # print how many hits google reports for the keyword (as shown on the first page)
    print(results[0]['num_results_for_kw'])

import sqlite3 as lite
from time import sleep

import GoogleScraper

con = lite.connect('name.db')
with con:
    cur = con.cursor()
    c = 0
    # TODO select only titles which were not already downloaded
    cur.execute("SELECT id, title FROM articles;")
    articles = cur.fetchall()
    for article_id, title in articles:
        print("article_id=" + str(article_id) + " title=" + title)
        urls = GoogleScraper.scrape(title, results_per_page=100, number_pages=1)
        data = []
        for url in urls:
            # We use INSERT OR REPLACE to be sure an id will be returned
            cur.execute("INSERT OR REPLACE INTO urls (url) VALUES(?)", [url])
            url_id = cur.lastrowid
            data.append([article_id, url_id])
        cur.executemany("INSERT OR IGNORE INTO article_url (article_id, url_id) VALUES(?,?)", data)
        con.commit()
        c += 1
        print("--downloaded " + str(c) + "/" + str(len(articles)))
        sleep(10)

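# The snippet above assumes three tables that it does not define. A possible
# schema, inferred from the INSERT statements rather than taken from the
# original project, could look like this:
import sqlite3 as lite

con = lite.connect('name.db')
with con:
    con.executescript("""
        CREATE TABLE IF NOT EXISTS articles (
            id    INTEGER PRIMARY KEY,
            title TEXT NOT NULL
        );
        CREATE TABLE IF NOT EXISTS urls (
            id  INTEGER PRIMARY KEY,
            url TEXT UNIQUE                       -- UNIQUE is what makes INSERT OR REPLACE meaningful
        );
        CREATE TABLE IF NOT EXISTS article_url (
            article_id INTEGER REFERENCES articles(id),
            url_id     INTEGER REFERENCES urls(id),
            PRIMARY KEY (article_id, url_id)      -- lets INSERT OR IGNORE skip duplicates
        );
    """)
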
import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50,
                                   num_pages=3, offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except Exception:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google found with our keyword (as shown on the first page)?
    print(results[0]['num_results_for_kw'])

import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('HOly shit', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search results like that:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query  => Query component
        try:
            # This reassembles the parts of the url to the whole thing.
            print(urllib.parse.unquote(link_url.geturl()))
        except Exception:
            pass

    # How many urls did we get?
    print(len(results['results']))

    # How many hits has google found with our keyword?
    print(results['num_results_for_kw'])

import urllib.parse

import GoogleScraper

if __name__ == '__main__':
    results = GoogleScraper.scrape('There are', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search results like that:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query  => Query component
        try:
            # This reassembles the parts of the url to the whole thing.
            print(urllib.parse.unquote(link_url.geturl()))
        except Exception:
            pass

    # How many urls did we get?
    print(len(results['results']))

    # How many hits has google found with our keyword?
    print(results['num_results_for_kw'])
