def test_scrape_selenium_mode(sel_browser, num_words=15, num_pages=2): """Run some none proxied normal selenium mode tests""" some_words = random_words(num_words, range(6, 17)) GoogleScraper.Config['SCRAPING'].update({ 'keywords': '\n'.join(some_words[0:num_words]), 'scrapemethod': 'sel', 'num_of_pages': str(num_pages) }) GoogleScraper.Config['GLOBAL'].update({ 'db': '{}_test.db'.format(sel_browser), 'do_caching': 'False' }) GoogleScraper.Config['SELENIUM'].update({'sel_browser': sel_browser}) GoogleScraper.run() con = sqlite3.connect('{}_test.db'.format(sel_browser)) con.row_factory = sqlite3.Row # check that we got a reasonable amount of urls cnt = con.execute( 'select count(*) as cnt from serp_page').fetchone()['cnt'] assert int(cnt) >= ( num_pages * num_words ), 'Scraped {} keywords, with {} pages each, got only {}'.format( num_words, num_pages, cnt) # lets see if the links are really links for result in con.execute('select url, domain from link').fetchall(): url, domain = result assert GoogleScraper.Google_SERP_Parser._REGEX_VALID_URL.match(url)
def test_scrape_selenium_mode(sel_browser, num_words=15, num_pages=2): """Run some none proxied normal selenium mode tests""" some_words = random_words(num_words, range(6, 17)) GoogleScraper.Config['SCRAPING'].update( { 'keywords': '\n'.join(some_words[0:num_words]), 'scrapemethod': 'sel', 'num_of_pages': str(num_pages) }) GoogleScraper.Config['GLOBAL'].update( { 'db': '{}_test.db'.format(sel_browser), 'do_caching': 'False' }) GoogleScraper.Config['SELENIUM'].update( { 'sel_browser': sel_browser } ) GoogleScraper.run() con = sqlite3.connect('{}_test.db'.format(sel_browser)) con.row_factory = sqlite3.Row # check that we got a reasonable amount of urls cnt = con.execute('select count(*) as cnt from serp_page').fetchone()['cnt'] assert int(cnt) >= (num_pages * num_words), 'Scraped {} keywords, with {} pages each, got only {}'.format( num_words, num_pages, cnt ) # lets see if the links are really links for result in con.execute('select url, domain from link').fetchall(): url, domain = result assert GoogleScraper.Google_SERP_Parser._REGEX_VALID_URL.match(url)
def test_config(self):
    GoogleScraper.parse_config(self.cmdargs)
    cfg = GoogleScraper.Config
    self.assertEqual(cfg['SELENIUM'].getint('num_browser_instances'), 10,
                     'num_browser_instances should be 10')
    self.assertFalse(cfg['GLOBAL'].getboolean('do_caching'),
                     'do_caching should be false')
    self.assertEqual(cfg['GLOBAL'].getint('clean_cache_after'), 14,
                     'clean_cache_after is expected to be 14')
def GetTextGoogle(self):
    data = []
    sub = []
    gsearch = self.search + ' reddit'
    try:
        results = GoogleScraper.scrape_google(gsearch, self.degree, "en")
        for result in results:
            data.append(result)
    except Exception as e:
        print(e)
    finally:
        time.sleep(10)

    for d in data:
        if urllib.parse.urlsplit(d).netloc == 'www.reddit.com':
            sub.append(reddit.submission(d))

    for subs in sub:
        self.Text['body'].append(subs.selftext)
        for top_c in subs.comments:
            if isinstance(top_c, MoreComments):
                continue
            self.Text['t_comments'].append(top_c.body)
            for sub_c in top_c:
                self.Text['s_comments'].append(sub_c.body)

    return self.Text
def main(subject='', results_filter='site', n=5):
    """
    Scrape google search up to the nth page and save the results
    to a MongoDB collection.

    :param n:
    """
    if not subject:
        subject = "+".join(random.sample(SUBJECTS, 5))
    q = "{}+RSS+site:br".format(subject)
    lang = ''
    if results_filter == 'lang':
        q = "{}+RSS".format(subject)
        lang = 'lang_pt'
    print("searching for {}.".format(subject))
    for o in range(0, n*10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o, language=lang)
        for url in urls:
            # You can access all parts of the search results like that:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path   => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query  => Query component
            # print(unquote(url.geturl()))
            try:
                # Strip the trailing "&sa=U&ei=..." googlebot cruft from the url.
                U = unquote(url.geturl()).split("&")[0]
                URLS.insert({'url': U, 'tags': [subject], 'fetched_on': datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)
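# Hedged usage sketch, not from the original script: one illustrative call to the
# main() helper above. The subject and page count are assumptions; it relies on the
# same module-level globals the original does (SUBJECTS, URLS as a MongoDB collection,
# GoogleScraper, unquote, datetime, time).
if __name__ == '__main__':
    main(subject='python', results_filter='lang', n=2)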
def scrape(keywords, search_engine: str, num_pages: int) -> object:
    scrape_results = []
    titles = []
    links = []
    snippets = []

    config = {
        'use_own_ip': True,
        'keywords': keywords,
        'search_engines': [search_engine],
        'num_pages_for_keyword': num_pages,
        'scrape_method': 'http',
        'sel_browser': 'chrome',
        'do_caching': False,
        'num_workers': 1
    }

    search = GoogleScraper.scrape_with_config(config)

    for serp in search.serps:
        print(serp)
        for link in serp.links:
            titles.append(link.title)
            links.append(link.link)
            snippets.append(str(link.snippet))
            # print(link.title + '\n')
            # print(link.link + '\n')
            # print(link.snippet + '\n')

    scrape_results.append(titles)
    scrape_results.append(links)
    scrape_results.append(snippets)

    return scrape_results
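# Minimal usage sketch, not part of the original snippet: drives the scrape() helper
# defined above. The query, search engine and page count are illustrative assumptions;
# GoogleScraper must be installed for scrape_with_config() to run.
if __name__ == '__main__':
    titles, links, snippets = scrape('open source web scraping', 'google', 1)
    for title, link in zip(titles, links):
        print('{} -> {}'.format(title, link))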
def run(self):
    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keywords': '\n'.join(self.searches.values()),
            'num_of_pages': "%s" % self.num_pages,
            'scrapemethod': self.scrapemethod
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
            'manual_captcha_solving': 'True',
            # 'sleeping_ranges': '5; 1, 2',  # more aggressive than the defaults
        },
        'GLOBAL': {
            'do_caching': 'True',
            # 'do_caching': 'False',
            # 'cachedir': 'dc
            'db': "results_{asctime}.db",
            # 'debug': 'WARNING',
            'debug': 'ERROR',
        },
        'GOOGLE_SEARCH_PARAMS': {
            'start': "0",
            'num': "30",
        }
    }
    if self.proxyfile:
        print("Using proxies from %s" % self.proxyfile)
        config['GLOBAL']['proxy_file'] = self.proxyfile

    # GoogleScraper.config.update_config(config)
    # hack: the GoogleScraper 'db' config path is broken the second time around
    db = GoogleScraper.scrape_with_config(config, usrcb_result=self.cb_results)
    urls = db.execute('SELECT * FROM link').fetchall()
    db.close()
    self.urls.extend(urls)
    return urls
def main(subject="", results_filter="site", n=5): """ Scrape google search up to the nth page and save the results to a MongoDB collection. :param n: """ if not subject: subject = "+".join(random.sample(SUBJECTS, 5)) q = "{}+RSS+site:br".format(subject) lang = "" if results_filter == "lang": q = "{}+RSS".format(subject) lang = "lang_pt" print "searching for {}.".format(subject) for o in range(0, n * 10, n): urls = GoogleScraper.scrape(q, number_pages=n, offset=o, language=lang) for url in urls: # You can access all parts of the search results like that # url.scheme => URL scheme specifier (Ex: 'http') # url.netloc => Network location part (Ex: 'www.python.org') # url.path => URL scheme specifier (Ex: ''help/Python.html'') # url.params => Parameters for last path element # url.query => Query component # print url # print(unquote(url.geturl())) try: U = unquote(url.geturl()).split("&")[0] # sa=U&ei=")[0] # Remove googlebot crap URLS.insert({"url": U, "tags": [subject], "fetched_on": datetime.datetime.now()}) except DuplicateKeyError: pass time.sleep(1)
def get_results(query):
    result = list()
    results = GoogleScraper.scrape(query, num_results_per_page=100, num_pages=10,
                                   offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # Reassemble the parts of the url and strip HTML entities
                # from the title and the snippet.
                result.append([
                    urllib.parse.unquote(link_url.geturl()).replace('&amp;', '&'),
                    link_title.replace('&amp;', '&').replace('&lt;', ' ').replace('&gt;', ' '),
                    link_snippet.replace('&amp;', '&').replace('&lt;', ' ').replace('&gt;', ' ')
                ])
            except:
                pass
    return result
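# Illustrative call (assumed query, not from the original source) to the get_results()
# helper above; each entry it returns is a [url, title, snippet] list.
for url, title, snippet in get_results('python sqlite tutorial'):
    print(url)
    print(title)
    print()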
def googleSearch(query):
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass
def main(subject='', n=5):
    """
    Scrape google search up to the nth page and save the results
    to a MongoDB collection.

    :param n:
    """
    q = "{}+RSS+site:br".format(subject)
    for o in range(0, n*10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o)
        for url in urls:
            # You can access all parts of the search results like that:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path   => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query  => Query component
            # print(unquote(url.geturl()))
            try:
                # Strip the trailing "&sa=U&ei=..." googlebot cruft from the url.
                U = unquote(url.geturl()).split("&")[0]
                URLS.insert({'url': U, 'tags': [subject], 'fetched_on': datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)
def get_proxies(n=5):
    """Read some notoriously known sites and extract some public proxies.

    Scrapes
        - http://www.samair.ru/proxy/

    The quality of these proxies probably isn't worth mentioning, but it's nice
    to test the lack of quality and the behaviour of GoogleScraper.
    """
    r = requests.get('http://www.samair.ru/proxy/')
    # Try to parse the HTML result using lxml.
    try:
        doc = UnicodeDammit(r.text, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
        dom = lxml.html.document_fromstring(r.text, parser=parser)
        dom.resolve_base_href()
    except Exception as e:
        print('Some error occurred while lxml tried to parse: {}'.format(e))

    table = dom.xpath('//table[@id=\'proxylist\']')[0]
    for row in table.findall('tr'):
        print(row.xpath('//td[1]')[0].text_content())

    return GoogleScraper.Proxy()
import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google found with our keyword (as shown on the first page)?
    print(results[0]['num_results_for_kw'])
if __name__ == '__main__':
    # See the config.cfg file for possible values.
    GoogleScraper.Config.update({
        'SCRAPING': {
            'use_own_ip': 'False',
            'keyword': 'HelloWorld'
        },
        'SELENIUM': {
            'sel_browser': 'chrome',  # change to 'phantomjs' if you want
            'manual_captcha_solving': 'True'
        }
    })

    # sample proxy
    proxy = GoogleScraper.Proxy(proto='socks5', host='localhost', port=9050, username='', password='')

    try:
        results = GoogleScraper.scrape('Best SEO tool', scrapemethod='sel')  # , proxy=proxy)
        for page in results:
            for link_title, link_snippet, link_url, *rest in page['results']:
                # You can access all parts of the search results like that:
                # link_url.scheme => URL scheme specifier (Ex: 'http')
                # link_url.netloc => Network location part (Ex: 'www.python.org')
                # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
                # link_url.params => Parameters for last path element
                # link_url.query  => Query component
                try:
                    # This reassembles the parts of the url to the whole thing.
                    print(urllib.parse.unquote(link_url.geturl()))
                except:
                    pass
    except GoogleScraper.GoogleSearchError as e:
        # Completes the truncated outer try block; GoogleSearchError is the
        # library's search exception.
        print(e)
#!/usr/bin/python

import GoogleScraper
import urllib.parse

GoogleScraper.setup_logger()

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO Tool', num_results_per_page=50, num_pages=3,
                                   offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url, *rest in page['results']:
            # link_url.scheme
            # link_url.netloc
            # link_url.path
            # link_url.params
            # link_url.query
            try:
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google reported for the keyword (as shown on the first page)?
    print(results[0]['num_results_per_kw'])
    if size_google is None:
        size_google = 'l'

    query_tag = inputs["tag"]
    google_state = "yes" if inputs["google"] is None else inputs["google"]
    insta_state = "yes" if inputs["insta"] is None else inputs["insta"]
    accepted_states = ['yes', 'no']

    # The scrapers are launched if their states are set to 'yes' or not specified (None).
    if (google_state.lower() in accepted_states) and (insta_state.lower() in accepted_states):
        if google_state.lower() == 'yes':
            print('\n----------------------- Load Google images ------------------------\n')
            google_scraper = GoogleScraper.GoogleScraper(query=query_tag, size=size_google)
            google_scraper.get_urls()
            google_scraper.parse_urls()
            google_scraper.load_images()
        else:
            print("\nGoogleScraper has been ignored.\n")

        if insta_state.lower() == 'yes':
            print('\n------------------------ Load Insta images ------------------------\n')
            insta_scraper = InstaScraper.InstaScraper(tag=query_tag)
            insta_scraper.get_urls()
            insta_scraper.load_images()
        else:
            print("\nInstaScraper has been ignored.\n")
    else:
        # The original message is cut off here; completed from the accepted states above.
        print("The scraping states aren't correct. " +
              "Accepted values are 'yes' and 'no'.")
# urls = GoogleScraper.scrape(title, results_per_page=100, number_pages=1)

con = lite.connect('name.db')
with con:
    cur = con.cursor()
    c = 0
    # TODO: select only titles which were not already downloaded
    cur.execute("SELECT id, title FROM articles;")
    articles = cur.fetchall()
    for article_id, title in articles:
        print("article_id=" + str(article_id) + " title=" + title)
        urls = GoogleScraper.scrape(title, results_per_page=100, number_pages=1)
        data = []
        for url in urls:
            # We use INSERT OR REPLACE to be sure an id will be returned.
            cur.execute("INSERT OR REPLACE INTO urls (url) VALUES(?)", [url])
            url_id = cur.lastrowid
            data.append([article_id, url_id])
        cur.executemany("INSERT OR IGNORE INTO article_url (article_id, url_id) VALUES(?,?)", data)
        con.commit()
        c += 1
        print("--downloaded " + str(c) + "/" + str(len(articles)))
        sleep(10)
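# A hypothetical schema for the tables the snippet above works against (the original
# source does not show it): 'urls.url' needs a UNIQUE constraint for INSERT OR REPLACE
# to deduplicate, and 'article_url' needs one for INSERT OR IGNORE to be meaningful.
import sqlite3 as lite

con = lite.connect('name.db')
with con:
    con.executescript("""
        CREATE TABLE IF NOT EXISTS articles (id INTEGER PRIMARY KEY, title TEXT);
        CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url TEXT UNIQUE);
        CREATE TABLE IF NOT EXISTS article_url (
            article_id INTEGER,
            url_id INTEGER,
            UNIQUE (article_id, url_id)
        );
    """)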
import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3,
                                   offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search results like that:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query  => Query component
            try:
                # This reassembles the parts of the url to the whole thing.
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass

    # How many urls did we get on all pages?
    print(sum(len(page['results']) for page in results))

    # How many hits has google found with our keyword (as shown on the first page)?
    print(results[0]['num_results_for_kw'])
import GoogleScraper
import urllib.parse

if __name__ == '__main__':
    results = GoogleScraper.scrape('HOly shit', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search results like that:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query  => Query component
        try:
            # This reassembles the parts of the url to the whole thing.
            print(urllib.parse.unquote(link_url.geturl()))
        except:
            pass

    # How many urls did we get?
    print(len(results['results']))

    # How many hits has google found with our keyword?
    print(results['num_results_for_kw'])
import GoogleScraper as gs
import CombineSheets as cs
import pandas as pd

# IMPORT ALL FILES INTO THIS ONE SO THE COMPUTER CAN READ THE FILES QUICKLY.
# WHERE ARE ALL THE KEYWORDS STORED: ENTER THE SHEET NAME.
# THE KEYWORDS MUST BE IN THE FIRST COLUMN WITH A HEADER.
# !!!!!!!!!!! GOOGLE CAN REVOKE YOUR INTERNET SEARCH IF RUNNING THIS TOO OFTEN !!!!!!!!!
df = pd.read_excel('fitness-keywords-wordstream.xlsx', sheet_name='sheet1')
# df = pd.read_excel('EXCEL_WORKBOOK_NAME.xlsx', sheet_name='SHEET_NAME')

# THIS LOOP WILL RUN A2 IN THE EXCEL FILE, THEN THE NEXT ROW, AND SO ON UNTIL INDEX 'I' IS NULL.
for i in df.index:
    # df['Keyword'][i] IS THE DATAFRAME AND COLUMN NAME.
    # THE COLUMN NAME MUST MATCH EXACTLY.
    # SINGLE QUOTES IN THE COLUMN HEADER NAME ONLY.
    gs.GoogleSearch(df['Keyword'][i])

# MERGES ALL SHEETS FROM GoogleScrapingSEOKEYFINDER.xlsx FOUND IN THE ROOT DIRECTORY. THE PATH CAN BE
# CHANGED IN CombineSheets, BUT YOU MUST CHANGE IT IN GoogleScraper ALSO. !!!!!!!!!!!!! IMPORTANT !!!!!!!!!
cs.MergeAllDataSheets()

# THE EXCEL BOOK IS CLOSED; YOU SHOULD NOW HAVE AN EXCEL SHEET THAT CONTAINS ALL SHEETS IN ONE SHEET.
# INDEXING MUST BE CHANGED AND ORGANIZED THE WAY YOU SEE FIT.
# FINDING THE NUMBER OF KEYWORDS THAT MATCH THE DESCRIPTION WOULD BE A GOOD PLACE TO START.
# COMPARE KEYWORD PHRASING TO TITLE, DESC, AND URL SEPARATELY.
# AFTER RUNNING THIS MAIN CLASS, RUN COUNTMASTERKEYWORDSTOEXCEL.PY TO MERGE ALL KEYWORD SHEETS INTO ONE.
# THIS GIVES YOU ONE LARGE DOCUMENT THAT CONTAINS ALL SEARCH RESULTS, IN THIS CASE IN THE EXCEL FILE
# SOFTWAREENGINEERINGMASTERDATABASE.XLSX. IT CAN ALWAYS BE CHANGED.
# RUNNING THIS FILE WILL TAKE A WHILE; PLEASE BE CAREFUL AND PATIENT ABOUT HOW OFTEN YOU USE IT.
# IF YOU GET A RECOVERED OR BROKEN DOCUMENT, JUST ALLOW EXCEL TO REMOVE THE BROKEN DATA; IT IS OKAY!
# AFTER RUNNING THIS DOCUMENT YOU MUST CLEAN THE DATA BEFORE RUNNING THE COUNTMASTERKEYWORDSTOEXCEL.PY DOCUMENT!
import urllib.parse

import GoogleScraper

if __name__ == '__main__':
    results = GoogleScraper.scrape('There are', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search results like that:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query  => Query component
        try:
            # This reassembles the parts of the url to the whole thing.
            print(urllib.parse.unquote(link_url.geturl()))
        except:
            pass

    # How many urls did we get?
    print(len(results['results']))

    # How many hits has google found with our keyword?
    print(results['num_results_for_kw'])
def search_via_googler(self, query_paraments):
    """
    args['config']['last_update']

    Applications: tbm=app
    Blogs: tbm=blg
    Books: tbm=bks
    Discussions: tbm=dsc
    Images: tbm=isch
    News: tbm=nws
    Patents: tbm=pts
    Places: tbm=plcs
    Recipes: tbm=rcp
    Shopping: tbm=shop
    Video: tbm=vid
    """
    def fix_urls(url):
        url = url.replace('/amp/', '') if '/amp/' in url else url
        url = url.replace('/amp.html', '') if '/amp.html' in url else url
        url = urllib.parse.urljoin('http://', url) if 'http://' not in url else url
        return url

    google_search_url = 'https://www.google.com/search?tbs=qdr:%s&'
    dateRestrict = query_paraments.get('dateRestrict', 'd')

    config = {
        'use_own_ip': 'True',
        'keywords': [query_paraments['q']],
        'google_search_url': google_search_url % dateRestrict,
        'num_results_per_page': query_paraments.get('results_per_page', 25),
        'num_pages_for_keyword': query_paraments.get('num_pages', 4),
        'num_workers': 2,
        'search_engines': ['google'],
        'search_type': 'normal',
        'scrape_method': 'http',
        'do_caching': False,
        'print_results': None,
    }

    logger.debug('Making search with Googler lib with configuration')

    try:
        google_search = GoogleScraper.scrape_with_config(config)
        urls_without_fix = []
        urls = []
        for serp in google_search.serps:
            urls_without_fix = [r.link for r in serp.links]
            urls = [fix_urls(r.link) for r in serp.links]

        logger.debug(
            ('Google Search fixed links successfully extracted with '
             'query "{}": {:d} links extracted').format(query_paraments['q'], len(urls)))
        logger.debug(
            ('Google Search links without fix successfully extracted '
             'with query "{}":\n{}').format(query_paraments['q'], urls_without_fix))
        logger.debug(
            ('List of link extracted from Google Search with the '
             'query "{}":\n{}').format(query_paraments['q'], urls))

        return urls
    except GoogleScraper.GoogleSearchError as e:
        logger.error(str(e))