Example 1
def main(subject="", results_filter="site", n=5):
    """
    Scrape google search up to the nth page and save the results to a MongoDB collection.
    :param n:
    """
    if not subject:
        subject = "+".join(random.sample(SUBJECTS, 5))
    q = "{}+RSS+site:br".format(subject)
    lang = ""
    if results_filter == "lang":
        q = "{}+RSS".format(subject)
        lang = "lang_pt"
    print "searching for {}.".format(subject)
    for o in range(0, n * 10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o, language=lang)
        for url in urls:
            # You can access all parts of the search result like this:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query => Query component
            # print url
            # print(unquote(url.geturl()))
            try:
                U = unquote(url.geturl()).split("&")[0]  # strip Google's redirect/tracking parameters (sa=, ei=, ...)
                URLS.insert({"url": U, "tags": [subject], "fetched_on": datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)
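
The function relies on module-level names that are not shown in the snippet (SUBJECTS, URLS and the imports). Below is a minimal sketch of the setup it appears to assume; the topic list and the MongoDB database/collection names are illustrative guesses, and the unique index on 'url' is what makes DuplicateKeyError fire on repeated inserts (the collection.insert() call above is the older pymongo API).

import datetime
import random
import time
from urllib.parse import unquote

import GoogleScraper
import pymongo
from pymongo.errors import DuplicateKeyError

# Hypothetical topic pool; the real SUBJECTS list is not part of the snippet.
SUBJECTS = ["politica", "esportes", "tecnologia", "economia", "cultura", "saude"]

# Hypothetical database/collection names.
client = pymongo.MongoClient("localhost", 27017)
URLS = client.feedfinder.urls

# A unique index on 'url' makes duplicate inserts raise DuplicateKeyError,
# which the loop above silently ignores.
URLS.create_index("url", unique=True)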
Example 3
File: use.py Project: maxim-k/armif
import urllib.parse

import GoogleScraper


def get_results(query):
    result = list()
    results = GoogleScraper.scrape(query, num_results_per_page=100, num_pages=10, offset=0, searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search result like this:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query => Query component
            try:
                # geturl() reassembles the parsed parts into the full URL;
                # '&' and angle brackets are escaped/stripped for safe output.
                result.append([urllib.parse.unquote(link_url.geturl()).replace('&', '&amp;'),
                               link_title.replace('&', '&amp;').replace('<', ' ').replace('>', ' '),
                               link_snippet.replace('&', '&amp;').replace('<', ' ').replace('>', ' ')])
            except:
                pass
    return result
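
A hedged usage sketch: the query string is made up, and the triple unpacking follows the [url, title, snippet] lists built by the append call above.

if __name__ == '__main__':
    # Hypothetical query; get_results returns a list of [url, title, snippet] entries.
    for url, title, snippet in get_results('python rss feed parser'):
        print(url, '-', title)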
Example 4
import urllib.parse

import GoogleScraper


def googleSearch(query):

    results = GoogleScraper.scrape(query,
                                   num_results_per_page=50,
                                   num_pages=3,
                                   offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search result like this:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query => Query component
            try:
                # geturl() reassembles the parsed parts into the full URL
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass
Example 5
def main(subject='', n=5):
    """
    Scrape google search up to the nth page and save the results to a MongoDB collection.
    :param n:
    """
    q = "{}+RSS+site:br".format(subject)
    for o in range(0, n*10, n):
        urls = GoogleScraper.scrape(q, number_pages=n, offset=o)
        for url in urls:
            # You can access all parts of the search result like this:
            # url.scheme => URL scheme specifier (Ex: 'http')
            # url.netloc => Network location part (Ex: 'www.python.org')
            # url.path => Hierarchical path (Ex: '/help/Python.html')
            # url.params => Parameters for last path element
            # url.query => Query component
            #print url
            #print(unquote(url.geturl()))
            try:
                U = unquote(url.geturl()).split("&")[0]  # strip Google's redirect/tracking parameters (sa=, ei=, ...)
                URLS.insert({'url': U, 'tags': [subject], 'fetched_on': datetime.datetime.now()})
            except DuplicateKeyError:
                pass
        time.sleep(1)
Example 6
    GoogleScraper.Config.update({
        'SCRAPING': {
            'use_own_ip': 'False',
            'keyword': 'HelloWorld'
        },
        'SELENIUM': {
            'sel_browser': 'chrome',  # change to 'phantomjs' if you prefer
            'manual_captcha_solving': 'True'
        }
    })

    # sample proxy (9050 is Tor's default SOCKS port)
    proxy = GoogleScraper.Proxy(proto='socks5', host='localhost', port=9050, username='', password='')

    try:
        results = GoogleScraper.scrape('Best SEO tool', scrapemethod='sel')#, proxy=proxy)
        for page in results:
            for link_title, link_snippet, link_url, *rest in page['results']:
                # You can access all parts of the search result like this:
                # link_url.scheme => URL scheme specifier (Ex: 'http')
                # link_url.netloc => Network location part (Ex: 'www.python.org')
                # link_url.path => Hierarchical path (Ex: '/help/Python.html')
                # link_url.params => Parameters for last path element
                # link_url.query => Query component
                try:
                    print(urllib.parse.unquote(link_url.geturl()))  # geturl() reassembles the parsed parts into the full URL
                except:
                    pass
        # How many urls did we get on all pages?
        print(sum(len(page['results']) for page in results))
    except Exception as e:
        # report any error raised while scraping
        print(e)
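
This excerpt is indented as if it sits inside a function or main block and its imports are not shown. A minimal sketch of the assumed scaffolding follows; the import names match the other examples and the __main__ guard is a guess.

import urllib.parse

import GoogleScraper

if __name__ == '__main__':
    # the Config.update() call, the Proxy definition and the scraping loop above go here
    pass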
Example 7
import GoogleScraper
import urllib.parse

if __name__ == '__main__':

    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search result like this:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query => Query component
            try:
                print(urllib.parse.unquote(link_url.geturl()))  # geturl() reassembles the parsed parts into the full URL
            except:
                pass

# How many urls did we get on all pages?
print(sum(len(page['results']) for page in results))

# How many hits did Google find for our keyword (as shown on the first page)?
print(results[0]['num_results_for_kw'])

Example 8
#!/usr/bin/env python3
import GoogleScraper
import urllib.parse

GoogleScraper.setup_logger()

if __name__ == '__main__':
	
	results = GoogleScraper.scrape('Best SEO Tool', 
									num_results_per_page=50, 
									num_pages=3, 
									offset=0,
									searchtype='normal')

	for page in results:
		for link_title, link_snippet, link_url, *rest in page['results']:
			# link_url.scheme
			# link_url.netloc
			# link_url.path
			# link_url.params
			# link_url.query
			try:
				print(urllib.parse.unquote(link_url.geturl()))
			except:
				pass

# print the total number of urls returned across all pages
print(sum(len(page['results']) for page in results))

# print how many hits Google reports for the keyword (as shown on the first page)
print(results[0]['num_results_for_kw'])
Example 9
import sqlite3 as lite
from time import sleep

import GoogleScraper

#urls = GoogleScraper.scrape(title, results_per_page=100, number_pages=1)

con = lite.connect('name.db')

with con:
  cur = con.cursor()
  c = 0
  
  #TODO select only titles which were not already downloaded 
  cur.execute("SELECT id, title FROM articles;") 
  articles = cur.fetchall() 
  
  for article_id, title in articles:
    print("article_id=" + str(article_id) + " title=" + title)
    
    urls = GoogleScraper.scrape(title, results_per_page=100, number_pages=1)

    data = []

    for url in urls: 
      # We use INSERT or REPLACE to be sure an id will be returned 
      cur.execute("INSERT OR REPLACE INTO urls (url) VALUES(?)", [url])  
      url_id = cur.lastrowid
      data.append([article_id, url_id]) 
    
    cur.executemany("INSERT OR IGNORE INTO article_url (article_id, url_id) VALUES(?,?)", data)
    con.commit()
    
    c += 1
    print("--downloaded " + str(c) + "/" + str(len(articles)))
    sleep(10) 
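
The snippet assumes name.db already contains articles, urls and article_url tables. A plausible schema sketch follows; only the columns actually referenced above are certain, everything else is a guess.

import sqlite3 as lite

con = lite.connect('name.db')
with con:
  cur = con.cursor()
  # articles to look up (id and title are read by the loop above)
  cur.execute("CREATE TABLE IF NOT EXISTS articles (id INTEGER PRIMARY KEY, title TEXT)")
  # scraped urls; UNIQUE is what lets INSERT OR REPLACE deduplicate rows
  cur.execute("CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url TEXT UNIQUE)")
  # many-to-many link table; the UNIQUE pair makes INSERT OR IGNORE skip duplicates
  cur.execute("""CREATE TABLE IF NOT EXISTS article_url (
                   article_id INTEGER REFERENCES articles(id),
                   url_id INTEGER REFERENCES urls(id),
                   UNIQUE (article_id, url_id))""")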
Example 10
import GoogleScraper
import urllib.parse

if __name__ == '__main__':

    results = GoogleScraper.scrape('Best SEO tool',
                                   num_results_per_page=50,
                                   num_pages=3,
                                   offset=0,
                                   searchtype='normal')
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search result like this:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query => Query component
            try:
                # geturl() reassembles the parsed parts into the full URL
                print(urllib.parse.unquote(link_url.geturl()))
            except:
                pass

# How many urls did we get on all pages?
print(sum(len(page['results']) for page in results))

# How many hits did Google find for our keyword (as shown on the first page)?
print(results[0]['num_results_for_kw'])
Example 11
import GoogleScraper
import urllib.parse

if __name__ == '__main__':

    results = GoogleScraper.scrape('HOly shit', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search result like this:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query => Query component
        try:
            print(urllib.parse.unquote(link_url.geturl()))  # geturl() reassembles the parsed parts into the full URL
        except:
            pass

# How many urls did we get?
print(len(results['results']))

# How many hits did Google find for our keyword?
print(results['num_results_for_kw'])
Example 12
import urllib.parse
import GoogleScraper

if __name__ == '__main__':

    results = GoogleScraper.scrape('There are', number_pages=1)
    for link_title, link_snippet, link_url in results['results']:
        # You can access all parts of the search result like this:
        # link_url.scheme => URL scheme specifier (Ex: 'http')
        # link_url.netloc => Network location part (Ex: 'www.python.org')
        # link_url.path => Hierarchical path (Ex: '/help/Python.html')
        # link_url.params => Parameters for last path element
        # link_url.query => Query component
        try:
            print(urllib.parse.unquote(link_url.geturl()))  # geturl() reassembles the parsed parts into the full URL
        except:
            pass

# How many urls did we get?
print(len(results['results']))

# How many hits did Google find for our keyword?
print(results['num_results_for_kw'])
Example 13
import GoogleScraper
import urllib.parse

if __name__ == '__main__':

    results = GoogleScraper.scrape('Best SEO tool', num_results_per_page=50, num_pages=3, offset=0)
    for page in results:
        for link_title, link_snippet, link_url in page['results']:
            # You can access all parts of the search result like this:
            # link_url.scheme => URL scheme specifier (Ex: 'http')
            # link_url.netloc => Network location part (Ex: 'www.python.org')
            # link_url.path => Hierarchical path (Ex: '/help/Python.html')
            # link_url.params => Parameters for last path element
            # link_url.query => Query component
            try:
                print(urllib.parse.unquote(link_url.geturl()))  # geturl() reassembles the parsed parts into the full URL
            except:
                pass

# How many urls did we get on all pages?
print(sum(len(page['results']) for page in results))

# How many hits did Google find for our keyword (as shown on the first page)?
print(results[0]['num_results_for_kw'])