def Search(paper):
    """Return the IEEE Xplore article number for *paper*.

    Googles the paper title restricted to ieeexplore.ieee.org and pulls
    the value after the last '=' (the arnumber) out of the top hit URL.
    """
    gs = GoogleSearch(paper + " ieeexplore.ieee.org")
    # Assumes the top hit is the paper's Xplore page -- TODO confirm.
    url = gs.top_urls()[0]
    # FIX(idiom): the identity comprehension [w for w in url.split("=")]
    # was a no-op wrapper around split().
    return url.split("=")[-1]
def main():
    """Crawl the pages behind Google results for each search word and
    dump everything to a JSON file.

    For each word: hash it, fetch up to 40 result links, download and
    normalize each page's text, and record per-link success/failure.
    """
    text_crawler = TextCrawler()
    google_search = GoogleSearch()
    search_words = ["コロナ ビール", "コロナ ウイルス", "コロナ 太陽"]
    search_result = []
    for word in search_words:
        # BUG FIX: missing f-prefix printed the literal "{word}".
        print(f"start 「{word}」")
        word_hash = hashlib.sha256(word.encode("utf-8")).hexdigest()
        links_results = []
        links = google_search.search_links(word, num=40)
        for link in links:
            print(f"getting ... {link}")
            try:
                text = text_crawler.get_text(link)
                text = normalize(text)
                links_results.append(
                    {"url": link, "success": 1, "text": text, "reason": None}
                )
            except Exception as e:
                # BUG FIX: store str(e); json.dumps below cannot
                # serialize Exception instances.
                links_results.append(
                    {"url": link, "success": 0, "text": None, "reason": str(e)}
                )
        search_result.append({"word": word, "hash": word_hash, "links": links_results})
    # search_result_json is a module-level path defined elsewhere in the file.
    with open(search_result_json, "w") as f:
        f.write(json.dumps(search_result, indent=4, ensure_ascii=False))
    print("finish to output search result.")
def googleSearch(self): #Query can be site specific-> site:link word google = GoogleSearch(self.query) ct = GoogleSearch(self.query).count() print "No. Of Google Results: %d" % (ct) results = google.top_urls() return results
def get_n_results(query, n):
    """ Print a list of n hits from Google"""
    from pprint import pprint
    search = GoogleSearch(query)
    hits = search.get_results(n)
    for hit in hits:
        pprint(hit)
    print
def retrieve_data(query, number_results, filter=0):
    '''(str,int,str) -> str

    Collect the page text of Google results for *query* and return it as
    one concatenated string.

    filter == 0: search Google News for today; filter == 1: plain query
    restricted to today's daterange; otherwise an unrestricted query.
    (`filter` shadows the builtin, but the name is part of the public
    keyword interface and is kept.)
    '''
    # FIX: removed unused locals progress_time / progress.
    text_data = ""
    today = datetime.date.today()
    # NOTE(review): year+day with no month (e.g. "20255"); also Google's
    # daterange: operator expects Julian day numbers -- confirm intent.
    today_to_str = str(today.year) + str(today.day)
    if filter == 0:
        crawler = GoogleSearch().search(
            "site:news.google.com " + query + " daterange:" + today_to_str,
            num_results=number_results)
        print("Scanning google news for: " + query + " today is " + today_to_str)
    elif filter == 1:
        crawler = GoogleSearch().search(
            query + " daterange:" + today_to_str + "-" + today_to_str,
            num_results=number_results)
        print("You searched: " + query + " daterange:" + today_to_str)
    else:
        crawler = GoogleSearch().search(query, num_results=number_results)
    for url in crawler.results:
        print("visiting: ")
        text_data += " " + url.getText()
    return text_data
def print_top_results(query):
    """ Print a list of top hits for a query. Like a mini returned first
    page on Google"""
    from pprint import pprint
    results = GoogleSearch(query).top_results()
    for result in results:
        pprint(result)
    print
def getGoogleContent(query):
    """Return the URLs of the top Google results for *query*."""
    gs = GoogleSearch(query)
    return [hit['url'] for hit in gs.top_results()]
def print_top_results_in_spanish(query):
    """ Print a list of top hits for a query. Like a mini returned first
    page on Google. Language setted in parameter hl"""
    from pprint import pprint
    spanish_search = GoogleSearch(query, hl='es')
    for result in spanish_search.top_results():
        pprint(result)
    print
def x_vs_y_count_match(x, y): """Which of the two words is used more on the Internet?""" nx = GoogleSearch(x).count() ny = GoogleSearch(y).count() print '%s vs %s:' % (x,y) report = '%s wins with %i vs %i' if nx > ny: print report % (x,nx,ny) elif nx < ny: print report % (y,ny,nx) else: print "it's a tie with %s each!" % nx return nx, ny
def imdb_suggest(self, query):
    """Google for IMDb title pages matching *query* and return a list of
    {'id', 'title'} suggestion dicts."""
    # IMDb sub-pages we exclude from the results (commented entries are
    # deliberately allowed through).
    excludes = [
        #"Parents Guide",
        "Plot Summary", "Release Info", "Quotes", "Taglines", "FAQ",
        "Trivia", "News",
        #"Full Cast",
        "Technical Specifications", "Goofs", "Filming Locations",
        "User ratings",
        #"Critic Reviews",
        "Company credits", "Synopsis", "External Reviews", "Soundtracks",
        "Recommendations"
    ]
    exclusion_terms = ' '.join(['-"%s"' % e for e in excludes])
    google_query = ' '.join([query, exclusion_terms, 'site:imdb.com/title/'])
    gs = GoogleSearch(google_query, use_proxy=False)
    suggestions = []
    for hit in gs.top_results():
        match = self.imdb_pattern.match(hit["url"])
        if match:
            title = hit["titleNoFormatting"].replace(' - IMDb', '')
            suggestions.append({"id": match.group(1), "title": title})
    logging.debug("google found %d results with query: %s"
                  % (len(suggestions), google_query))
    return suggestions
def get_google_result(query): ''' FUCN return {content:' ', url: ''} for google query search ''' search_set = set() gs = GoogleSearch(query) for hit in gs.top_results(): result = hit['titleNoFormatting'] #pprint(result) if '|' in result: que = hit['titleNoFormatting'].split('|') if '-' in result: que = hit['titleNoFormatting'].split('-') search_set.add(que[0]) print search_set for search in search_set: get_wiki_result(search) print '*'*20+'Google End'+'*'*20
def imdb_id_for_movie(movie_name): """Retrieve the imdb id for a movie from the name (and year if there are remakes)""" query = 'site:imdb.com %s' % movie_name url = GoogleSearch(query).top_url() import re imdb_id = re.search('/tt[0-9]+/', url).group(0).strip('/') print 'The imdb id for %s is %s' % (movie_name, imdb_id) return imdb_id
def __findUrban__(self, phrase):
    """Google urbandictionary.com for *phrase* and return the matching
    entry titles with the 'Urban Dictionary: ' prefix stripped."""
    gs = GoogleSearch(phrase + " site:urbandictionary.com")
    return [hit["titleNoFormatting"].replace("Urban Dictionary: ", "")
            for hit in gs.top_results()]
def search(self, **kwargs):
    """Google each configured site for this item and hand every hit to
    the matching `_parse_<site>_result` handler, stopping at the first
    handler that reports success.

    Keyword args:
        site -- config sections to try (default ['imdb','tvdb','none'])
        keys -- self.data fields quoted into the query
                (default ['title','subtitle'])
    """
    # BUG FIX(idiom): dict.has_key() is Python-2-only and deprecated;
    # dict.get() is equivalent and portable.
    sites = kwargs.get('site', ['imdb','tvdb', 'none'])
    keys = kwargs.get('keys', ['title', 'subtitle'])
    for site in sites:
        query = self.config[site]['query']
        for key in keys:
            # Quote each data field so Google matches it verbatim.
            query = query + " " + '"' + self.data[key] + '"'
        print("[" + query + "]")
        gs = GoogleSearch(query)
        if len(gs.result_data['results']) > 0:
            for result in gs.top_results():
                if getattr(self, '_parse_'+site+'_result')(result):
                    return
    return
def imdb_suggest(self, query):
    """Search Google for IMDb title pages matching *query*; return
    suggestion dicts of the form {'id': ..., 'title': ...}."""
    # Sub-pages filtered out of the search (commented ones stay allowed).
    excludes = [
        #"Parents Guide",
        "Plot Summary", "Release Info", "Quotes", "Taglines", "FAQ",
        "Trivia", "News",
        #"Full Cast",
        "Technical Specifications", "Goofs", "Filming Locations",
        "User ratings",
        #"Critic Reviews",
        "Company credits", "Synopsis", "External Reviews", "Soundtracks",
        "Recommendations"
    ]
    negatives = ' '.join(['-"%s"' % e for e in excludes])
    google_query = ' '.join([query, negatives, 'site:imdb.com/title/'])
    gs = GoogleSearch(google_query, use_proxy=False)
    suggestions = []
    for hit in gs.top_results():
        m = self.imdb_pattern.match(hit["url"])
        if not m:
            continue
        suggestions.append({
            "id": m.group(1),
            "title": hit["titleNoFormatting"].replace(' - IMDb', '')
        })
    logging.debug("google found %d results with query: %s"
                  % (len(suggestions), google_query))
    return suggestions
def gather_pdfs(self):
    """Google for self.search, collect the PDF links, and append up to
    ten of them to '<search>/PDFs.txt' via the module-level `files`
    helper."""
    engine = GoogleSearch()
    engine.query(self.search)
    engine.get_results()
    self.pdfs = engine.get_pdfs()
    print(self.pdfs)
    saved = 0
    for pdf in self.pdfs:
        if saved == 10:
            break
        files.append_data(pdf, self.search + "/PDFs.txt")
        saved += 1
def get_urls_from_google(terms):
    """Google *terms* and return up to LIMIT result URLs (shuffled when
    SHUFFLE is set); waits and retries when Google returns nothing."""
    url_list = []
    # BUG FIX: the body referenced an undefined name `term`; the
    # parameter is `terms`.
    print ('Searching the term : ' + terms)
    response = GoogleSearch().search(terms)
    for result in response.results:
        # Keep only the first LIMIT links.
        if len(url_list) < LIMIT:
            url_list.append(result.url)
    if (SHUFFLE):
        # shuffle url list
        shuffle(url_list)
    #print ('URL pool : '+'\n'.join(url_list))
    if (len(url_list) < 1):
        # Empty result sets are usually transient (timeout / blocking):
        # pause and retry.
        # NOTE(review): time.sleep takes seconds, so 1000 is ~17 minutes
        # -- confirm the intended unit.
        sleep(1000)
        # BUG FIX: the retry's return value was discarded; return it so
        # the caller gets the retried URL list.
        return get_urls_from_google(terms)
    return url_list
def SearhcGoogle(string):
    """Speak the snippet text of every top Google hit for *string*.

    (Name typo kept: it is the public interface.)
    """
    for hit in GoogleSearch(string).top_results():
        chatvoice(hit[u'content'])
def url_search(query, lucky=True):
    """Return Google URLs for *query*: a one-element list with the top
    hit when *lucky*, otherwise all top URLs.  A ProxyError from the
    search backend surfaces as ValueError."""
    gs = GoogleSearch(query)
    try:
        if lucky:
            return [gs.top_url()]
        return gs.top_urls()
    except ProxyError:
        raise ValueError
from openpyxl import Workbook, load_workbook
from googlesearch import GoogleSearch

# For every row of every sheet in the attendee workbook, google
# "<name> <job> email" and print the query plus its top result URL.
wb = load_workbook('Attendees.xlsx')
for sheet in wb:
    ws = wb[sheet.title]
    for row_idx in range(1, 290):
        name_value = ws.cell(row=row_idx, column=1).value
        job_value = ws.cell(row=row_idx, column=3).value
        search = str(name_value) + " " + str(job_value) + " email"
        gs = GoogleSearch(search)
        print(search)
        print(gs.top_urls()[0])
        print("\n")
from googlesearch import GoogleSearch
import CRUD
import time
import threading

# Input file holding the keywords to search (one per line).
input_txt_path_name = 'to_search.txt'
# Shared search client reused by every lookup.
GS = GoogleSearch()

def show_contents():
    # show all contents
    # Prints every stored content row as "<title>: <url>".
    contents = CRUD.read_content()
    for content in contents:
        print(content.title + ': ' + content.url)

def get_title_and_url_by_keyword(keyword, num_results):
    # like the name
    # Googles *keyword* and stores each result's title/url via CRUD,
    # creating the keyword row first or clearing its old contents.
    existing_keyword = CRUD.read_keyword(keyword)
    if len(existing_keyword) == 0:
        # Unseen keyword: create it, then re-read so existing_keyword[0]
        # refers to the stored row.
        # NOTE(review): this appends the full result of read_keyword()
        # (apparently a list), not its first element -- confirm that
        # CRUD.create_content below accepts that value.
        CRUD.create_keyword(keyword)
        existing_keyword.append(CRUD.read_keyword(keyword))
    else:
        # Known keyword: drop its previously stored contents.
        CRUD.delete_content(existing_keyword[0].id)
    response = GS.search(keyword, num_results=num_results)
    for result in response.results:
        CRUD.create_content(result.title, result.url, existing_keyword[0])
# Notebook-style scratch script (Python 2): prints a parsed XML tree
# root, runs a sample Google search, then pip-installs search packages.
root = tree.getroot()  # NOTE(review): `tree` is parsed in an earlier cell -- not visible here
print root
#print p.xpath('//body')[0].text_content()
# GET https://www.googleapis.com/customsearch/v1?key=INSERT_YOUR_API_KEY&cx=017576662512468239146:omuauf_lfve&q=lectures
#
# In[24]:
#Sending request to google search engine and getting results from there
from googlesearch import GoogleSearch
from pprint import pprint
gs = GoogleSearch('Neelu')
for hit in gs.top_results():
    pprint(hit)
print
# In[23]:
#install google and pygoogle
get_ipython().system(u'pip install google')
get_ipython().system(u'pip install pygoogle')
# In[31]:
import google
def google(bot, chat_id, text):
    """Google *text* and send the results to the chat as
    '<url> -- <title>' lines, with <b> formatting tags stripped."""
    lines = ["%s -- %s" % (hit['url'], hit['title'])
             for hit in GoogleSearch(text).top_results()]
    message_str = "\n\r".join(lines).replace("<b>", "").replace("</b>", "")
    bot.sendMessage(chat_id=chat_id, text=message_str)
def get_expand(self, word='keyword'):
    """Run a Google search for *word* and return the fetched articles."""
    searcher = GoogleSearch()
    searcher.search(word)
    return searcher.get_articles()
from googlesearch import GoogleSearch
from readability.readability import Document
from bs4 import BeautifulSoup
import re
import sys
import requests
import urllib


def remove_non_ascii(text):
    """Strip every non-ASCII character from *text*."""
    return ''.join(i for i in text if ord(i) < 128)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print('no urls found')
        sys.exit(0)
    # Join all CLI arguments into a single query string.
    query = ''
    for i in range(1, len(sys.argv)):
        query += sys.argv[i] + ' '
    gs = GoogleSearch(query)
    # BUG FIX: top_urls() was called twice (the length check and the
    # assignment each triggered a lookup); fetch the list once.
    urls = gs.top_urls()
    if len(urls) < 1:
        print('no urls found')
        sys.exit(0)
    # for url in urls:
    #     print url
    # Fetch the top hit, extract the readable article body, and print
    # the first 1000 ASCII characters.
    html = urllib.urlopen(urls[0]).read()
    soup = BeautifulSoup(Document(html).summary(), "lxml")
    print(remove_non_ascii(soup.get_text()[0:1000]))
def test_top_results():
    """Top results for a news query include the expected headline."""
    results = GoogleSearch('george osborne hinkley point').top_results()
    found = any(r['titleNoFormatting'].startswith('George Osborne presses')
                for r in results)
    assert found
def imp(text):
    """Log-scaled 'importance' of *text*: 10 * log10 of its Google hit
    count."""
    hits = GoogleSearch(text).count()
    return 10 * math.log10(hits)
def get_url_by_name(name, elem):
    """Return the goodreads.com/<elem>/show URL for *name* from the top
    Google results, or 0 when no result matches."""
    gs = GoogleSearch('site:goodreads.com/{0}/show {1}'.format(elem, name))
    wanted = 'goodreads.com/{0}/show'.format(elem)
    for url in gs.top_urls():
        if wanted in url:
            return str(url)
    return 0
def __findGoogle__(self, phrase):
    """Google *phrase* and return the list of result titles.

    BUG FIX: the original assigned ``matches = hit[...]`` inside the
    loop, so the list initialised above was overwritten and only the
    LAST title (a bare string) was returned.  Appending returns the full
    list, matching the sibling ``__findUrban__`` helper's behaviour.
    """
    gs = GoogleSearch(phrase)
    matches = []
    for hit in gs.top_results():
        matches.append(hit["titleNoFormatting"])
    return matches
def top_search_results(text):
    """Return (title, content) pairs for the top Google hits on *text*."""
    return [(hit['title'], hit['content'])
            for hit in GoogleSearch(text).top_results()]
def test_top_results_unicode():
    """A CJK query surfaces the known BBC Chinese article URL."""
    hits = GoogleSearch('乔治·奥斯本欣克利点').top_results()
    expected = 'http://www.bbc.com/zhongwen/simp/uk/2015/10/151021_analysis_uk_xi_visit_nuclear_deal'
    assert any(hit['url'] == expected for hit in hits)
#!/usr/bin/env python from codecs import open from googlesearch import GoogleSearch from random import uniform from sys import argv from time import sleep if __name__ == '__main__': if len(argv) != 3: print('ERROR: missing input and output filenames') exit(1) with open(argv[2], 'w', encoding='utf-8') as outputfile: with open(argv[1], 'r', encoding='utf-8') as inputfile: processed = 0 for line in inputfile.readlines(): line = line[:-1].encode('utf-8') print(line) count = GoogleSearch(line, use_proxy=False).count() line = str(count) + '\t' + line + '\n' outputfile.write(line.decode('utf-8')) processed += 1 if processed % 100 == 0: print(processed) sleep(uniform(7, 10))
location = json.loads( r.text)['results']['places']['focus']['states'] elif (json.loads(r.text)['results']['places']['focus']['countries'] != []): location = json.loads( r.text)['results']['places']['focus']['countries'] else: continue else: continue name = location[0]['name'] lat = location[0]['lat'] lon = location[0]['lon'] searchText = disasterKeyWord + " " + name + " " + date.strftime("%d %B %Y") #print searchText search = GoogleSearch(searchText) time.sleep(300) wikiTitle = search.top_results()[0]['title'] wikiURL = search.top_results()[0]['url'] wikiContent = search.top_results()[0]['content'] #wikiSearchList = wikipedia.search(searchText); #print wikiSearchList # for wikiSearch in wikiSearchList: # if "2016" in wikiSearch : # wikiSearchWord = wikiSearch # break # else: # wikiSearchWord = wikiSearchList[0] # if ( wikiSearchWord != "" ): # wikiContent = wikipedia.page(wikiSearch) # wikiTitle = date.strftime("%d %B %Y") + " " + wikiContent.title
def search_wikipedia(query): """Query Wikipedia and show the top hit""" gs = GoogleSearch("site:wikipedia.com %s" % query) print gs.top_result()['titleNoFormatting'] print gs.top_url() return gs.top_url()
def Google1(string):
    """Speak the snippet of only the FIRST Google hit for *string*; do
    nothing when there are no results."""
    results = iter(GoogleSearch(string).top_results())
    first = next(results, None)
    if first is not None:
        chatvoice(first[u'content'])