def googlesearch(name, location=False):
    """ Perform Google Search lookup. """
    # The base plan for SerpAPI is rate limited to 1k calls per hour.
    # We intentionally slow this down to avoid hitting the rate limit.
    if not serp_api_fast:
        sleep(2.5)
    if not location:
        client = GoogleSearch({"q": name, "api_key": serp_api_key})
    else:
        client = GoogleSearch({
            "q": name,
            "location": location,
            "api_key": serp_api_key
        })
    result = client.get_json()
    try:
        domain = result['organic_results'][0]['link']
        tldr = tldextract.extract(domain)
        return '{}.{}'.format(tldr.domain, tldr.suffix)
    except (KeyError, IndexError):  # no organic results returned
        print("Unable to lookup record from SerpAPI.")
        return
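# Module-level setup assumed by googlesearch() above - a minimal sketch; the
# actual key handling and the serp_api_fast flag live elsewhere in the project:
from time import sleep

import tldextract
from serpapi import GoogleSearch

serp_api_key = "YOUR_SERPAPI_KEY"  # placeholder, not a real key
serp_api_fast = False              # keep False to stay under the rate limit

# Example call (hypothetical company name):
# googlesearch("Acme Corporation", location="Austin, Texas")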
def test_get_search_archive(self):
    search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
    search_result = search.get_dictionary()
    search_id = search_result.get("search_metadata").get("id")
    archived_search_result = GoogleSearch({}).get_search_archive(
        search_id, 'json')
    self.assertEqual(
        archived_search_result.get("search_metadata").get("id"), search_id)
    html_buffer = GoogleSearch({}).get_search_archive(search_id, 'html')
    self.assertGreater(len(html_buffer), 10000)
def test_search_by_location(self):
    for city in ["new york", "paris", "berlin"]:
        location = GoogleSearch({}).get_location(city, 1)[0]["canonical_name"]
        search = GoogleSearch({
            "q": "best coffee shop",  # search query
            "location": location,
            "num": 10,
            "start": 0
        })
        data = search.get_json()
        top_result = data['organic_results'][0]["title"]
        print("top coffee result for " + location + " is: " + top_result)
def test_async():
    # store searches
    search_queue = Queue()
    # Serp API search
    search = GoogleSearch({"location": "Austin,Texas", "async": True})
    json_q = load_json("./dataset/Questions_with_Ans.json")
    # json_q = load_json("./dataset/question.json")
    ll = list(map(lambda x: x["Question"], json_q))
    # loop through questions
    for question in ll:
        print("execute async search: q = " + question)
        search.params_dict["q"] = question
        data = search.get_dict()
        print("add search to the queue where id: " +
              data['search_metadata']['id'])
        # add search to the search_queue
        search_queue.put(data)
    print("wait until all search statuses are cached or success")
    # Create regular search
    search = GoogleSearch({"async": True})
    while not search_queue.empty():
        data = search_queue.get()
        search_id = data['search_metadata']['id']
        # retrieve search from the archive - blocker
        print(search_id + ": get search from archive")
        search_archived = search.get_search_archive(search_id)
        print(search_id + ": status = " +
              search_archived['search_metadata']['status'])
        # check status
        if re.search('Cached|Success',
                     search_archived['search_metadata']['status']):
            print(search_id + ": search done with q = " +
                  search_archived['search_parameters']['q'])
            print(search_archived["organic_results"])
        else:
            # requeue the search metadata (not the client) so the id can be
            # looked up again on the next pass
            print(search_id + ": requeue search")
            search_queue.put(data)
            # wait 1s
            time.sleep(1)
    # search is over.
    print('all searches completed')
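# load_json() is not defined in this snippet. A minimal sketch, assuming it
# simply reads and parses a JSON file from the dataset directory:
import json

def load_json(path):
    # return the parsed contents of a JSON file
    with open(path, encoding="utf-8") as f:
        return json.load(f)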
def test_paginate(self):
    # to get 2 pages
    start = 0
    end = 20
    # basic search parameters
    params = {
        "q": "coca cola",
        "tbm": "nws",
        "api_key": os.getenv("API_KEY"),
        "start": start,
        "end": end
    }
    # as a proof of concept, collect the result urls
    urls = []
    # initialize a search
    search = GoogleSearch(params)
    # create a python generator
    pages = search.pagination()
    # fetch one search result page per iteration
    # using a basic python for loop
    # which invokes the python iterator under the hood.
    for page in pages:
        print(f"Current page: {page['serpapi_pagination']['current']}")
        for news_result in page["news_results"]:
            print(
                f"Title: {news_result['title']}\nLink: {news_result['link']}\n"
            )
            urls.append(news_result['link'])
    # double check that things add up against the total number of pages expected;
    # the exact number varies depending on the search engine backend.
    self.assertGreater(len(urls), 200)
def get_results(item, location):
    params = {
        "q": item,
        "tbm": "shop",
        "location": location,
        "hl": "en",
        "gl": "us",
        "api_key": "286dc1ea151c8c789b1babc2c6e89694919c91e5edb1908278d4c771c5fdcf68",
        "num": 30
    }
    client = GoogleSearch(params)
    results = client.get_dict()
    results = results["shopping_results"]
    item_list = []
    for result in results:
        item_list.append(
            ItemData(result.get("title"), result.get("link"),
                     result.get("price"), result.get("snippet"),
                     result.get("source")))
    return item_list
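# ItemData is not defined in this snippet. A minimal sketch of what it
# presumably looks like, assuming a plain value object holding the fields
# pulled from each shopping result:
from dataclasses import dataclass

@dataclass
class ItemData:
    title: str
    link: str
    price: str
    snippet: str
    source: str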
def test_async(self):
    # store searches
    search_queue = Queue()
    # Serp API search
    search = GoogleSearch({"location": "Austin,Texas", "async": True})
    # loop through companies
    for company in ['amd', 'nvidia', 'intel']:
        print("execute async search: q = " + company)
        search.params_dict["q"] = company
        data = search.get_dict()
        if data is None:
            print("oops data is empty for: " + company)
            continue
        print("add search to the queue where id: " +
              data['search_metadata']['id'])
        # add search to the search_queue
        search_queue.put(data)
    print("wait until all search statuses are cached or success")
    # Create regular search
    search = GoogleSearch({"async": True})
    while not search_queue.empty():
        data = search_queue.get()
        search_id = data['search_metadata']['id']
        # retrieve search from the archive - blocker
        print(search_id + ": get search from archive")
        search_archived = search.get_search_archive(search_id)
        print(search_id + ": status = " +
              search_archived['search_metadata']['status'])
        # check status
        if re.search('Cached|Success',
                     search_archived['search_metadata']['status']):
            print(search_id + ": search done with q = " +
                  search_archived['search_parameters']['q'])
        else:
            # requeue the search metadata (not the client) so the id can be
            # looked up again on the next pass
            print(search_id + ": requeue search")
            search_queue.put(data)
            # wait 1s
            time.sleep(1)
    # search is over.
    print('all searches completed')
def test_get_json(self):
    search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
    data = search.get_json()
    self.assertEqual(data["search_metadata"]["status"], "Success")
    self.assertIsNone(data.get("error"))
    self.assertIsNotNone(data["search_metadata"]["google_url"])
    self.assertIsNotNone(data["search_metadata"]["id"])
    self.assertIsNotNone(data['local_results']['places'][0])
def print_form():
    global CONTEXT
    if request.method == "GET":
        # serve HTML page
        return render_template("index.html")
    else:
        # handle text from submitted form
        CONTEXT["photos"].clear()
        text_book = request.json["text_book"]
        slider_val = int(request.json["slider_val"])
        # split text into sections
        text_book_sentences = text_book.split('.')
        # get rid of last empty string (after last sentence)
        text_book_sentences = text_book_sentences[:-1]
        num_sentences = len(text_book_sentences)
        text_book_sections = []
        for idx in range(0, num_sentences, slider_val):
            if idx + slider_val < num_sentences:
                text_book_sections.append(". ".join(
                    text_book_sentences[idx:(idx + slider_val)]))
            else:
                text_book_sections.append(". ".join(text_book_sentences[idx:]))
        # summarize each section
        url = "https://textanalysis-text-summarization.p.rapidapi.com/text-summarizer"
        summaries = []
        for section in text_book_sections:
            payload = {"url": "", "text": section, "sentnum": 1}
            headers = {
                'content-type': "application/json",
                'x-rapidapi-key': "3370a90c6bmsh4469eda97977206p1dbffdjsne99d3fc5a7b0",
                'x-rapidapi-host': "textanalysis-text-summarization.p.rapidapi.com"
            }
            summary = json.loads(
                requests.request("POST",
                                 url,
                                 data=json.dumps(payload),
                                 headers=headers).text)
            summaries.append(summary["sentences"][0])
            print(summary["sentences"])
        # perform image lookup for each summary
        for idx, summary in enumerate(summaries):
            # make call to image API
            params = {
                "q": summary,
                "tbm": "isch",
                "ijn": "0",
                "api_key": NEW_API_KEY
            }
            search = GoogleSearch(params)
            results = search.get_dict()
            images_results = results['images_results']
            if images_results and ("original" in images_results[0]):
                link = images_results[0]["original"]
                print(link)
                CONTEXT["photos"][text_book_sections[idx]] = link
        return redirect(url_for('view_results'))
def test_get_json(self):
    search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
    data = search.get_json()
    self.assertEqual(data["search_metadata"]["status"], "Success")
    self.assertIsNotNone(data["search_metadata"]["google_url"])
    self.assertIsNotNone(data["search_metadata"]["id"])
    # pp = pprint.PrettyPrinter(indent=2)
    # pp.pprint(data['local_results'])
    self.assertIsNotNone(data['local_results']['places'][0])
def test_paginate(self):
    search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
    pages = search.pagination(0, 20, 10)
    urls = []
    for page in pages:
        urls.append(page['serpapi_pagination']['next'])
    self.assertEqual(len(urls), 2)
    self.assertTrue("start=10" in urls[0])
    print(urls[1])
    self.assertTrue("start=21" in urls[1])
def test_get_json(self):
    search = GoogleSearch({"q": "Coffee", "engine": "google_scholar"})
    data = search.get_json()
    print(data['search_metadata'])
    search_id = data['search_metadata']['id']
    # retrieve search from the archive - blocker
    print(search_id + ": get search from archive")
    raw_html = search.get_search_archive(search_id, 'html')
    # print(search_id + ": status = " + search_archived['search_metadata']['status'])
    print(raw_html)
def test_get_object(self):
    search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
    r = search.get_object()
    self.assertEqual(type(r.organic_results), list)
    self.assertIsNotNone(r.organic_results[0].title)
    self.assertIsNotNone(r.search_metadata.id)
    self.assertIsNotNone(r.search_metadata.google_url)
    self.assertEqual(r.search_parameters.q, "Coffee")
    self.assertEqual(r.search_parameters.engine, "google")
    self.assertGreater(r.search_information.total_results, 10)
def retrieve_paper(doi):
    """query google scholar api for the article"""
    params = {"engine": "google_scholar", "q": doi, "api_key": api_key}
    search = GoogleSearch(params)
    results = search.get_dict()
    # now we need to parse through the huge json returned
    # to actually find the pdf link
    pdflink = results["organic_results"][0]["resources"][0]["link"]
    return pdflink
def test_search_google_images(self):
    search = GoogleSearch({"q": "coffe", "tbm": "isch"})
    for image_result in search.get_json()['images_results']:
        try:
            link = image_result["original"]
            print("link is found: " + link)
            # uncomment the line below to download the original image
            # wget.download(link, '.')
        except KeyError:
            print("link is not found.")
def search_request(query):
    params = {
        "q": query,
        "tbm": "isch",
        "ijn": 0,
        "api_key": API_KEY,
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results['images_results']
def test_search_google_shopping(self):
    search = GoogleSearch({
        "q": "coffe",  # search query
        "tbm": "shop",  # shopping results
        "tbs": "p_ord:rv",  # sort by review
        "num": 100
    })
    data = search.get_json()
    for shopping_result in data['shopping_results']:
        print(
            str(shopping_result['position']) + " - " +
            shopping_result['title'])
def test_search_google_news(self):
    search = GoogleSearch({
        "q": "coffe",  # search query
        "tbm": "nws",  # news
        "tbs": "qdr:d",  # last 24h
        "num": 10
    })
    for offset in [0, 1, 2]:
        search.params_dict["start"] = offset * 10
        data = search.get_json()
        for news_result in data['news_results']:
            print(
                str(news_result['position'] + offset * 10) + " - " +
                news_result['title'])
def search_async(q_list):
    search_queue = Queue()
    search = build_search(is_async=True)
    show_msg = False
    # loop through queries
    for q in q_list:
        search.params_dict["q"] = q
        data = search.get_dict()
        # add search to the search_queue
        search_queue.put(data)
        if show_msg:
            print("execute async search: q = " + q)
            print("add search to the queue where id: " +
                  data['search_metadata']['id'])
            print("wait until all search statuses are cached or success")
    # Create regular search
    search = GoogleSearch({"async": True})
    while not search_queue.empty():
        data = search_queue.get()
        search_id = data['search_metadata']['id']
        # retrieve search from the archive - blocker
        search_archived = search.get_search_archive(search_id)
        if show_msg:
            print(search_id + ": get search from archive")
            print(search_id + ": status = " +
                  search_archived['search_metadata']['status'])
        # check status
        if re.search('Cached|Success',
                     search_archived['search_metadata']['status']):
            if show_msg:
                print(search_id + ": search done with q = " +
                      search_archived['search_parameters']['q'])
            # key results by the last 5 characters of the query
            QUERY_RESULT[search_archived['search_parameters']['q'][-5:]] = \
                search_archived["organic_results"]
        else:
            # requeue the search metadata (not the client) so the id can be
            # looked up again on the next pass
            print(search_id + ": requeue search")
            search_queue.put(data)
            # wait 1s
            time.sleep(1)
    # search is over.
    print('all searches completed')
def parseShopping(keyword):
    params = {
        "api_key": "9ef20b0d5060890669f34fae37eeb3fe2d0528f3557f84db54715d7a67373827",
        "engine": "google",
        "q": keyword.replace(" ", "+"),
        "location": "Indonesia",
        "google_domain": "google.co.id",
        "gl": "id",
        "hl": "id",
        "tbm": "shop"
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return jsonify(results)
def test_search_google_shopping(self):
    search = GoogleSearch({
        "q": "coffe",  # search query
        "tbm": "shop",  # shopping results
        "tbs": "p_ord:rv",  # sort by review
        "num": 100
    })
    data = search.get_json()
    if 'shopping_results' in data:
        for shopping_result in data['shopping_results']:
            print(
                str(shopping_result['position']) + " - " +
                shopping_result['title'])
    else:
        print(
            "WARNING: oops shopping_results is missing from search result with tbm=shop"
        )
def parseFood(keyword):
    query = keyword.replace(" ", "+")
    if query.find('recipe') == -1:
        query += '+recipe'
    params = {
        "api_key": "9ef20b0d5060890669f34fae37eeb3fe2d0528f3557f84db54715d7a67373827",
        "engine": "google",
        "q": query,
        "google_domain": "google.com",
        "hl": "id"
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return jsonify(results)
def generateLinks(age, gender, student, salary, city, state, country):
    links = {}
    toSearch = ""
    # note: the state argument is overridden here
    state = "ontario"
    if gender == "M" or gender == "F":
        toSearch = toSearch + gender + " "
    else:
        toSearch = toSearch + "LGBTQ "
    toSearch = toSearch + "scholarship "
    if student == 'true':
        toSearch = toSearch + "student "
    if salary < 48535:
        toSearch = toSearch + "low income "
    elif salary < 97069:
        toSearch = toSearch + "middle income "
    toSearch = toSearch + country
    search = GoogleSearch({
        "q": toSearch,
        "location": city + ',' + state,
        "api_key": "157a826ffcd18b1592accedc793f1059857ee66c91b004dfd295b6a9b28cadfc"
    })
    results = search.get_dict()
    print("-------------------------")
    organic_results = results['organic_results']
    link = "searchLink: " + results['search_metadata']['google_url']
    print("\n\n" + link)
    count = 1
    finalString = ""
    # build a comma-separated string from the top 3 organic result links
    for x in organic_results[:3]:
        finalString = finalString + x["link"] + ","
        count += 1
    return finalString
def test_paginate_page_size(self):
    # to get 4 pages, each containing 20 search results
    start = 0
    end = 80
    page_size = 20
    # basic search parameters
    params = {
        "q": "coca cola",
        "tbm": "nws",
        "api_key": os.getenv("API_KEY"),
        "start": start,
        "end": end,
        "num": page_size
    }
    title = []
    search = GoogleSearch(params)
    # the start, end and num parameters above drive the pagination
    pages = search.pagination()
    page_count = 0
    count = 0
    for page in pages:
        page_count += 1
        # print(f"Current page: {page['serpapi_pagination']['current']}")
        for news_result in page["news_results"]:
            count += 1
            i = 0
            for t in title:
                i += 1
                if t == news_result['title']:
                    print(("%d duplicated title: %s at index: %d" %
                           (count, t, i)))
            # print(f"{count} - title: {news_result['title']}")
            title.append(news_result['title'])
        self.assertEqual(
            count % page_size, 0,
            ("page %s does not contain 20 elements" % page_count))
    # check that the number of pages matches
    self.assertEqual(page_count, 4)
    self.assertEqual(len(title), end, "number of search results")
def build_search(is_async=False):
    params = {
        # "q": question,
        "location": "Austin, TX",
        "device": "desktop",
        "hl": "zh-tw",
        "gl": "tw",
        "safe": "active",  # active, or off
        "num": "10",
        "start": "0",
        "api_key": "ecf62f54c84522b61d763db639364ada706243bdefcd988bfb9f53ab472d4d68",
        # to be matched (use default)
        # "tbm": "nws|isch|shop",
        # to be searched (use default)
        # "tbs": "custom to be search criteria",
        # allow async requests
        "async": is_async,
        # output format
        "output": "json"
    }
    return GoogleSearch(params)
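# How build_search() and search_async() (defined earlier) might fit together -
# a minimal sketch, assuming both live in the same module and that the
# QUERY_RESULT dict which search_async() fills in is defined at module level:
QUERY_RESULT = {}

if __name__ == "__main__":
    # hypothetical queries; real callers pass their own list
    search_async(["coffee", "green tea", "matcha"])
    for key, organic_results in QUERY_RESULT.items():
        print(key, "->", len(organic_results), "organic results")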
def pulling_data(job_titles, cities):
    job_results = []
    for job in job_titles:
        for city in cities:
            params = {
                "engine": "google_jobs",
                "q": f'{job} {city}',
                "hl": "en",
                'num': 100,
                'start': 10,
                "api_key": None,
            }
            # looping through 10 pages
            for x in range(10):
                params['start'] = 10 * x
                search = GoogleSearch(params)
                # json data
                results = search.get_dict()
                # validate_response is either True or False, depending on what
                # jobs_results_validation returns
                validate_response = jobs_results_validation(results)
                print(validate_response)
                # if the jobs_results key is found in the json data, this is True
                # and we enter the if statement. Otherwise continue with the
                # for loop to get more pages
                if validate_response:
                    job_postings = results['jobs_results']
                    print(type(job_postings))
                    # each page returns a list of postings, so loop through each
                    # job posting to check that the columns we want are there
                    for job_post in job_postings:
                        response = columns_validation(job_post)
                        if response:
                            print(job_post)
                            job_results.append(job_post)
                        else:
                            print('response was false')
    # list of dictionaries
    print(len(job_results))
    return job_results
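# The two validators called above are not defined in this snippet. A minimal
# sketch of what they presumably check, assuming REQUIRED_COLUMNS is the set
# of fields the caller wants on every posting (names are hypothetical):
REQUIRED_COLUMNS = {"title", "company_name", "location", "description"}

def jobs_results_validation(results):
    # True when the SerpApi response actually contains job postings
    return "jobs_results" in results

def columns_validation(job_post):
    # True when every required field is present on the posting
    return REQUIRED_COLUMNS.issubset(job_post.keys())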
def get_image_results_for_query(query: str, num_images: int = 100):
    results = []
    for page_num in tqdm(range(math.ceil(num_images / 100))):
        params = {
            "api_key": os.getenv("SERPAPI_KEY"),
            "engine": "google",
            "q": query,
            "google_domain": "google.com",
            "tbs": "il:cl",  # tbs is the licence filter
            "hl": "en",
            "tbm": "isch",
            "ijn": page_num  # ijn is the page number
        }
        search = GoogleSearch(params)
        result = search.get_dict()
        with contextlib.suppress(KeyError):
            results += result['images_results']
    return results
def search(q, linked):
    print("search: %s" % q)
    # run search
    parameter = {"q": q, "api_key": os.getenv("API_KEY")}
    client = GoogleSearch(parameter)
    results = client.get_dict()
    # basic error handling
    if "error" in results:
        print("ERROR: " + results["error"])
        sys.exit(1)
    # analyze results
    queue = []
    if 'knowledge_graph' not in results:
        return queue
    for link in results['knowledge_graph']['people_also_search_for']:
        name = link['name'].replace('.', '')
        if name in linked:
            continue
        linked.append(name)
        queue.append(name)
    return queue
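# A minimal driver sketch for search() above, assuming the goal is to walk the
# "people also search for" graph breadth-first from a seed name (the seed and
# depth here are hypothetical):
linked = []
frontier = ["Ada Lovelace"]
for _ in range(2):  # crawl two levels deep
    next_frontier = []
    for name in frontier:
        next_frontier += search(name, linked)
    frontier = next_frontier
print(linked)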
def reg_scrape(year):
    collected_voters = {}
    search = GoogleSearch({
        "q": f"site:https://michiganvoters.info was born in {year}",
        "location": "Detroit,Michigan",
        "api_key": "GET_A_KEY_FROM_HERE:https://serpapi.com/manage-api-key"
    })
    results = search.get_json()
    google_results = results['organic_results']
    for voter in google_results:
        snippet = voter['snippet']
        name_match = snippet.split(' was born in ')
        birth_year = name_match[1].split(' and')[0]
        full_name = name_match[0].split(', ')
        first_name = full_name[1]
        last_name = full_name[0]
        # default when no zip code is found in the snippet
        zipcode = None
        zip_match = re.search(zip_regex, snippet, re.MULTILINE)
        if zip_match is not None:
            zipstr = str(zip_match.group(0))
            zipcode = zipstr.strip(' U')
        if ' ' in first_name:
            first_name = first_name.split(' ')[1]
        collected_voters[f"{last_name}_{first_name}"] = {
            'first': first_name,
            'last': last_name,
            'zipcode': zipcode,
            'birth_year': birth_year
        }
    return collected_voters
def make_url_request_using_cache(job_query):
    try:
        results = []
        # search up to 110 results (11 pages of 10) from the API
        for offset in range(0, 110, 10):
            params = {
                "engine": "google_jobs",
                "q": job_query,
                "hl": "en",
                "api_key": "a463df1e2c78e577d9220ceeba3d0f6cc418db1a445ed7520d0fc6b0c62ab95a",
                "start": offset
            }
            client = GoogleSearch(params)
            result = client.get_dict()
            result = result['jobs_results']
            for job in result:
                dic = {}
                dic['title'] = job['title']
                dic['company_name'] = job['company_name']
                dic['location'] = job['location']
                results.append(dic)
        return results
    except Exception:
        # if the search fails to finish, return False
        return False