Example no. 1
def get_results(item, location):
    params = {
        "q": item,
        "tbm": "shop",
        "location": location,
        "hl": "en",
        "gl": "us",
        "api_key":
        "286dc1ea151c8c789b1babc2c6e89694919c91e5edb1908278d4c771c5fdcf68",
        "num": 30
    }

    client = GoogleSearch(params)
    results = client.get_dict()
    results = results["shopping_results"]

    item_list = []

    for result in results:
        item_list.append(
            ItemData(result.get("title"), result.get("link"),
                     result.get("price"), result.get("snippet"),
                     result.get("source")))

    return item_list
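
The snippet above assumes GoogleSearch from the serpapi package and a project-specific ItemData container. A minimal sketch of those assumptions (the field names are hypothetical, inferred from the constructor call):

from dataclasses import dataclass
from serpapi import GoogleSearch  # pip install google-search-results

@dataclass
class ItemData:
    title: str
    link: str
    price: str
    snippet: str
    source: str

# hypothetical call
# items = get_results("espresso machine", "Austin, Texas")
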
 def test_paginate(self):
     # to get 2 pages
     start = 0
     end = 20
     # basic search parameters
     params = {
         "q": "coca cola",
         "tbm": "nws",
         "api_key": os.getenv("API_KEY"),
         "start": start,
         "end": end
     }
     # as proof of concept
     #  urls collects
     urls = []
     # initialize a search
     search = GoogleSearch(params)
     # create a python generator
     pages = search.pagination()
     # fetch one search result per iteration
     #  using a basic python for loop
     #   which invokes python iterator under the hood.
     for page in pages:
         print(f"Current page: {page['serpapi_pagination']['current']}")
         for news_result in page["news_results"]:
             print(
                 f"Title: {news_result['title']}\nLink: {news_result['link']}\n"
             )
             urls.append(news_result['link'])
     # double-check that things add up:
     # total number of results expected
     #  the exact number varies depending on the search engine backend
     self.assertGreater(len(urls), 200)
def googlesearch(name, location=False):
    """
    Perform Google Search lookup.
    """

    # The base plan for SerpAPI is rate limited to 1k calls per hour.
    # We intentionally slow this down to avoid hitting the rate limit.
    if not serp_api_fast:
        sleep(2.5)

    if not location:
        client = GoogleSearch({"q": name, "api_key": serp_api_key})
    else:
        client = GoogleSearch({
            "q": name,
            "location": location,
            "api_key": serp_api_key
        })

    result = client.get_json()
    try:
        domain = result['organic_results'][0]['link']
        tldr = tldextract.extract(domain)
        return '{}.{}'.format(tldr.domain, tldr.suffix)
    except KeyError:
        print("Unable to lookup record from SerpAPI.")
    return
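
A hedged usage sketch for googlesearch above, assuming serp_api_key and serp_api_fast are already configured in the module:

# hypothetical calls; the queries are placeholders
# domain = googlesearch("Example Company")                   # e.g. "example.com", or None on failure
# domain = googlesearch("Example Company", "Austin, Texas")  # location-scoped variant
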
 def test_get_json(self):
     search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
     data = search.get_json()
     self.assertEqual(data["search_metadata"]["status"], "Success")
     self.assertIsNone(data.get("error"))
     self.assertIsNotNone(data["search_metadata"]["google_url"])
     self.assertIsNotNone(data["search_metadata"]["id"])
     self.assertIsNotNone(data['local_results']['places'][0])
Example no. 5
def print_form():
    global CONTEXT
    if request.method == "GET":
        # serve HTML page
        return render_template("index.html")
    else:
        # handle text from submitted form
        CONTEXT["photos"].clear()
        text_book = request.json["text_book"]
        slider_val = int(request.json["slider_val"])
        # split text into sections
        text_book_sentences = text_book.split('.')
        text_book_sentences = text_book_sentences[:-1]  # drop the trailing empty string (after the last sentence)
        num_sentences = len(text_book_sentences)
        text_book_sections = []
        for idx in range(0, num_sentences, slider_val):
            if idx + slider_val < num_sentences:
                text_book_sections.append(". ".join(
                    text_book_sentences[idx:(idx + slider_val)]))
            else:
                text_book_sections.append(". ".join(text_book_sentences[idx:]))
        # summarize each sentence
        url = "https://textanalysis-text-summarization.p.rapidapi.com/text-summarizer"
        summaries = []
        for section in text_book_sections:
            payload = {"url": "", "text": section, "sentnum": 1}
            headers = {
                'content-type': "application/json",
                'x-rapidapi-key':
                "3370a90c6bmsh4469eda97977206p1dbffdjsne99d3fc5a7b0",
                'x-rapidapi-host':
                "textanalysis-text-summarization.p.rapidapi.com"
            }
            summary = json.loads(
                requests.request("POST",
                                 url,
                                 data=json.dumps(payload),
                                 headers=headers).text)
            summaries.append(summary["sentences"][0])
            print(summary["sentences"])
        # perform image lookup
        for idx, summary in enumerate(summaries):
            # make call to image API
            params = {
                "q": summary,
                "tbm": "isch",
                "ijn": "0",
                "api_key": NEW_API_KEY
            }
            search = GoogleSearch(params)
            results = search.get_dict()
            images_results = results['images_results']
            if images_results and ("original" in images_results[0]):
                link = images_results[0]["original"]
                print(link)
                CONTEXT["photos"][text_book_sections[idx]] = link
        return redirect(url_for('view_results'))
Example no. 6
 def test_get_json(self):
     search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
     data = search.get_json()
     self.assertEqual(data["search_metadata"]["status"], "Success")
     self.assertIsNotNone(data["search_metadata"]["google_url"])
     self.assertIsNotNone(data["search_metadata"]["id"])
     # pp = pprint.PrettyPrinter(indent=2)
     # pp.pprint(data['local_results'])
     self.assertIsNotNone(data['local_results']['places'][0])
Example no. 7
def retrieve_paper(doi):
    """query google scholar api for the article"""
    params = {"engine": "google_scholar", "q": doi, "api_key": api_key}
    search = GoogleSearch(params)
    results = search.get_dict()

    # now we need to parse through the huge json returned
    # to actually find the pdf link
    pdflink = results["organic_results"][0]["resources"][0]["link"]
    return pdflink
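
The hard-coded indexing above assumes the first organic result always exposes a resources entry with a PDF link. A slightly more defensive variant under the same assumptions about key names (a sketch, not part of the original project):

def retrieve_paper_safe(doi):
    """Like retrieve_paper, but returns None when no PDF resource is exposed."""
    params = {"engine": "google_scholar", "q": doi, "api_key": api_key}
    results = GoogleSearch(params).get_dict()
    try:
        return results["organic_results"][0]["resources"][0]["link"]
    except (KeyError, IndexError):
        return None
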
 def test_get_object(self):
     search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
     r = search.get_object()
     self.assertEqual(type(r.organic_results), list)
     self.assertIsNotNone(r.organic_results[0].title)
     self.assertIsNotNone(r.search_metadata.id)
     self.assertIsNotNone(r.search_metadata.google_url)
     self.assertEqual(r.search_parameters.q, "Coffee")
     self.assertEqual(r.search_parameters.engine, "google")
     self.assertGreater(r.search_information.total_results, 10)
Example no. 9
 def test_get_json(self):
     search = GoogleSearch({"q": "Coffee", "engine": "google_scholar"})
     data = search.get_json()
     print(data['search_metadata'])
     search_id = data['search_metadata']['id']
     # retrieve search from the archive - blocker
     print(search_id + ": get search from archive")
     raw_html = search.get_search_archive(search_id, 'html')
     # print(search_id + ": status = " + search_archived['search_metadata']['status'])
     print(raw_html)
 def test_paginate(self):
     search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
     pages = search.pagination(0, 20, 10)
     urls = []
     for page in pages:
         urls.append(page['serpapi_pagination']['next'])
     self.assertEqual(len(urls), 2)
     self.assertTrue("start=10" in urls[0])
     print(urls[1])
     self.assertTrue("start=21" in urls[1])
def search_request(query):
    params = {
        "q": query,
        "tbm": "isch",
        "ijn": 0,
        "api_key": API_KEY,
        
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results['images_results']
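
A hedged usage sketch for search_request, collecting only the original-size image URLs; it mirrors the pattern used in the image-search test in Example no. 12 below, and the helper name is hypothetical:

def original_links(query):
    # keep only results that expose the full-resolution image URL
    return [r["original"] for r in search_request(query) if "original" in r]
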
Example no. 12
 def test_search_google_images(self):
     search = GoogleSearch({"q": "coffe", "tbm": "isch"})
     for image_result in search.get_json()['images_results']:
         try:
             link = image_result["original"]
             print("link is found: " + link)
             # uncomment the line below to download the original image
             # wget.download(link, '.')
         except KeyError:
             print("link is not found.")
Example no. 13
 def test_search_google_shopping(self):
     search = GoogleSearch({
         "q": "coffe",  # search search
         "tbm": "shop",  # news
         "tbs": "p_ord:rv",  # last 24h
         "num": 100
     })
     data = search.get_json()
     for shopping_result in data['shopping_results']:
         print(
             str(shopping_result['position']) + " - " +
             shopping_result['title'])
Example no. 14
 def test_search_by_location(self):
     for city in ["new york", "paris", "berlin"]:
         location = GoogleSearch({}).get_location(city,
                                                  1)[0]["canonical_name"]
         search = GoogleSearch({
             "q": "best coffee shop",  # search search
             "location": location,
             "num": 10,
             "start": 0
         })
         data = search.get_json()
         top_result = data['organic_results'][0]["title"]
         print("top coffee result for " + location + " is: " + top_result)
Example no. 15
 def test_search_google_news(self):
     search = GoogleSearch({
         "q": "coffe",  # search search
         "tbm": "nws",  # news
         "tbs": "qdr:d",  # last 24h
         "num": 10
     })
     for offset in [0, 1, 2]:
         search.params_dict["start"] = offset * 10
         data = search.get_json()
         for news_result in data['news_results']:
             print(
                 str(news_result['position'] + offset * 10) + " - " +
                 news_result['title'])
Example no. 16
def search_async(q_list):
    search_queue = Queue()
    search = build_search(is_async=True)
    show_msg = False

    # loop through companies
    for q in q_list:
        search.params_dict["q"] = q
        data = search.get_dict()

        # add search to the search_queue
        search_queue.put(data)

        if show_msg:
            print("execute async search: q = " + q)
            print("add search to the queue where id: " +
                  data['search_metadata']['id'])
    print("wait until all search statuses are cached or success")
    # Create regular search
    search = GoogleSearch({"async": True})
    while not search_queue.empty():
        data = search_queue.get()
        search_id = data['search_metadata']['id']

        # retrieve search from the archive - blocker
        search_archived = search.get_search_archive(search_id)
        if show_msg:
            print(search_id + ": get search from archive")
            print(search_id + ": status = " +
                  search_archived['search_metadata']['status'])

        # check status
        if re.search('Cached|Success',
                     search_archived['search_metadata']['status']):
            if show_msg:
                print(search_id + ": search done with q = " +
                      search_archived['search_parameters']['q'])
            QUERY_RESULT[search_archived['search_parameters']['q']
                         [-5:]] = search_archived["organic_results"]
        else:
            # requeue the search data so its status is checked again
            print(search_id + ": requeue search")
            search_queue.put(data)
            # wait 1s
            time.sleep(1)
    # search is over.
    print('all searches completed')
Example no. 17
def parseShopping(keyword):
    params = {
        "api_key":
        "9ef20b0d5060890669f34fae37eeb3fe2d0528f3557f84db54715d7a67373827",
        "engine": "google",
        "q": keyword.replace(" ", "+"),
        "location": "Indonesia",
        "google_domain": "google.co.id",
        "gl": "id",
        "hl": "id",
        "tbm": "shop"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    return jsonify(results)
Example no. 18
def parseFood(keyword):
    query = keyword.replace(" ", "+")
    if query.find('recipe') == -1:
        query += '+recipe'
    params = {
        "api_key":
        "9ef20b0d5060890669f34fae37eeb3fe2d0528f3557f84db54715d7a67373827",
        "engine": "google",
        "q": query,
        "google_domain": "google.com",
        "hl": "id"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    return jsonify(results)
Example no. 19
 def test_search_google_shopping(self):
     search = GoogleSearch({
         "q": "coffe",  # search search
         "tbm": "shop",  # news
         "tbs": "p_ord:rv",  # last 24h
         "num": 100
     })
     data = search.get_json()
     if 'shopping_results' in data:
         for shopping_result in data['shopping_results']:
             print(
                 str(shopping_result['position']) + " - " +
                 shopping_result['title'])
     else:
         print(
             "WARNING: oops shopping_results is missing from search result with tbm=shop"
         )
Example no. 20
def generateLinks(age, gender, student, salary, city, state, country):
    links = {}

    toSearch = ""

    state = "ontario"

    if gender == "M" or gender == "F":
        toSearch = toSearch + gender + " "
    else:
        toSearch = toSearch + "LGBTQ "

    toSearch = toSearch + "scholarship "

    if student == 'true':
        toSearch = toSearch + "student "

    if salary < 48535:
        toSearch = toSearch + "low income "
    elif salary < 97069:
        toSearch = toSearch + "middle income "

    toSearch = toSearch + country

    search = GoogleSearch({
        "q": toSearch,
        "location": city + ',' + state,
        "api_key": "157a826ffcd18b1592accedc793f1059857ee66c91b004dfd295b6a9b28cadfc"
    })
    results = search.get_dict()
    print("-------------------------")
    organic_results = results['organic_results']
    link = "searchLink: " + results['search_metadata']['google_url']

    print("\n\n" + link)
    count = 1
    finalString = ""
    for x in organic_results[:3]:
        finalString = finalString + x["link"] + ","
        count += 1

    return finalString
Example no. 21
    def test_paginate_page_size(self):
        # to get 4 pages, each containing 20 search results
        start = 0
        end = 80
        page_size = 20

        # search parameters, including the pagination bounds
        params = {
            "q": "coca cola",
            "tbm": "nws",
            "api_key": os.getenv("API_KEY"),
            "start": start,
            "end": end,
            "num": page_size
        }
        title = []
        search = GoogleSearch(params)
        # the start, end, and num parameters override the pagination defaults
        pages = search.pagination()
        page_count = 0
        count = 0
        for page in pages:
            page_count += 1
            # print(f"Current page: {page['serpapi_pagination']['current']}")
            for news_result in page["news_results"]:
                count += 1
                i = 0
                for t in title:
                    i += 1
                    if t == news_result['title']:
                        print(("%d duplicated title: %s at index: %d" %
                               (count, t, i)))
                #print(f"{count} - title: {news_result['title']}")
                title.append(news_result['title'])

            self.assertEqual(
                count % page_size, 0,
                ("page %s does not contain 20 elements" % page_count))

        # check number of pages match
        self.assertEqual(page_count, 4)
        self.assertEqual(len(title), end, "number of search results")
Example no. 22
 def test_get_search_archive(self):
     search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
     search_result = search.get_dictionary()
     search_id = search_result.get("search_metadata").get("id")
     archived_search_result = GoogleSearch({}).get_search_archive(
         search_id, 'json')
     self.assertEqual(
         archived_search_result.get("search_metadata").get("id"), search_id)
     html_buffer = GoogleSearch({}).get_search_archive(search_id, 'html')
     self.assertGreater(len(html_buffer), 10000)
Example no. 23
def pulling_data(job_titles, cities):
    job_results = []
    for job in job_titles:
        for city in cities:
            params = {
                "engine": "google_jobs",
                "q": f'{job} {city}',
                "hl": "en",
                'num': 100,
                'start': 10,
                "api_key": None,
            }

            #looping through 10 pages
            for x in range(10):
                params['start'] = 10 * x
                search = GoogleSearch(params)
                #json data
                results = search.get_dict()

                #validate_response is either True or False, depending on what jobs_results_validation returns
                validate_response = jobs_results_validation(results)
                print(validate_response)
                #if the jobs_results key is found in the json data, this returns True and we enter the if statement. Otherwise, continue with the for loop to get more pages
                if validate_response:
                    job_postings = results['jobs_results']
                    print(type(job_postings))
                    #each page gives us a list of job postings, so we loop through each one to check that the columns we want are there
                    for job_post in job_postings:
                        response = columns_validation(job_post)
                        if response:
                            print(job_post)
                            job_results.append(job_post)
                        else:
                            print('response was false')

    #list of dictionaries
    print(len(job_results))
    return job_results
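
jobs_results_validation and columns_validation are not shown in the snippet; minimal hedged stand-ins consistent with how they are called above (the bodies are assumptions, not the original helpers):

def jobs_results_validation(results):
    # True when the SerpApi response actually contains job postings
    return 'jobs_results' in results

def columns_validation(job_post):
    # True when the columns we want to keep are present
    return all(key in job_post for key in ('title', 'company_name', 'location'))
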
Example no. 24
def get_image_results_for_query(query: str, num_images: int = 100):
    results = []

    for page_num in tqdm(range(math.ceil(num_images / 100))):
        params = {
            "api_key": os.getenv("SERPAPI_KEY"),
            "engine": "google",
            "q": query,
            "google_domain": "google.com",
            "tbs": "il:cl",
            "hl": "en",
            "tbm": "isch",
            "ijn": page_num
        }

        # tbs is licence, ijn is page
        search = GoogleSearch(params)
        result = search.get_dict()
        with contextlib.suppress(KeyError):
            results += result['images_results']

    return results
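
A hedged usage note for get_image_results_for_query: ijn selects the result page and Google Images returns roughly 100 images per page, so num_images is effectively rounded up to whole pages.

# sketch: requesting ~150 images fetches two pages (ijn=0 and ijn=1)
# images = get_image_results_for_query("latte art", num_images=150)
# print(len(images))
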
def search(q, linked):
    print("search: %s" % q)
    # run search
    parameter = {"q": q, "api_key": os.getenv("API_KEY")}
    client = GoogleSearch(parameter)
    results = client.get_dict()

    # basic error handling
    if "error" in results:
        print("ERROR: " + results["error"])
        sys.exit(1)

    # analyze results
    queue = []
    if 'knowledge_graph' not in results:
        return queue
    for link in results['knowledge_graph']['people_also_search_for']:
        name = link['name'].replace('.', '')
        if name in linked:
            continue
        linked.append(name)
        queue.append(name)
    return queue
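
A hedged sketch of how search above could drive a breadth-first expansion over the knowledge graph "people also search for" links, assuming API_KEY is set; the seed name and depth are hypothetical:

linked = []
frontier = search("Ada Lovelace", linked)  # hypothetical seed
for _ in range(2):  # limit the crawl depth
    frontier = [n for name in frontier for n in search(name, linked)]
print(len(linked), "names collected")
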
Example no. 26
def reg_scrape(year):
    collected_voters = {}
    search = GoogleSearch({
    "q": f"site:https://michiganvoters.info was born in {year}", 
    "location": "Detroit,Michigan",
    "api_key": "GET_A_KEY_FROM_HERE:https://serpapi.com/manage-api-key"
    })
    results = search.get_json()
    google_results = results['organic_results']
    for voter in google_results:
        snippet = voter['snippet']
        name_match = snippet.split(' was born in ')
        birth_year = name_match[1].split(' and')[0]
        full_name = name_match[0].split(', ')
        first_name = full_name[1]
        last_name = full_name[0]
        zip_match = re.search(zip_regex, snippet, re.MULTILINE)
        if zip_match is not None:
            zipstr = str(zip_match.group(0))
            zipcode = zipstr.strip(' U')
            if ' ' in first_name:
                first_name = first_name.split(' ')[1]
            collected_voters[f"{last_name}_{first_name}"] = {'first': first_name, 'last': last_name, 'zipcode':zipcode, 'birth_year': birth_year}
    return collected_voters
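
zip_regex is referenced but not defined in the snippet; a plausible stand-in (an assumption, the original pattern is not shown) would be a five-digit US ZIP code match:

import re

zip_regex = r"\b\d{5}\b"  # assumption: the original pattern is not included in the snippet
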
Example no. 27
def make_url_request_using_cache(job_query):
    try:
        results = []
        for i in range(0, 110, 10):  #search about 110 results from the API (11 pages of 10)
            params = {
                "engine": "google_jobs",
                "q": job_query,
                "hl": "en",
                "api_key":
                "a463df1e2c78e577d9220ceeba3d0f6cc418db1a445ed7520d0fc6b0c62ab95a",
                "start": i
            }
            client = GoogleSearch(params)
            result = client.get_dict()
            result = result['jobs_results']
            for job in result:
                dic = {}
                dic['title'] = job['title']
                dic['company_name'] = job['company_name']
                dic['location'] = job['location']
                results.append(dic)
        return results
    except Exception:
        return False  #if the search fails to finish, return False
def test_async():
    # store searches
    search_queue = Queue()

    # Serp API search
    search = GoogleSearch({"location": "Austin,Texas", "async": True})

    json_q = load_json("./dataset/Questions_with_Ans.json")
    # json_q = load_json("./dataset/question.json")

    ll = list(map(lambda x: x["Question"], json_q))

    # loop through companies
    for company in ll:
        print("execute async search: q = " + company)
        search.params_dict["q"] = company
        data = search.get_dict()
        print("add search to the queue where id: " +
              data['search_metadata']['id'])
        # add search to the search_queue
        search_queue.put(data)

    print("wait until all search statuses are cached or success")

    # Create regular search
    search = GoogleSearch({"async": True})
    while not search_queue.empty():
        data = search_queue.get()
        search_id = data['search_metadata']['id']

        # retrieve search from the archive - blocker
        print(search_id + ": get search from archive")
        search_archived = search.get_search_archive(search_id)
        print(search_id + ": status = " +
              search_archived['search_metadata']['status'])

        # check status
        if re.search('Cached|Success',
                     search_archived['search_metadata']['status']):
            print(search_id + ": search done with q = " +
                  search_archived['search_parameters']['q'])
            print(search_archived["organic_results"])
        else:
            # requeue the search data so its status is checked again
            print(search_id + ": requeue search")
            search_queue.put(data)
            # wait 1s
            time.sleep(1)
    # search is over.
    print('all searches completed')
Example no. 29
    def test_async(self):
        # store searches
        search_queue = Queue()

        # Serp API search
        search = GoogleSearch({"location": "Austin,Texas", "async": True})

        # loop through companies
        for company in ['amd', 'nvidia', 'intel']:
            print("execute async search: q = " + company)
            search.params_dict["q"] = company
            data = search.get_dict()
            if data is None:
                print("oops data is empty for: " + company)
                continue
            print("add search to the queue where id: " +
                  data['search_metadata']['id'])
            # add search to the search_queue
            search_queue.put(data)

        print("wait until all search statuses are cached or success")

        # Create regular search
        search = GoogleSearch({"async": True})
        while not search_queue.empty():
            data = search_queue.get()
            search_id = data['search_metadata']['id']

            # retrieve search from the archive - blocker
            print(search_id + ": get search from archive")
            search_archived = search.get_search_archive(search_id)
            print(search_id + ": status = " +
                  search_archived['search_metadata']['status'])

            # check status
            if re.search('Cached|Success',
                         search_archived['search_metadata']['status']):
                print(search_id + ": search done with q = " +
                      search_archived['search_parameters']['q'])
            else:
                # requeue the search data so its status is checked again
                print(search_id + ": requeue search")
                search_queue.put(data)
                # wait 1s
                time.sleep(1)
        # search is over.
        print('all searches completed')
Example no. 30
def build_search(is_async=False):
    params = {
        # "q": question,
        "location": "Austin, TX",
        "device": "desktop",
        "hl": "zh-tw",
        "gl": "tw",
        "safe": "active",  # active, or off
        "num": "10",
        "start": "0",
        "api_key":
        "ecf62f54c84522b61d763db639364ada706243bdefcd988bfb9f53ab472d4d68",
        # result type to match
        # "tbm": "nws|isch|shop", # use default
        # search filter criteria
        # "tbs": "custom search filter", # use default
        # allow async request
        "async": is_async,
        # output format
        "output": "json"
    }
    return GoogleSearch(params)
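
build_search is the helper consumed by search_async in Example no. 16 above; a minimal synchronous usage sketch (the query string is hypothetical):

search = build_search()                            # is_async defaults to False
search.params_dict["q"] = "best coffee in Austin"  # hypothetical query
results = search.get_dict()
print(results.get("search_metadata", {}).get("status"))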