Ejemplo n.º 1
0
def Search(paper):
    """Return the trailing query-parameter value of the top IEEE Xplore
    Google result for `paper`.

    NOTE(review): assumes the top URL ends in '...=<artNumber>' — confirm
    against the URLs ieeexplore actually serves.
    """
    gs = GoogleSearch(paper + " ieeexplore.ieee.org")
    url = gs.top_urls()[0]
    # Idiom fix: the original wrapped split() in a no-op list comprehension.
    artNumber = url.split("=")[-1]
    return artNumber
Ejemplo n.º 2
0
def main():
    """Crawl Google results for a fixed set of search words and dump the
    harvested page texts to a JSON file.

    NOTE(review): `TextCrawler`, `GoogleSearch`, `normalize`,
    `search_result_json`, `hashlib` and `json` must be defined/imported
    elsewhere in this module.
    """
    text_crawler = TextCrawler()
    google_search = GoogleSearch()

    search_words = ["コロナ ビール", "コロナ ウイルス", "コロナ 太陽"]

    search_result = []
    for word in search_words:
        # BUG FIX: the original printed a plain string literal
        # ("start 「{word}」") — the f-prefix was missing.
        print(f"start 「{word}」")
        word_hash = hashlib.sha256(word.encode("utf-8")).hexdigest()

        links_results = []
        links = google_search.search_links(word, num=40)
        for link in links:
            print(f"getting ... {link}")

            try:
                text = text_crawler.get_text(link)
                text = normalize(text)

                links_results.append(
                    {"url": link, "success": 1, "text": text, "reason": None}
                )
            except Exception as e:
                # BUG FIX: store str(e); an Exception instance is not
                # JSON-serializable and would crash json.dumps below.
                links_results.append(
                    {"url": link, "success": 0, "text": None, "reason": str(e)}
                )

        search_result.append({"word": word, "hash": word_hash, "links": links_results})

    with open(search_result_json, "w") as f:
        f.write(json.dumps(search_result, indent=4, ensure_ascii=False))

    print("finish to output search result.")
 def googleSearch(self):
     """Return the top result URLs for self.query.

     Query can be site specific -> site:link word
     """
     google = GoogleSearch(self.query)
     # BUG FIX: reuse the existing search object instead of issuing a
     # second identical search just to get the count.
     ct = google.count()
     # BUG FIX: the print statement is a syntax error under Python 3.
     print("No. Of Google Results: %d" % ct)
     results = google.top_urls()
     return results
Ejemplo n.º 4
0
def Search(paper):
    """Look up `paper` on IEEE Xplore via Google and return the article
    number taken from the top result URL.

    NOTE(review): assumes the URL's last '='-separated token is the
    article number — verify for current ieeexplore URL formats.
    """
    gs = GoogleSearch(paper + " ieeexplore.ieee.org")
    top = gs.top_urls()[0]
    # Idiom fix: drop the redundant identity list comprehension around
    # split() that the original used.
    return top.split("=")[-1]
Ejemplo n.º 5
0
def get_n_results(query, n):
    """Print a list of n hits from Google."""
    from pprint import pprint
    gs = GoogleSearch(query)
    for hit in gs.get_results(n):
        pprint(hit)
        # BUG FIX: a bare `print` statement is Python 2 syntax (a syntax
        # error under Python 3); print() emits the same blank line.
        print()
Ejemplo n.º 6
0
def retrieve_data(query, number_results, filter=0):
    '''(str, int, int) -> str
    Fetch Google search results for `query` and return the concatenated
    text of every result page.

    filter == 0: restrict to news.google.com with a daterange of today.
    filter == 1: plain query with a daterange of today.
    otherwise : plain query, no date restriction.

    NOTE(review): the parameter name `filter` shadows the builtin but is
    kept for backward compatibility with existing callers.
    '''
    text_data = ""
    today = datetime.date.today()
    # BUG FIX: the original built the date from year + day only, silently
    # omitting the month.  NOTE(review): Google's `daterange:` operator
    # historically expects Julian day numbers — confirm the format the
    # crawler backend actually accepts.
    today_to_str = str(today.year) + str(today.month) + str(today.day)
    if filter == 0:
        crawler = GoogleSearch().search("site:news.google.com " + query +
                                        " daterange:" + today_to_str,
                                        num_results=number_results)
        print("Scanning google news for: " + query + " today is " +
              today_to_str)
    elif filter == 1:
        crawler = GoogleSearch().search(query + " daterange:" + today_to_str +
                                        "-" + today_to_str,
                                        num_results=number_results)
        print("You searched: " + query + " daterange:" + today_to_str)
    else:
        crawler = GoogleSearch().search(query, num_results=number_results)
    # Removed the unused `progress`/`progress_time` counters the original
    # computed but never read.
    for url in crawler.results:
        print("visiting: ")
        text_data += " " + url.getText()
    return text_data
Ejemplo n.º 7
0
def print_top_results(query):
    """Print a list of top hits for a query.
    Like a mini returned first page on Google."""
    from pprint import pprint
    gs = GoogleSearch(query)
    for hit in gs.top_results():
        pprint(hit)
        # BUG FIX: bare `print` is Python 2 syntax; print() emits the
        # same blank separator line.
        print()
Ejemplo n.º 8
0
def print_top_results(query):
    """Print a list of top hits for a query.
    Like a mini returned first page on Google."""
    from pprint import pprint
    gs = GoogleSearch(query)
    for hit in gs.top_results():
        pprint(hit)
        # BUG FIX: bare `print` is Python 2 syntax; print() emits the
        # same blank separator line.
        print()
Ejemplo n.º 9
0
def getGoogleContent(query):
    """Return the URLs of the top Google results for `query`."""
    gs = GoogleSearch(query)
    return [hit['url'] for hit in gs.top_results()]
Ejemplo n.º 10
0
def print_top_results_in_spanish(query):
    """Print a list of top hits for a query.
    Like a mini returned first page on Google.
    Language setted in parameter hl."""
    from pprint import pprint
    gs = GoogleSearch(query, hl='es')
    for hit in gs.top_results():
        pprint(hit)
        # BUG FIX: bare `print` is Python 2 syntax; print() emits the
        # same blank separator line.
        print()
Ejemplo n.º 11
0
def print_top_results_in_spanish(query):
    """Print a list of top hits for a query.
    Like a mini returned first page on Google.
    Language setted in parameter hl."""
    from pprint import pprint
    gs = GoogleSearch(query, hl='es')
    for hit in gs.top_results():
        pprint(hit)
        # BUG FIX: bare `print` is Python 2 syntax; print() emits the
        # same blank separator line.
        print()
Ejemplo n.º 12
0
def x_vs_y_count_match(x, y):
    """Which of the two words is used more
    on the Internet?

    Returns (count_x, count_y) of Google hit counts.
    """
    nx = GoogleSearch(x).count()
    ny = GoogleSearch(y).count()
    # BUG FIX: the Python 2 print statements below were syntax errors
    # under Python 3; converted to the print function.
    print('%s vs %s:' % (x, y))
    report = '%s wins with %i vs %i'
    if nx > ny:
        print(report % (x, nx, ny))
    elif nx < ny:
        print(report % (y, ny, nx))
    else:
        print("it's a tie with %s each!" % nx)
    return nx, ny
Ejemplo n.º 13
0
 def imdb_suggest(self, query):
     """Suggest IMDb titles for `query` via a site-restricted Google search.

     Returns a list of {"id", "title"} dicts for every result URL that
     matches self.imdb_pattern.
     """
     excludes = []  # optional page sections to exclude; currently none
     google_query = ' '.join([query, ' '.join(['-"%s"' % e for e in excludes]), 'site:imdb.com/title/'])
     hits = GoogleSearch(google_query, use_proxy=False).top_results()
     pairs = ((self.imdb_pattern.match(h["url"]), h) for h in hits)
     suggestions = [
         {"id": m.group(1), "title": h["titleNoFormatting"].replace(' - IMDb', '')}
         for m, h in pairs if m
     ]
     logging.debug("google found %d results with query: %s" % (len(suggestions), google_query))
     return suggestions
Ejemplo n.º 14
0
def get_google_result(query):
    '''Print the deduplicated leading title fragments of the top Google
    results for `query` and run each fragment through get_wiki_result.

    NOTE(review): despite the original docstring, nothing is returned.
    '''
    search_set = set()
    gs = GoogleSearch(query)
    for hit in gs.top_results():
        result = hit['titleNoFormatting']
        # BUG FIX: in the original, `que` was unbound (NameError) — or a
        # stale value from a previous iteration — whenever a title
        # contained neither '|' nor '-'.  Such titles are now skipped.
        que = None
        if '|' in result:
            que = result.split('|')
        if '-' in result:
            que = result.split('-')
        if que is not None:
            search_set.add(que[0])
    # BUG FIX: Python 2 print statements converted to the print function.
    print(search_set)
    for search in search_set:
        get_wiki_result(search)

    print('*'*20+'Google End'+'*'*20)
Ejemplo n.º 15
0
def imdb_id_for_movie(movie_name):
    """Retrieve the imdb id for a movie
    from the name (and year if there are remakes).

    Raises:
        ValueError: when the top Google hit's URL has no /ttNNN/ segment.
    """
    import re
    query = 'site:imdb.com %s' % movie_name
    url = GoogleSearch(query).top_url()
    match = re.search('/tt[0-9]+/', url)
    if match is None:
        # BUG FIX: the original crashed with AttributeError on a URL
        # without an id segment; fail with an explicit message instead.
        raise ValueError('no imdb id found in url: %r' % url)
    imdb_id = match.group(0).strip('/')
    # BUG FIX: the print statement is a syntax error under Python 3.
    print('The imdb id for %s is %s' % (movie_name, imdb_id))
    return imdb_id
Ejemplo n.º 16
0
    def __findUrban__(self, phrase):
        """Return Urban Dictionary entry titles matching `phrase`.

        Searches Google restricted to urbandictionary.com and strips the
        'Urban Dictionary: ' prefix from every result title.
        """
        gs = GoogleSearch(phrase + " site:urbandictionary.com")
        return [
            hit["titleNoFormatting"].replace("Urban Dictionary: ", "")
            for hit in gs.top_results()
        ]
Ejemplo n.º 17
0
    def search(self, **kwargs):
        """Query Google for each configured site until one result parses.

        Keyword args:
            site: list of site keys to try (default ['imdb','tvdb','none']).
            keys: self.data keys whose values are quoted into the query
                  (default ['title', 'subtitle']).
        """
        # BUG FIX: dict.has_key() was removed in Python 3; use `in`.
        sites = kwargs['site'] if 'site' in kwargs else ['imdb', 'tvdb', 'none']
        keys = kwargs['keys'] if 'keys' in kwargs else ['title', 'subtitle']

        for site in sites:

            query = self.config[site]['query']
            for key in keys:
                query = query + " " + '"' + self.data[key] + '"'

            # BUG FIX: the print statement is a syntax error in Python 3.
            print("[" + query + "]")

            gs = GoogleSearch(query)

            # Stop at the first site that returns anything; hand every
            # result to the site-specific parser until one accepts it.
            if len(gs.result_data['results']) > 0:
                for result in gs.top_results():
                    if getattr(self, '_parse_'+site+'_result')(result):
                        return
                return
Ejemplo n.º 18
0
 def imdb_suggest(self, query):
     """Return IMDb title suggestions for `query`.

     Runs a Google search restricted to imdb.com/title/ and collects the
     hits whose URL matches self.imdb_pattern as {"id", "title"} dicts.
     """
     excludes = []  # section names to exclude from results; currently none
     exclusion_part = ' '.join(['-"%s"' % e for e in excludes])
     google_query = ' '.join([query, exclusion_part, 'site:imdb.com/title/'])
     gs = GoogleSearch(google_query, use_proxy=False)
     suggestions = []
     for hit in gs.top_results():
         match = self.imdb_pattern.match(hit["url"])
         if not match:
             continue
         clean_title = hit["titleNoFormatting"].replace(' - IMDb', '')
         suggestions.append({"id": match.group(1), "title": clean_title})
     logging.debug("google found %d results with query: %s" %
                   (len(suggestions), google_query))
     return suggestions
Ejemplo n.º 19
0
    def gather_pdfs(self):
        """Search Google for self.search, collect the PDF links into
        self.pdfs, and record the first 10 in '<search>/PDFs.txt'.

        NOTE(review): `files` is a module-level helper defined elsewhere.
        """
        google = GoogleSearch()
        google.query(self.search)
        google.get_results()
        self.pdfs = google.get_pdfs()
        print(self.pdfs)

        # Idiom fix: slice instead of the original manual break-counter.
        for pdf in self.pdfs[:10]:
            files.append_data(pdf, self.search + "/PDFs.txt")
Ejemplo n.º 20
0
def get_urls_from_google(terms):
    """Return up to LIMIT result URLs for `terms`, optionally shuffled.

    Retries (after a long sleep) when Google returns an empty result set,
    which has been observed to happen intermittently (possibly timeouts).
    """
    url_list = []

    # BUG FIX: the original referenced the undefined name `term` — the
    # parameter is `terms` — raising NameError at runtime.
    print('Searching the term : ' + terms)
    response = GoogleSearch().search(terms)

    for result in response.results:
        if len(url_list) < LIMIT:  # keep only the top LIMIT links
            url_list.append(result.url)

    if SHUFFLE:  # shuffle url list
        shuffle(url_list)

    if len(url_list) < 1:
        # NOTE(review): sleep(1000) is ~17 minutes if this is time.sleep —
        # confirm the intended unit.
        sleep(1000)
        # BUG FIX: the original discarded the retry's return value and
        # fell through to return the empty list.
        return get_urls_from_google(terms)

    return url_list
Ejemplo n.º 21
0
def SearhcGoogle(string):
    """Read out (via chatvoice) the content snippet of every top Google
    result for `string`.

    NOTE(review): the misspelled name ('Searhc') is kept because callers
    depend on it.
    """
    hits = GoogleSearch(string).top_results()
    for hit in hits:
        chatvoice(hit[u'content'])
Ejemplo n.º 22
0
def url_search(query, lucky=True):
    """Return result URLs for `query`.

    With lucky=True, returns a one-element list holding the top URL
    ("I'm feeling lucky"); otherwise returns all top URLs.

    Raises:
        ValueError: when the underlying search fails with a ProxyError.
    """
    gs = GoogleSearch(query)
    try:
        return [gs.top_url()] if lucky else gs.top_urls()
    except ProxyError as err:
        # Chain the cause so the proxy failure isn't silently lost.
        raise ValueError from err
Ejemplo n.º 23
0
from openpyxl import Workbook, load_workbook
from googlesearch import GoogleSearch

# For each attendee (name in column 1, job title in column 3), search
# Google for their email and print the top result URL.
wb = load_workbook('Attendees.xlsx')
for sheet in wb:
    ws = wb[sheet.title]
    # NOTE(review): the row range 1..289 is hard-coded — confirm it
    # matches the spreadsheet's size.
    for i in range(1, 290):
        nameCell = ws.cell(row=i, column=1)
        jobCell = ws.cell(row=i, column=3)
        search = str(nameCell.value) + " " + str(jobCell.value) + " email"
        gs = GoogleSearch(search)
        # BUG FIX: Python 2 print statements converted to the print
        # function (syntax errors under Python 3).
        print(search)
        print(gs.top_urls()[0])
        print("\n")
Ejemplo n.º 24
0
from googlesearch import GoogleSearch
import CRUD
import time
import threading

input_txt_path_name = 'to_search.txt'
GS = GoogleSearch()


def show_contents():
    """Print every stored content row as 'title: url'."""
    for entry in CRUD.read_content():
        print(entry.title + ': ' + entry.url)


def get_title_and_url_by_keyword(keyword, num_results):
    """Search Google for `keyword` and persist each result's title/url.

    Creates the keyword record when missing; otherwise clears the
    keyword's previously stored contents before saving fresh results.
    """
    existing_keyword = CRUD.read_keyword(keyword)
    if existing_keyword:
        CRUD.delete_content(existing_keyword[0].id)
    else:
        CRUD.create_keyword(keyword)
        # NOTE(review): this appends the *whole* result of read_keyword();
        # if that returns a list, existing_keyword[0] is a list here,
        # unlike the other branch — confirm CRUD.read_keyword's contract.
        existing_keyword.append(CRUD.read_keyword(keyword))

    response = GS.search(keyword, num_results=num_results)
    for result in response.results:
        CRUD.create_content(result.title, result.url, existing_keyword[0])
Ejemplo n.º 25
0
# NOTE(review): exported notebook fragment — `tree` is defined in an
# earlier cell outside this chunk, and the bare `print` statements are
# Python 2 syntax; not independently runnable.
root = tree.getroot()

print root
#print p.xpath('//body')[0].text_content()


# GET https://www.googleapis.com/customsearch/v1?key=INSERT_YOUR_API_KEY&cx=017576662512468239146:omuauf_lfve&q=lectures
# 

# In[24]:

#Sending request to google search engine and getting results from there
from googlesearch import GoogleSearch
from pprint import pprint

# Run a sample query and pretty-print each top result.
gs = GoogleSearch('Neelu')
for hit in gs.top_results():
    pprint(hit)
    print


# In[23]:

#install google and pygoogle
get_ipython().system(u'pip install google')
get_ipython().system(u'pip install pygoogle')


# In[31]:

import google
Ejemplo n.º 26
0
def google(bot, chat_id, text):
  """Send the top Google results for `text` to a Telegram chat, one
  'url -- title' per line, with <b> markup stripped.

  NOTE(review): the '\\n\\r' separator looks like it was meant to be
  '\\r\\n' — confirm before changing.
  """
  rows = ["%s -- %s" % (gsr['url'], gsr['title']) for gsr in GoogleSearch(text).top_results()]
  message_str = "\n\r".join(rows)
  message_str = message_str.replace("<b>", "").replace("</b>", "")
  bot.sendMessage(chat_id=chat_id, text=message_str)
Ejemplo n.º 27
0
 def get_expand(self, word='keyword'):
     """Run a Google search for `word` and return the fetched articles."""
     searcher = GoogleSearch()
     searcher.search(word)
     return searcher.get_articles()
Ejemplo n.º 28
0
from googlesearch import GoogleSearch
from readability.readability import Document
from bs4 import BeautifulSoup
import re
import sys
import requests
import urllib

def remove_non_ascii(text):
    """Return `text` with every non-ASCII character (ord >= 128) removed."""
    return ''.join(ch for ch in text if ord(ch) < 128)


if __name__ == "__main__":
    # BUG FIX: the original mixed tabs and spaces (a TabError under
    # Python 3) and used Python 2 print statements / urllib.urlopen.
    if len(sys.argv) < 2:
        print('no urls found')
        sys.exit(0)
    # Join all command-line words into one query string.
    query = ' '.join(sys.argv[1:])
    gs = GoogleSearch(query)
    urls = gs.top_urls()
    if len(urls) < 1:
        print('no urls found')
        sys.exit(0)
    from urllib.request import urlopen  # Python 3 location of urlopen
    html = urlopen(urls[0]).read()
    soup = BeautifulSoup(Document(html).summary(), "lxml")
    print(remove_non_ascii(soup.get_text()[0:1000]))
Ejemplo n.º 29
0
def test_top_results():
    """The news query should surface the expected headline prefix."""
    hits = GoogleSearch('george osborne hinkley point').top_results()
    found = any(h['titleNoFormatting'].startswith('George Osborne presses')
                for h in hits)
    assert found
Ejemplo n.º 30
0
def imp(text):
    """Return the Google hit count for `text` on a log10 (decibel-like)
    scale."""
    hits = GoogleSearch(text).count()
    return 10 * math.log10(hits)
Ejemplo n.º 31
0
def get_url_by_name(name, elem):
    """Return the goodreads.com/<elem>/show URL for `name`, or 0 when no
    matching URL is found."""
    needle = 'goodreads.com/{0}/show'.format(elem)
    gs = GoogleSearch('site:goodreads.com/{0}/show {1}'.format(elem, name))
    for candidate in gs.top_urls():
        if needle in candidate:
            return str(candidate)
    return 0
Ejemplo n.º 32
0
 def __findGoogle__(self, phrase):
     """Return the titles of the top Google results for `phrase`.

     BUG FIX: the original assigned `matches = hit[...]` inside the loop,
     overwriting the list so only the last title (a bare string) was
     returned.  The sibling __findUrban__ appends each title; this now
     does the same, so a list is always returned.
     """
     gs = GoogleSearch(phrase)
     matches = []
     for hit in gs.top_results():
         matches.append(hit["titleNoFormatting"])
     return matches
Ejemplo n.º 33
0
def top_search_results(text):
    """Return (title, content) pairs for the top Google hits on `text`."""
    gs = GoogleSearch(text)
    return [(hit['title'], hit['content']) for hit in gs.top_results()]
Ejemplo n.º 34
0
def test_top_results_unicode():
    """A CJK query should surface the expected BBC article URL."""
    expected = 'http://www.bbc.com/zhongwen/simp/uk/2015/10/151021_analysis_uk_xi_visit_nuclear_deal'
    results = GoogleSearch('乔治·奥斯本欣克利点').top_results()
    assert any(r['url'] == expected for r in results)
Ejemplo n.º 35
0
#!/usr/bin/env python

# Count Google hits for each line of the input file and write
# "<count>\t<line>" to the output file, throttling between queries.

from codecs import open
from googlesearch import GoogleSearch
from random import uniform
from sys import argv
from time import sleep

if __name__ == '__main__':
    if len(argv) != 3:
        print('ERROR: missing input and output filenames')
        exit(1)
    with open(argv[2], 'w', encoding='utf-8') as outputfile:
        with open(argv[1], 'r', encoding='utf-8') as inputfile:
            processed = 0
            for line in inputfile:
                # BUG FIX: the original encoded the line to bytes and then
                # concatenated it with str — a TypeError on Python 3.
                # codecs.open already yields text, so stay in str.
                line = line.rstrip('\n')
                print(line)
                count = GoogleSearch(line, use_proxy=False).count()
                outputfile.write(str(count) + '\t' + line + '\n')
                processed += 1
                if processed % 100 == 0:
                    print(processed)
                # Random delay to avoid being rate-limited by Google.
                sleep(uniform(7, 10))
 # NOTE(review): fragment of a larger loop body — the opening `for`/`if`
 # and the definitions of r, disasterKeyWord and date live outside this
 # chunk, so it cannot be parsed or run on its own.
         location = json.loads(
             r.text)['results']['places']['focus']['states']
     elif (json.loads(r.text)['results']['places']['focus']['countries'] !=
           []):
         location = json.loads(
             r.text)['results']['places']['focus']['countries']
     else:
         continue
 else:
     continue
 # The first matching place supplies the display name and coordinates.
 name = location[0]['name']
 lat = location[0]['lat']
 lon = location[0]['lon']
 searchText = disasterKeyWord + " " + name + " " + date.strftime("%d %B %Y")
 #print searchText
 search = GoogleSearch(searchText)
 # NOTE(review): sleeps 300 s per iteration — presumably rate limiting.
 # Each top_results() call below may re-issue the search; confirm whether
 # the library caches results.
 time.sleep(300)
 wikiTitle = search.top_results()[0]['title']
 wikiURL = search.top_results()[0]['url']
 wikiContent = search.top_results()[0]['content']
 #wikiSearchList = wikipedia.search(searchText);
 #print wikiSearchList
 # for wikiSearch in wikiSearchList:
 #     if "2016" in wikiSearch :
 #         wikiSearchWord = wikiSearch
 #         break
 #     else:
 #         wikiSearchWord = wikiSearchList[0]
 # if ( wikiSearchWord != "" ):
 #     wikiContent = wikipedia.page(wikiSearch)
 #     wikiTitle = date.strftime("%d %B %Y") + " " + wikiContent.title
Ejemplo n.º 37
0
def search_wikipedia(query):
    """Query Wikipedia and show the top hit; return its URL."""
    gs = GoogleSearch("site:wikipedia.com %s" % query)
    # BUG FIX: Python 2 print statements converted to the print function
    # (syntax errors under Python 3).
    print(gs.top_result()['titleNoFormatting'])
    print(gs.top_url())
    return gs.top_url()
Ejemplo n.º 38
0
from openpyxl import Workbook, load_workbook
from googlesearch import GoogleSearch

# For each attendee (name in column 1, job title in column 3), search
# Google for their email and print the top result URL.
wb = load_workbook('Attendees.xlsx')
for sheet in wb:
    ws = wb[sheet.title]
    # NOTE(review): the row range 1..289 is hard-coded — confirm it
    # matches the spreadsheet's size.
    for i in range(1, 290):
        nameCell = ws.cell(row=i, column=1)
        jobCell = ws.cell(row=i, column=3)
        search = str(nameCell.value) + " " + str(jobCell.value) + " email"
        gs = GoogleSearch(search)
        # BUG FIX: Python 2 print statements converted to the print
        # function (syntax errors under Python 3).
        print(search)
        print(gs.top_urls()[0])
        print("\n")
Ejemplo n.º 39
0
def Google1(string):
    """Speak (via chatvoice) the content snippet of the first Google hit
    for `string` only; do nothing when there are no results."""
    _missing = object()
    first = next(iter(GoogleSearch(string).top_results()), _missing)
    if first is not _missing:
        chatvoice(first[u'content'])