def chanmsg_handler(channel, nick, cmd, args):
    g = sys.modules['gg']
    action = ""
    q = " "
    for i in args:
        q += i + " "
    spell = google.doSpellingSuggestion(q)
    if spell != None:
        action = 'PRIVMSG %s :Perhaps you mean %s?\r\n' % (channel, spell)
        data = google.doGoogleSearch(spell)
        action += 'PRIVMSG %s :Then it would be %s .. else:\r\n' % (channel, data.results[0].URL)
    data = google.doGoogleSearch(q)
    action += 'PRIVMSG %s :%s\r\n' % (channel, data.results[0].URL)
    return action
def on_IM_IN(self, data):
    # data contains "screenname:flag:message", such as
    # "jamwt:F:hey, ben.. how's work?"
    data_components = data.split(":", 2)  # maxsplit for handling
                                          # in-message colons
    screenname = data_components[0]  # the sender's screenname
    message = data_components[2]     # in case the sender
                                     # used a colon in their
                                     # message

    # TocTalk also includes a special helper function called
    # strip_html(). Many AIM clients like Windows AIM use HTML
    # code. strip_html() will remove HTML tags and make it text
    message = self.strip_html(message)

    # Perform the Google search
    data = google.doGoogleSearch(message)

    # Format the results. For now, just the top 3...
    response = """\nSearch: %s\nResults:\n""" % message
    for res in data.results[0:3]:
        response = response + """<a href="%s">%s</a>\n""" % (res.URL, res.title)

    # Send the results back to the user
    self.do_SEND_IM(screenname, response)
def google_handler(t, s, p):
    p = re.search('^(((([0-9]{1,5})(\-([0-9]{1,5}))?)|next))?(\ ?.+)?$', p)
    if p:
        p = p.groups()
    else:
        s.syntax(t, 'google')
        return
    n = p[1]
    start = p[3]
    if start == '0':
        start = 1
    finish = p[5]
    q = p[6]
    if n == 'next':
        if G_CACHE.has_key(s.jid):
            start, finish, q = G_CACHE[s.jid]
            d = finish - start + 1
            start, finish = start + d, finish + d
        else:
            s.lmsg(t, 'google_history_notfound')
            return
    if not start:
        start = '1'
    if not finish:
        finish = start
    if not q:
        s.lmsg(t, 'google?')
        return
    G_CACHE[s.jid] = (int(start), int(finish), q)
    #s.msg(t, u'%s: %s-%s' % (q, start, finish))
    x = google.doGoogleSearch(q, start=int(start) - 1,
                              maxResults=min(int(finish) - int(start) + 1, 10),
                              filter=1)
    if x.results:
        s.lmsg(t, 'google_results',
               show_list([lang.msg('google_result',
                                   (htmldecode(i.snippet), i.URL, i.cachedSize),
                                   lang.getLang(s.jid)) for i in x.results]),
               x.meta.estimatedTotalResultsCount)
    else:
        s.lmsg(t, 'google_no_results')
def produce_entries(self):
    """
    Produce feed entries from Google product item data.
    """
    # Start off with an empty list for entries.
    entries = []

    # Execute the Google search
    data = google.doGoogleSearch(self.search_query,
                                 license_key=self.license_key)

    # Run through all fetched items, building entries
    for result in data.results:
        # Map the web search result data to feed entry properties
        entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
            'title': result.directoryTitle or '(untitled)',
            'link': result.URL,
            'summary': result.snippet,
        })

        # Append completed entry to list
        entries.append(entry)

    return entries
def queryViaSoapApi(self, query):
    import google
    google.LICENSE_KEY = config.google_key
    offset = 0
    estimatedTotalResultsCount = None
    while not estimatedTotalResultsCount \
            or offset < estimatedTotalResultsCount:
        while (True):
            # Google often yields 502 errors.
            try:
                pywikibot.output(u'Querying Google, offset %i' % offset)
                data = google.doGoogleSearch(query, start=offset, filter=False)
                break
            except KeyboardInterrupt:
                raise
            except:
                # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
                # can happen here, depending on the module used. It's not easy
                # to catch this properly because pygoogle decides which one of
                # the soap modules to use.
                pywikibot.output(u"An error occurred. Retrying in 10 seconds...")
                time.sleep(10)
                continue
        for result in data.results:
            #print 'DBG: ', result.URL
            yield result.URL
        # give an estimate of pages to work on, but only once.
        if not estimatedTotalResultsCount:
            pywikibot.output(u'Estimated total result count: %i pages.'
                             % data.meta.estimatedTotalResultsCount)
            estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
            #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
        offset += 10
def handleCommand(self, bot, nick, channel, argument=None):
    if argument is None:
        return
    try:
        data = google.doGoogleSearch('rfc: ' + argument)
    except:
        bot.msg(channel, 'Exception in google.doGoogleSearch()')
        # bail out here, otherwise the check below would hit an undefined name
        return
    if data is not None:
        bot.msg(channel, data.results[0].URL)
def testMetaLong(self):
    """--meta should return meta information"""
    google.main(["--meta", "-s", self.q])
    commandLineAnswer = self.lastOutput()
    commandLineAnswer = commandLineAnswer[:commandLineAnswer.index('searchTime')]
    google._output(google.doGoogleSearch(self.q), self.metaparams)
    realAnswer = self.lastOutput()
    realAnswer = realAnswer[:realAnswer.index('searchTime')]
    self.assertEqual(commandLineAnswer, realAnswer)
def getSearchResultIndex(q):
    d = google.doGoogleSearch(q)
    c = 1
    index = None
    for each in d.results:
        if each.URL.lower().startswith('http://www.peterbe.com'):
            index = c
            break
        c += 1
    return index
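# A hypothetical usage sketch (not from the original source): it assumes a valid
# pygoogle license key has already been set, as the other snippets here do, and
# the query string is made up for illustration.
google.setLicense('...')                      # must get your own key
rank = getSearchResultIndex('python zope')    # made-up query
print rank  # 1-based position of www.peterbe.com in the first result page, or None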
def webtrawl():
    alltheweb_prefix = "http://www.alltheweb.com/search?cat=web&o=0&_sb_lang=any&q=link:"
    yahoo_prefix = "http://search.yahoo.com/search?p=link:"
    start = strftime("%Y-%m-%d %H:%M:%S", localtime())
    alltheweb_total = 0
    yahoo_total = 0
    google_total = 0
    for url in urls:
        try:
            # You can't use any other search terms when you use "link:" in Google.
            google_results = google.doGoogleSearch("link:" + url)
            google_count = google_results.meta.estimatedTotalResultsCount
            google_total += int(google_count)
        except:
            google_count = '!'
        goog = etree.SubElement(gmain, "Search")
        goog.set("URL", url)
        goog.text = str(google_count)
        goog.tail = "\n"
        for term in ['']:
            print term
            print url
            start_time = strftime("%Y-%m-%d %H:%M:%S", localtime())
            try:
                alltheweb_results = urllib.urlopen(alltheweb_prefix + url + "+" + term).read()
                alltheweb_count = re.search('<span class="ofSoMany">(.+?)</span>', alltheweb_results).group(1)
                alltheweb_count = string.replace(alltheweb_count, ',', '')
                alltheweb_total += int(alltheweb_count)
            except:
                alltheweb_count = '!'
            atw = etree.SubElement(amain, "Search")
            atw.set("URL", url)
            atw.set("Term", term)
            atw.text = alltheweb_count
            atw.tail = "\n"
            try:
                yahoo_results = urllib.urlopen(yahoo_prefix + url + "+" + term).read()
                yahoo_count = re.search('of about (\S+)', yahoo_results).group(1)
                yahoo_count = string.replace(yahoo_count, ',', '')
                yahoo_total += int(yahoo_count)
            except:
                yahoo_count = '!'
            yah = etree.SubElement(ymain, "Search")
            yah.set("URL", url)
            yah.set("Term", term)
            yah.text = yahoo_count
            yah.tail = "\n"
            print start_time + "\t" + url + "\n" + alltheweb_count + "\n" + `google_count` + "\n" + yahoo_count + "\n"
def _doGoogleSearch(self, query, start=0, maxResults=10, filter=1,
                    restrict='', safeSearch=0, language='',
                    inputencoding='UTF-8', outputencoding='UTF-8',
                    http_proxy=None, license_key=''):
    #doGoogleSearch
    return self._formatGoogleData(google.doGoogleSearch(query, start, maxResults,
                                                        filter, restrict, safeSearch,
                                                        language, inputencoding,
                                                        outputencoding, license_key,
                                                        http_proxy))
def getGoogleTotalResultsCount(s):
    google.setLicense('...')  # must get your own key!
    ustr = unicode(s, 'shiftjis')
    flg = True
    while flg:
        try:
            data = google.doGoogleSearch(ustr)
        except:
            print sys.exc_info()
            time.sleep(5)
        else:
            flg = False
    return data.meta.estimatedTotalResultsCount
def getGoogleTotalResultsCount(s):
    keys = open("./googlelicense.txt", "r").read().rstrip("\n").split("\n")
    google.setLicense(random.choice(keys))  # must get your own key!
    ustr = unicode(s, 'shiftjis')
    fsuccess = False
    while fsuccess == False:
        try:
            data = google.doGoogleSearch(ustr)
        except:
            print sys.exc_info()
            time.sleep(5)
        else:
            fsuccess = True
    return data.meta.estimatedTotalResultsCount
def calculateIndex(term, domain_name, start=0, checks=0):
    try:
        d = google.doGoogleSearch(term, start=start)
    except UnicodeDecodeError:
        term = unicode(term, 'iso-8859-1')
        d = google.doGoogleSearch(term, start=start)
    msg = "term:%s (start:%s, checks:%s)" % (term, start, checks)
    #LOG("calculateIndex()", INFO, msg)
    checks += 1
    c = 1
    index = None
    domain_name = domain_name.lower()
    if not domain_name.startswith('http://'):
        domain_name = 'http://%s' % domain_name
    for each in d.results:
        url = each.URL.lower()
        if url.startswith(domain_name):
            return c + start, checks
        c += 1
    if start < GIVE_UP_LIMIT:
        return calculateIndex(term, domain_name, start + 10, checks)
    else:
        return None, checks
def google_query(query, searchlimit, destination):
    try:
        data = google.doGoogleSearch(query, start=0, maxResults=searchlimit,
                                     filter=1, restrict='',
                                     safeSearch=option["safesearch"])
        stripper = __Stripper()
        for item in data.results:
            item.title = stripper.strip(item.title)
            item.snippet = stripper.strip(item.snippet)
            if option["resultsintab"] == True:
                destination.prnt(color["red"] + item.title + " " +
                                 color["black"] + item.snippet + " " +
                                 color["blue"] + item.URL)
            else:
                destination.command("say " + color["red"] + item.title + " " +
                                    color["black"] + item.snippet + " " +
                                    color["blue"] + item.URL)
    except Exception, args:
        print color["red"], Exception, args
    return
def handleCommand(self, bot, nick, channel, argument=None):
    if argument is None:
        return
    if argument.find(' ') != -1:
        argument = '"' + argument + '"'
    try:
        data = google.doGoogleSearch(argument)
    except:
        bot.msg(channel, 'Exception in google.doGoogleSearch()')
        return
    links = ''
    if data is not None:
        for i in range(5):
            # guard against fewer than five results
            if len(data.results) > i:
                links += data.results[i].URL + ' | '
        bot.msg(channel, links[:-3])
def doGoogleSearch(self, query, start=0, maxResults=10, filter=1, restrict='',
                   safeSearch=0, language='', inputencoding='UTF-8',
                   outputencoding='UTF-8', http_proxy=None):
    #doGoogleSearch
    google.setLicense(self.license_key)
    l_data = google.doGoogleSearch(query, start, maxResults, filter, restrict,
                                   safeSearch, language, inputencoding,
                                   outputencoding, self.license_key, http_proxy)
    l_meta = {
        'documentFiltering': l_data.meta.documentFiltering,
        'searchComments': l_data.meta.searchComments,
        'estimatedTotalResultsCount': l_data.meta.estimatedTotalResultsCount,
        'estimateIsExact': l_data.meta.estimateIsExact,
        'searchQuery': l_data.meta.searchQuery,
        'startIndex': l_data.meta.startIndex,
        'endIndex': l_data.meta.endIndex,
        'searchTips': l_data.meta.searchTips,
        'directoryCategories': l_data.meta.directoryCategories,
        'searchTime': l_data.meta.searchTime,
    }
    l_result = []
    for r in l_data.results:
        l_result.append({
            'URL': r.URL,
            'title': r.title,
            'snippet': r.snippet,
            'cachedSize': r.cachedSize,
            'relatedInformationPresent': r.relatedInformationPresent,
            'hostName': r.hostName,
            'directoryCategory': r.directoryCategory,
            'directoryTitle': r.directoryTitle,
            'summary': r.summary,
        })
    return (l_meta, l_result)
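# A hypothetical call for illustration (not from the original source): it assumes
# an instance of the surrounding wrapper class, here named `wrapper`, constructed
# with a valid license_key. The point of the wrapper is that it returns plain
# dicts and lists instead of pygoogle's result objects, so the data is easy to
# serialize (e.g. over XML-RPC or SOAP).
meta, hits = wrapper.doGoogleSearch('python soap api', maxResults=5)
print meta['estimatedTotalResultsCount']
for hit in hits:
    print hit['URL'], '-', hit['title']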
def google2():
    terms.remove("")
    for term in terms:
        for g in grest:
            start_time = strftime("%Y-%m-%d %H:%M:%S", localtime())
            try:
                results = google.doGoogleSearch(term, restrict=g)
                count = results.meta.estimatedTotalResultsCount
            except:
                count = '!'
            print start_time + "\t" + term + "\t" + g + "\n" + `count` + "\n"
            goog = etree.SubElement(googlecc, "Search")
            goog.set("Term", term)
            goog.set("Rest", g)
            goog.text = str(count)
            goog.tail = "\n"
    terms.append("")
    terms.sort()
def google_search(query):
    try:
        data = google.doGoogleSearch(query)
    except SOAP.HTTPError:
        return '\r\nGoogle API Error.'
    except SOAP.faultType:
        return '\r\nInvalid Google Key. Maybe still default??\r\nTake a look on modules/googlekey.txt'
    try:
        first = data.results[0]
        url = first.URL
        title = google_remove_html(first.title)
        if first.summary:
            summary = google_remove_html(first.summary)
        else:
            summary = google_remove_html(first.snippet)
        searchtime = str(round(data.meta.searchTime, 3))
        total = str(data.meta.estimatedTotalResultsCount)
        return url + ' - ' + title + ' - ' + summary + ' (' + searchtime + 'sec) (' + total + ' sites)'
    except:
        return 'No Results'
def go_get_googles(filetype, results_desired=10):
    search_string = "e filetype:%s" % (filetype)
    if results_desired <= 10:
        batches = 1
    # the google api only supports retrieving
    # 10 results per search so we have to batch
    # the requests
    if results_desired > 10:
        if ((results_desired % 10) != 0):     # if there is a remainder
            batches = (results_desired / 10) + 1  # then round up
        else:
            batches = (results_desired / 10)
    urls = []
    for inc in range(0, batches):
        googles = google.doGoogleSearch(search_string, (inc * 10), 10)
        rl = len(googles.results)
        for x in range(0, len(googles.results)):
            urls.append(uriobj(googles.results[x].URL))
    #pt(urls)
    print "Doing:", batches, "batches for", len(urls), "files found."
    return urls  # returns a list of uriobj's
def soap(self, engine, query, url, numresults=10):
    print " %s query..." % engine.capitalize()
    search_request_retry = config.copyright_connection_tries
    query_success = False
    while search_request_retry:
        try:
            if engine == 'google':
                import google
                google.LICENSE_KEY = config.google_key
                data = google.doGoogleSearch(
                    '%s "%s"' % (no_result_with_those_words, query))
                for entry in data.results:
                    self.add_in_urllist(url, entry.URL, 'google', entry.cachedSize)
                self.num_google_queries += 1
            elif engine == 'yahoo':
                import yahoo.search.web
                data = yahoo.search.web.WebSearch(
                    config.yahoo_appid,
                    query='"%s" %s' % (query.encode('utf_8'),
                                       no_result_with_those_words),
                    results=numresults)
                for entry in data.parse_results():
                    cacheurl = None
                    if entry.Cache:
                        cacheurl = entry.Cache.Url
                    self.add_in_urllist(url, entry.Url, 'yahoo', cacheurl)
                self.num_yahoo_queries += 1
            elif engine == 'msn':
                #max_query_len = 150?
                from SOAPpy import WSDL
                try:
                    server = WSDL.Proxy(
                        'http://soap.search.msn.com/webservices.asmx?wsdl')
                except Exception, err:
                    error("Live Search Error: %s" % err)
                    raise
                params = {
                    'AppID': config.msn_appid,
                    'Query': '%s "%s"' % (no_result_with_those_words, query),
                    'CultureInfo': region_code,
                    'SafeSearch': 'Off',
                    'Requests': {
                        'SourceRequest': {
                            'Source': 'Web',
                            'Offset': 0,
                            'Count': 10,
                            'ResultFields': 'All',
                        }
                    }
                }
                results = ''
                server_results = server.Search(Request=params)
                if server_results.Responses[0].Results:
                    results = server_results.Responses[0].Results[0]
                if results:
                    # list or instance?
                    if type(results) == list:
                        for entry in results:
                            cacheurl = None
                            if hasattr(entry, 'CacheUrl'):
                                cacheurl = entry.CacheUrl
                            self.add_in_urllist(url, entry.Url, 'msn', cacheurl)
                    else:
                        cacheurl = None
                        if hasattr(results, 'CacheUrl'):
                            cacheurl = results.CacheUrl
                        self.add_in_urllist(url, results.Url, 'msn', cacheurl)
                    self.num_msn_queries += 1
            search_request_retry = 0
            query_success = True
        except KeyboardInterrupt:
            raise
def search(args):
    return google.doGoogleSearch(*args).results
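# A hypothetical usage sketch (not from the original source): it assumes the
# license key has been set beforehand and that the tuple simply supplies
# doGoogleSearch's positional arguments; the query is made up for illustration.
results = search(('python soap api',))   # extra positionals such as start could follow
for r in results:
    print r.URL, '-', r.title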
def testSearchDefault(self):
    """no options + search phrase should search"""
    google.main([self.q])
    commandLineAnswer = self.lastOutput()
    google._output(google.doGoogleSearch(self.q), self.searchparams)
    self.assertEqual(commandLineAnswer, self.lastOutput())
def testSearchLong(self):
    """--search should search"""
    google.main(["--search", self.q])
    commandLineAnswer = self.lastOutput()
    google._output(google.doGoogleSearch(self.q), self.searchparams)
    self.assertEqual(commandLineAnswer, self.lastOutput())
            word2, index = self.parseWord(argument_list, index)
        except self.WordParseError, arg:
            raise arg
        return word1, word2

    def handleCommand(self, bot, nick, channel, argument=None):
        if argument is None:
            bot.msg(channel, 'Syntax Fehler.')
            return
        try:
            word1, word2 = self.parseWords(argument)
        except self.WordParseError, arg:
            bot.msg(channel, arg)
            return
        try:
            word1_data = google.doGoogleSearch(word1)
            word2_data = google.doGoogleSearch(word2)
        except:
            bot.msg(channel, 'Exception in google.doGoogleSearch()')
            return
        if word1_data.meta.estimatedTotalResultsCount > word2_data.meta.estimatedTotalResultsCount:
            bot.msg(channel, word1 + ' hat gewonnen.')
        elif word2_data.meta.estimatedTotalResultsCount > word1_data.meta.estimatedTotalResultsCount:
            bot.msg(channel, word2 + ' hat gewonnen.')
        else:
            bot.msg(channel, 'Unentschieden.')


class RFCPlugin(Plugin):
    command = 'RFC'
# Create the query term
keyword = "inurl:"
query = keyword + domain

# Start the query loop
potentials = []
for i in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]:
    # Perform the query ten times, taking 10 results each time
    # and putting them into our potentials list
    data = google.doGoogleSearch(query, start=(i))
    for result in data.results:
        potentials.append(result.URL)

# Initialize a second list to hold unique, cleaned entries
refined = []

# Do the cleaning work and populate the new list
for i in potentials:
    i = i.replace('http://', '')
    i = i.replace('https:', '')
    keep = i.split('/')
# Failed
import google

# to search
query = "Geeksforgeeks"

for j in google.doGoogleSearch(query, 0, 10, 1):
    # (query, tld="co.in", num=10, stop=1, pause=2):
    print(j)
def _search_serie_api(self, searchterms):
    """Search for a serie using the Google SOAP API"""
    d = google.doGoogleSearch("site:epguides.com %s" % searchterms)
    return d.results[0].URL
def createSemanticWeb(self, term='', language='english', languages=[], googleLicenseKey=''):
    import helper, analyser, re, google
    from operator import itemgetter

    # initialise keyword dictionary
    keywords = {}

    # initialise query string
    query = term + ' site:en.wikipedia.org'

    # initialise error string
    error = 0
    errorMessage = ''

    # Google licence key
    google.LICENSE_KEY = googleLicenseKey

    # query Google
    try:
        data = google.doGoogleSearch(query)
        wikiPage = data.results[0]
    except:
        error = 1
        errorMessage = 'Sorry, an error occurred during this request, please try again'

    # get corpus
    corpusHelper = helper.CorpusHelper()
    corpusInfo = corpusHelper.getCorpus(wikiPage.URL)
    error = corpusInfo['error']
    errorMessage = corpusInfo['errorMessage']
    corpus = corpusInfo['corpus']
    charset = corpusInfo['charset']

    # if retrieval was successful
    if error != 1:
        # tokenizer
        tokenizer = analyser.Tokenizer()

        # initialise corpus helper
        corpusHelper = helper.CorpusHelper()

        # get corpus information
        corpusInfo = corpusHelper.getLinkedTerms(tokenizer, corpus, language, languages)
        tokenizedCorpus = corpusInfo['tokenizedCorpus']
        linkedTerms = corpusInfo['linkedTerms']

        # calculate some average values
        tokenCount = len(tokenizedCorpus)

        # word structure
        wordStructure = analyser.WordStructure()

        # text structure
        textStructure = analyser.TextStructure()

        # get N-grams
        ngrams = textStructure.getNGrams(tokenizedCorpus, tokenCount)
        mostFrequentUnigrams = ngrams['mostFrequentUnigrams']
        mostFrequentBigrams = ngrams['mostFrequentBigrams']
        mostFrequentTrigrams = ngrams['mostFrequentTrigrams']

        # add unigrams to keyword dictionary
        for ngram in mostFrequentUnigrams:
            keywords[ngram] = mostFrequentUnigrams[ngram]

        # add bigrams to keyword dictionary
        for ngram in mostFrequentBigrams:
            keywords[ngram] = mostFrequentBigrams[ngram]

        # add trigrams to keyword dictionary
        for ngram in mostFrequentTrigrams:
            keywords[ngram] = mostFrequentTrigrams[ngram]

        # build return dictionary
        returnDict = dict(error=error, errorMessage=errorMessage, charset=charset,
                          tokenCount=tokenCount,
                          keywords=sorted(keywords.iteritems(), key=itemgetter(1), reverse=True),
                          debug=0, wikiPage=wikiPage, term=term, linkedTerms=linkedTerms)

    # if URL retrieval error
    else:
        # build return dictionary
        returnDict = dict(error=error, errorMessage=errorMessage, fallback=fallback,
                          debug=1, term=term)

    # return values
    return returnDict
#!/usr/bin/python
# filename: google.py
# usage: python google.py <query>

import sys, string, codecs, re
import google

if sys.argv[1:]:
    query = sys.argv[1]
else:
    sys.exit('Usage: python google.py <query>')

google.LICENSE_KEY = 'insert key here'
data = google.doGoogleSearch(query)

# wrap stdout so UTF-8 results print cleanly
sys.stdout = codecs.lookup('utf-8')[-1](sys.stdout)

for result in data.results:
    title = result.title
    URL = result.URL
    snippet = result.snippet
    # strip the HTML markup Google puts around matched terms
    regex = re.compile('<[^>]*>')
    title = regex.sub(r'', title)
    snippet = regex.sub(r'', snippet)
    print string.join((title, URL, snippet), "\n"), "\n"
def testLuckyLong(self):
    """--lucky should return only first result"""
    google.main(["--lucky", "-s", self.q])
    commandLineAnswer = self.lastOutput()
    google._output(google.doGoogleSearch(self.q), self.luckyparams)
    self.assertEqual(commandLineAnswer, self.lastOutput())
import urllib2
from bs4 import BeautifulSoup  # assumed bs4, given the "html.parser" argument below

request = urllib2.Request(top_hindi_link)
response = urllib2.urlopen(request)
soup = BeautifulSoup(response, "html.parser")
type(soup)

for div in soup.findAll('div', {'class': 'info'}):
    for b in soup.findAll('b'):
        for a in b.findAll('a'):
            print a.text

links = []
titles = []
for div in soup.findAll('div', {'class': 'info'}):
    for b in div.findAll('b'):
        for a in b.findAll('a'):
            titles.append(a.text)
            links.append(a['href'])
            print a.text

print 'you collected ' + str(len(links)) + ' links.'
print
print 'you collected ' + str(len(titles)) + ' titles.'
str(links[0])

import google
g = google.doGoogleSearch('Titanic film wikipedia')
g.pages = 5
print '*Found %s results*' % (g.get_result_count())
g.get_urls()
def testReverseLong(self):
    """--reverse should reverse results"""
    google.main(["--reverse", "-s", self.q])
    commandLineAnswer = self.lastOutput()
    google._output(google.doGoogleSearch(self.q), self.reverseparams)
    self.assertEqual(commandLineAnswer, self.lastOutput())
"""Relevant method: meta""" import google google.LICENSE_KEY = '...' # must get your own! data = google.doGoogleSearch('python') data.
def getSimilarDocuments(self, url='', plainText='', language='automatic', **kw):
    from core import analyser, helper
    import google

    # Google licence key
    google.LICENSE_KEY = self.googleLicenseKey

    # initialise error string
    error = 0
    errorMessage = ''

    # get corpus
    corpusHelper = helper.CorpusHelper()
    corpusInfo = corpusHelper.getCorpus(url, plainText)
    error = corpusInfo['error']
    errorMessage = corpusInfo['errorMessage']
    corpus = corpusInfo['corpus']
    charset = corpusInfo['charset']
    url = corpusInfo['url']

    # if retrieval was successful
    if error != 1:
        # tokenizer
        tokenizer = analyser.Tokenizer()

        # initialise corpus helper
        corpusHelper = helper.CorpusHelper()

        # get tokenized corpus
        corpusInfo = corpusHelper.getTokenizedCorpus(tokenizer, corpus, language, self.languages)
        tokenizedCorpus = corpusInfo['tokenizedCorpus']

        # get token count
        tokenCount = len(tokenizedCorpus)

        # analyse text structure
        textStructure = analyser.TextStructure()

        # get N-grams
        ngrams = textStructure.getNGrams(tokenizedCorpus, tokenCount)
        mostFrequentUnigrams = ngrams['mostFrequentUnigrams']
        mostFrequentBigrams = ngrams['mostFrequentBigrams']
        mostFrequentTrigrams = ngrams['mostFrequentTrigrams']
        mostFrequentBigramsWithStopWords = ngrams['mostFrequentBigramsWithStopWords']
        mostFrequentTrigramsWithStopWords = ngrams['mostFrequentTrigramsWithStopWords']

        # get keywords
        keywords = list(textStructure.getKeywords(mostFrequentUnigrams, mostFrequentBigrams,
                                                  mostFrequentTrigrams,
                                                  mostFrequentBigramsWithStopWords,
                                                  mostFrequentTrigramsWithStopWords))

        # build string for Google query
        googleHelper = helper.GoogleHelper()
        query = googleHelper.getGoogleQuery(mostFrequentUnigrams)

        # query Google
        try:
            data = google.doGoogleSearch(query)
            similarDocuments = data.results
        except:
            if url != 'text':
                url = '/getSimilarDocuments/?url=' + url
            else:
                url = '/getSimilarDocuments/?plainText=' + plainText
            similarDocuments = []
            error = 1
            errorMessage = 'Sorry, an error occurred during this request, please try again'

        # build return dictionary
        returnDict = dict(error=error, errorMessage=errorMessage, languages=self.languages,
                          similarDocuments=similarDocuments, debug=0, url=url)

    # if URL retrieval error
    else:
        # build return dictionary
        returnDict = dict(error=error, errorMessage=errorMessage, languages=self.languages,
                          debug=1, url=url)

    # return values
    return returnDict