Example #1
def calculateIndex(term, domain_name, start=0, checks=0):
    try:
        d = google.doGoogleSearch(term, start=start)
    except UnicodeDecodeError:
        term = unicode(term, 'iso-8859-1')
        d = google.doGoogleSearch(term, start=start)

    msg = "term:%s (start:%s, checks:%s)"%(term, start, checks)
    #LOG("calculateIndex()", INFO, msg)
    checks += 1
    c = 1
    index = None
    domain_name = domain_name.lower()
    if not domain_name.startswith('http://'):
        domain_name = 'http://%s'%domain_name
    for each in d.results:
        url = each.URL.lower()
        if url.startswith(domain_name):
            return c+start, checks
        c += 1
    if start < GIVE_UP_LIMIT:
        return calculateIndex(term, domain_name, start+10, checks)
    else:
        return None, checks
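
A minimal usage sketch for the function above; the query, the domain, and the GIVE_UP_LIMIT constant are assumptions, not part of the original example:

import google
google.setLicense('...')  # must get your own key
GIVE_UP_LIMIT = 50  # hypothetical cut-off the function expects to exist

position, checks = calculateIndex('python blog', 'www.peterbe.com')
if position is None:
    print "no hit within the limit (%s queries made)" % checks
else:
    print "ranked #%s after %s queries" % (position, checks)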
Example #2
def chanmsg_handler(channel, nick, cmd, args):
	g = sys.modules['gg']
	action = ""
	q = " "
	for i in args: q += i+" "
	spell = google.doSpellingSuggestion(q)
	if spell is not None:
		action = 'PRIVMSG %s :Perhaps you mean %s?\r\n' % (channel, spell)
		data = google.doGoogleSearch(spell)
		action += 'PRIVMSG %s :Then it would be %s .. else:\r\n' % (channel, data.results[0].URL)	
	data = google.doGoogleSearch(q)
	action += 'PRIVMSG %s :%s\r\n' % (channel, data.results[0].URL)
	return action
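
Example #2 hinges on doSpellingSuggestion(), which returns None when Google has no alternative spelling. A standalone sketch with a made-up misspelling, assuming a license key is already set:

suggestion = google.doSpellingSuggestion('pyhton tutorail')  # hypothetical query
if suggestion is not None:
    print "did you mean: %s" % suggestion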
Example #3
    def produce_entries(self):
        """
        Produce feed entries from Google product item data.
        """
        # Start off with an empty list for entries.
        entries = []

        # Execute the Google search
        data = google.doGoogleSearch(self.search_query, 
                license_key=self.license_key)
        
        # Run through all fetched items, building entries
        for result in data.results:
            
            # Map the web search result data to feed entry properties
            entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
                'title'    : result.directoryTitle or '(untitled)',
                'link'     : result.URL,
                'summary'  : result.snippet,
            })

            # Append completed entry to list
            entries.append(entry)

        return entries
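
A sketch of consuming the entries built above, assuming FeedEntryDict supports dict-style reads (its init_dict constructor argument suggests it does) and that feed is a hypothetical instance of the surrounding class:

for entry in feed.produce_entries():
    print entry['title'], '->', entry['link']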
Example #4
    def on_IM_IN(self,data):
        # data contains "screenname:flag:message", such as
        # "jamwt:F:hey, ben.. how's work?"
        data_components = data.split(":",2) #maxsplit for handling 
                                            #in-message colons
        
        screenname = data_components[0]  # the sender's screenname
        message = data_components[2]     # in case the sender
                                         # used a colon in their
                                         # message

        # TocTalk also includes a special helper function called
        # strip_html().  Many AIM clients like Windows AIM use HTML
        # code.  strip_html() will remove HTML tags and make it text
        message = self.strip_html(message)

        # Perform the Google search
        data = google.doGoogleSearch(message)

        # Format the results.  For now, just the top 3...
        response = """\nSearch: %s\nResults:\n""" % message
        for res in data.results[0:3]:
            response = response + """<a href="%s">%s</a>\n""" % (res.URL, res.title)

        # Send the results back to the user
        self.do_SEND_IM(screenname, response)
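
The slice data.results[0:3] is what caps the reply at three hits; slicing never raises, even when fewer results come back. A quick illustration:

results = ['first', 'second']
print results[0:3]  # ['first', 'second'] -- safe with fewer than three items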
Example #5
def google_handler(t, s, p):
 p = re.search('^(((([0-9]{1,5})(\-([0-9]{1,5}))?)|next))?(\ ?.+)?$', p)
 if p: p = p.groups()
 else:
  s.syntax(t, 'google')
  return
 n = p[1]
 start = p[3]
 if start == '0': start = 1
 finish = p[5]
 q = p[6]
 if n == 'next':
  if G_CACHE.has_key(s.jid):
   start, finish, q = G_CACHE[s.jid]
   d = finish-start+1
   start, finish = start+d, finish+d
  else:
   s.lmsg(t, 'google_history_notfound')
   return
 if not start: start = '1'
 if not finish: finish = start
 if not q:
  s.lmsg(t, 'google?')
  return
 G_CACHE[s.jid] = (int(start), int(finish), q)
 #s.msg(t, u'%s: %s-%s' % (q, start, finish))
 x = google.doGoogleSearch(q, start=int(start)-1, maxResults=min(int(finish)-int(start)+1, 10), filter=1)
 if x.results:
  s.lmsg(t, 'google_results', show_list([lang.msg('google_result', (htmldecode(i.snippet), i.URL, i.cachedSize), lang.getLang(s.jid)) for i in x.results]), x.meta.estimatedTotalResultsCount)
 else: s.lmsg(t, 'google_no_results')
Example #6
 def doGoogleSearch(self, query, start = 0, maxResults = 10, filter = 1, restrict = '', safeSearch = 0, language = '', inputencoding = 'UTF-8', outputencoding = 'UTF-8', http_proxy=None):
     #doGoogleSearch
     google.setLicense(self.license_key)
     l_data = google.doGoogleSearch( query, start, maxResults, filter, restrict, safeSearch, language, inputencoding, outputencoding, self.license_key, http_proxy)
     l_meta = {
         'documentFiltering' : l_data.meta.documentFiltering,
         'searchComments' : l_data.meta.searchComments,
         'estimatedTotalResultsCount' : l_data.meta.estimatedTotalResultsCount,
         'estimateIsExact' : l_data.meta.estimateIsExact,
         'searchQuery' : l_data.meta.searchQuery,
         'startIndex' : l_data.meta.startIndex,
         'endIndex' : l_data.meta.endIndex,
         'searchTips' : l_data.meta.searchTips,
         'directoryCategories' : l_data.meta.directoryCategories,
         'searchTime' : l_data.meta.searchTime,
     }
     l_result = []
     for r in l_data.results:
         l_result.append(
             {
                 'URL' : r.URL,
                 'title' : r.title,
                 'snippet' : r.snippet,
                 'cachedSize' : r.cachedSize,
                 'relatedInformationPresent' : r.relatedInformationPresent,
                 'hostName' : r.hostName,
                 'directoryCategory' : r.directoryCategory,
                 'directoryTitle' : r.directoryTitle,
                 'summary' : r.summary,
             }
         )
     return (l_meta, l_result)
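
This wrapper flattens pygoogle's result objects into plain dictionaries, which makes the payload easy to serialize. A usage sketch, where wrapper is a hypothetical instance with a valid license_key:

meta, results = wrapper.doGoogleSearch('python soap')  # hypothetical query
print meta['estimatedTotalResultsCount']
for r in results[:3]:
    print r['title'], r['URL']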
Example #7
    def produce_entries(self):
        """
        Produce feed entries from Google product item data.
        """
        # Start off with an empty list for entries.
        entries = []

        # Execute the Google search
        data = google.doGoogleSearch(self.search_query,
                                     license_key=self.license_key)

        # Run through all fetched items, building entries
        for result in data.results:

            # Map the web search result data to feed entry properties
            entry = FeedEntryDict(date_fmt=self.date_fmt,
                                  init_dict={
                                      'title': result.directoryTitle
                                      or '(untitled)',
                                      'link': result.URL,
                                      'summary': result.snippet,
                                  })

            # Append completed entry to list
            entries.append(entry)

        return entries
Example #8
    def queryViaSoapApi(self, query):
        import google
        google.LICENSE_KEY = config.google_key
        offset = 0
        estimatedTotalResultsCount = None
        while not estimatedTotalResultsCount \
              or offset < estimatedTotalResultsCount:
            while (True):
                # Google often yields 502 errors.
                try:
                    pywikibot.output(u'Querying Google, offset %i' % offset)
                    data = google.doGoogleSearch(query, start = offset, filter = False)
                    break
                except KeyboardInterrupt:
                    raise
                except:
                    # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
                    # can happen here, depending on the module used. It's not easy
                    # to catch this properly because pygoogle decides which one of
                    # the soap modules to use.
                    pywikibot.output(u"An error occurred. Retrying in 10 seconds...")
                    time.sleep(10)
                    continue

            for result in data.results:
                #print 'DBG: ', result.URL
                yield result.URL
            # give an estimate of pages to work on, but only once.
            if not estimatedTotalResultsCount:
                pywikibot.output(u'Estimated total result count: %i pages.' % data.meta.estimatedTotalResultsCount)
            estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
            #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
            offset += 10
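
queryViaSoapApi() is a generator that pages through results ten at a time, so consumption can stop early without issuing further queries. A sketch, where bot is a hypothetical instance of the surrounding class:

import itertools

# stop after 25 URLs; no further Google pages are requested past that point
for url in itertools.islice(bot.queryViaSoapApi(u'some query'), 25):
    print url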
Example #9
    def queryViaSoapApi(self, query):
        import google
        google.LICENSE_KEY = config.google_key
        offset = 0
        estimatedTotalResultsCount = None
        while not estimatedTotalResultsCount \
              or offset < estimatedTotalResultsCount:
            while (True):
                # Google often yields 502 errors.
                try:
                    pywikibot.output(u'Querying Google, offset %i' % offset)
                    data = google.doGoogleSearch(query, start = offset, filter = False)
                    break
                except KeyboardInterrupt:
                    raise
                except:
                    # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
                    # can happen here, depending on the module used. It's not easy
                    # to catch this properly because pygoogle decides which one of
                    # the soap modules to use.
                    pywikibot.output(u"An error occurred. Retrying in 10 seconds...")
                    time.sleep(10)
                    continue

            for result in data.results:
                #print 'DBG: ', result.URL
                yield result.URL
            # give an estimate of pages to work on, but only once.
            if not estimatedTotalResultsCount:
                pywikibot.output(u'Estimated total result count: %i pages.' % data.meta.estimatedTotalResultsCount)
            estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
            #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
            offset += 10
Example #10
 def handleCommand(self, bot, nick, channel, argument=None):
     if argument is None:
         return
     try:
         data = google.doGoogleSearch('rfc: ' + argument)
     except:
         bot.msg(channel, 'Exception in google.doGoogleSearch()')
         return  # data would be undefined below
     if data is not None and data.results:
         bot.msg(channel, data.results[0].URL)
Example #11
 def testMetaLong(self):
     """--meta should return meta information"""
     google.main(["--meta", "-s", self.q])
     commandLineAnswer = self.lastOutput()
     commandLineAnswer = commandLineAnswer[:commandLineAnswer.index('searchTime')]
     google._output(google.doGoogleSearch(self.q), self.metaparams)
     realAnswer = self.lastOutput()
     realAnswer = realAnswer[:realAnswer.index('searchTime')]
     self.assertEqual(commandLineAnswer, realAnswer)
Example #12
def getSearchResultIndex(q):
    d = google.doGoogleSearch(q)
    c = 1
    index = None
    for each in d.results:
        if each.URL.lower().startswith('http://www.peterbe.com'):
            index = c
            break
        c += 1
    return index
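
Unlike Example #1, this helper only inspects the first page of ten results. A usage sketch with a made-up query:

index = getSearchResultIndex('python blog')
if index is None:
    print 'www.peterbe.com is not on the first results page'
else:
    print 'found at position %s' % index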
Example #13
 def testMetaLong(self):
     """--meta should return meta information"""
     google.main(["--meta", "-s", self.q])
     commandLineAnswer = self.lastOutput()
     commandLineAnswer = commandLineAnswer[:commandLineAnswer.
                                           index('searchTime')]
     google._output(google.doGoogleSearch(self.q), self.metaparams)
     realAnswer = self.lastOutput()
     realAnswer = realAnswer[:realAnswer.index('searchTime')]
     self.assertEqual(commandLineAnswer, realAnswer)
Example #14
def webtrawl():
  alltheweb_prefix = "http://www.alltheweb.com/search?cat=web&o=0&_sb_lang=any&q=link:"
  yahoo_prefix = "http://search.yahoo.com/search?p=link:"
  start = strftime("%Y-%m-%d %H:%M:%S", localtime())
  
  alltheweb_total = 0
  yahoo_total = 0
  google_total = 0
  for url in urls:
    try:
      #You can't use any other search terms when you use "link:" in Google.
      google_results = google.doGoogleSearch("link:"+url)
      google_count = google_results.meta.estimatedTotalResultsCount
      google_total += int(google_count)
    except:
      google_count = '!'
      
    goog=etree.SubElement(gmain,"Search")
    goog.set("URL",url)
    goog.text=str(google_count)
    goog.tail="\n"

    for term in ['']:
      print term
      print url
      start_time = strftime("%Y-%m-%d %H:%M:%S", localtime())
      try:
        alltheweb_results = urllib.urlopen(alltheweb_prefix+url+"+"+term).read()
        alltheweb_count = re.search('<span class="ofSoMany">(.+?)</span>',alltheweb_results).group(1)
        alltheweb_count = string.replace(alltheweb_count,',','')
        alltheweb_total += int(alltheweb_count)
      except:
        alltheweb_count = '!'
      atw=etree.SubElement(amain,"Search")
      atw.set("URL",url)
      atw.set("Term",term)
      atw.text=alltheweb_count
      atw.tail="\n"

      try:
        yahoo_results = urllib.urlopen(yahoo_prefix+url+"+"+term).read()
        yahoo_count = re.search('of about (\S+)',yahoo_results).group(1)
        yahoo_count = string.replace(yahoo_count,',','')
        yahoo_total += int(yahoo_count)
      except:
        yahoo_count = '!'

      yah=etree.SubElement(ymain,"Search")
      yah.set("URL",url)
      yah.set("Term",term)
      yah.text=yahoo_count
      yah.tail="\n"
      
      print start_time+"\t"+url+"\n"+alltheweb_count+"\n"+repr(google_count)+"\n"+yahoo_count+"\n"
Example #15
 def _doGoogleSearch(self, query, start = 0, maxResults = 10, filter = 1, restrict = '', safeSearch = 0, language = '', inputencoding = 'UTF-8', outputencoding = 'UTF-8', http_proxy=None, license_key=''):
     #doGoogleSearch
     return self._formatGoogleData(google.doGoogleSearch(query,
                                                         start,
                                                         maxResults,
                                                         filter,
                                                         restrict,
                                                         safeSearch,
                                                         language,
                                                         inputencoding,
                                                         outputencoding,
                                                         license_key,
                                                         http_proxy))
Example #16
def getGoogleTotalResultsCount(s):
    google.setLicense('...') # must get your own key!
    ustr = unicode(s,'shiftjis')
    flg = True
    while flg:
        try:
            data = google.doGoogleSearch(ustr)
        except:
            print sys.exc_info()
            time.sleep(5)
        else:
            flg = False

    return data.meta.estimatedTotalResultsCount
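
The retry loop above spins forever if the API keeps failing. A bounded variant is a small change; a sketch, with the attempt count as an assumption:

import sys, time
import google

def getCountWithBoundedRetry(ustr, attempts=5):
    for attempt in range(attempts):
        try:
            return google.doGoogleSearch(ustr).meta.estimatedTotalResultsCount
        except:
            print sys.exc_info()
            time.sleep(5)
    raise RuntimeError('Google query kept failing')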
Example #17
def getGoogleTotalResultsCount(s):
    keys = open("./googlelicense.txt","r").read().rstrip("\n").split("\n")
    google.setLicense(random.choice(keys)) # must get your own key!
    ustr = unicode(s,'shiftjis')
    fsuccess = False
    while not fsuccess:
        try:
            data = google.doGoogleSearch(ustr)
        except:
            print sys.exc_info()
            time.sleep(5)
        else:
            fsuccess = True

    return data.meta.estimatedTotalResultsCount
Example #18
def calculateIndex(term, domain_name, start=0, checks=0):
    try:
        d = google.doGoogleSearch(term, start=start)
    except UnicodeDecodeError:
        term = unicode(term, 'iso-8859-1')
        d = google.doGoogleSearch(term, start=start)

    msg = "term:%s (start:%s, checks:%s)" % (term, start, checks)
    #LOG("calculateIndex()", INFO, msg)
    checks += 1
    c = 1
    index = None
    domain_name = domain_name.lower()
    if not domain_name.startswith('http://'):
        domain_name = 'http://%s' % domain_name
    for each in d.results:
        url = each.URL.lower()
        if url.startswith(domain_name):
            return c + start, checks
        c += 1
    if start < GIVE_UP_LIMIT:
        return calculateIndex(term, domain_name, start + 10, checks)
    else:
        return None, checks
Example #19
def google_query(query, searchlimit, destination):
    try:
        data = google.doGoogleSearch(query, start=0, maxResults=searchlimit,
                                     filter=1, restrict='',
                                     safeSearch=option["safesearch"])
        stripper = __Stripper()
        for item in data.results:
            item.title = stripper.strip(item.title)
            item.snippet = stripper.strip(item.snippet)
            if option["resultsintab"]:
                destination.prnt(color["red"] + item.title + " " + color["black"] + item.snippet + " " + color["blue"] + item.URL)
            else:
                destination.command("say " + color["red"] + item.title + " " + color["black"] + item.snippet + " " + color["blue"] + item.URL)
        
    except Exception, args:
        print color["red"], Exception, args
        return
Example #20
 def handleCommand(self, bot, nick, channel, argument=None):
     if argument is None:
         return
     if argument.find(' ') != -1:
         argument = '"' + argument + '"'
     try:
         data = google.doGoogleSearch(argument)
     except:
         bot.msg(channel, 'Exception in google.doGoogleSearch()')
         return
     links = ''
     if data is not None:
         for i in range(5):
             if len(data.results) > i:
                 links += data.results[i].URL + ' | '
         bot.msg(channel, links[:-3])
Example #21
 def doGoogleSearch(self,
                    query,
                    start=0,
                    maxResults=10,
                    filter=1,
                    restrict='',
                    safeSearch=0,
                    language='',
                    inputencoding='UTF-8',
                    outputencoding='UTF-8',
                    http_proxy=None):
     #doGoogleSearch
     google.setLicense(self.license_key)
     l_data = google.doGoogleSearch(query, start, maxResults, filter,
                                    restrict, safeSearch, language,
                                    inputencoding, outputencoding,
                                    self.license_key, http_proxy)
     l_meta = {
         'documentFiltering': l_data.meta.documentFiltering,
         'searchComments': l_data.meta.searchComments,
         'estimatedTotalResultsCount':
         l_data.meta.estimatedTotalResultsCount,
         'estimateIsExact': l_data.meta.estimateIsExact,
         'searchQuery': l_data.meta.searchQuery,
         'startIndex': l_data.meta.startIndex,
         'endIndex': l_data.meta.endIndex,
         'searchTips': l_data.meta.searchTips,
         'directoryCategories': l_data.meta.directoryCategories,
         'searchTime': l_data.meta.searchTime,
     }
     l_result = []
     for r in l_data.results:
         l_result.append({
             'URL': r.URL,
             'title': r.title,
             'snippet': r.snippet,
             'cachedSize': r.cachedSize,
             'relatedInformationPresent': r.relatedInformationPresent,
             'hostName': r.hostName,
             'directoryCategory': r.directoryCategory,
             'directoryTitle': r.directoryTitle,
             'summary': r.summary,
         })
     return (l_meta, l_result)
Example #22
def google2():
  terms.remove("")
  for term in terms:
    for g in grest:
      start_time = strftime("%Y-%m-%d %H:%M:%S", localtime())
      try:
        results=google.doGoogleSearch(term, restrict=g)
        count=results.meta.estimatedTotalResultsCount
      except:
        count='!'
      print start_time+"\t"+term+"\t"+g+"\n"+repr(count)+"\n"
      goog=etree.SubElement(googlecc,"Search")
      goog.set("Term",term)
      goog.set("Rest",g)
      goog.text=str(count)
      goog.tail="\n"

  terms.append("")
  terms.sort()
Example #23
def google_search(query):
    try:
        data = google.doGoogleSearch(query)
    except SOAP.HTTPError:
        return "\r\nGoogle API Error."
    except SOAP.faultType:
        return "\r\nInvalid Google Key. Maybe still default??\r\nTake a look on modules/googlekey.txt"
    try:
        first = data.results[0]
        url = first.URL
        title = google_remove_html(first.title)
        if first.summary:
            summary = google_remove_html(first.summary)
        else:
            summary = google_remove_html(first.snippet)
        searchtime = str(round(data.meta.searchTime, 3))
        total = str(data.meta.estimatedTotalResultsCount)
        return url + " - " + title + " - " + summary + " (" + searchtime + "sec) (" + total + " sites)"
    except:
        return "No Results"
Example #24
def google_search(query):
    try:
        data = google.doGoogleSearch(query)
    except SOAP.HTTPError:
        return '\r\nGoogle API Error.'
    except SOAP.faultType:
        return '\r\nInvalid Google Key. Maybe still default??\r\nTake a look on modules/googlekey.txt'
    try:
        first = data.results[0]
        url = first.URL
        title = google_remove_html(first.title)
        if first.summary:
            summary = google_remove_html(first.summary)
        else:
            summary = google_remove_html(first.snippet)
        searchtime = str(round(data.meta.searchTime, 3))
        total = str(data.meta.estimatedTotalResultsCount)
        return url + ' - ' + title + ' - ' + summary + ' (' + searchtime + 'sec) (' + total + ' sites)'
    except:
        return 'No Results'
Example #25
def go_get_googles(filetype, results_desired=10):
    search_string = "e filetype:%s" % (filetype)
    if results_desired <= 10:
        batches = 1 #the google api only supports retrieving
                    #10 results per search so we have to batch 
                    #the requests
    if results_desired > 10:
        if ((results_desired % 10) != 0): #if there is a remainder
            batches = (results_desired / 10)+1 #then round up
        else:
            batches = (results_desired / 10)
    urls = []
    for inc in range(0, batches):
        googles = google.doGoogleSearch(search_string, (inc*10), 10)
        rl = len(googles.results)
        for x in range(0,len(googles.results)):
           urls.append(uriobj(googles.results[x].URL))
    #pt(urls)
    print "Doing:", batches, "batches for", len(urls), "files found."
    return urls #returns a list of uriobj's
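
The batch arithmetic above rounds up with an explicit remainder check (Python 2 integer division floors). The same count can be computed as a ceiling division; a sketch:

def batch_count(results_desired, per_batch=10):
    # adding per_batch - 1 before flooring rounds up, matching the
    # remainder check in the example
    return (results_desired + per_batch - 1) // per_batch

assert batch_count(10) == 1
assert batch_count(11) == 2
assert batch_count(25) == 3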
Example #26
def go_get_googles(filetype, results_desired=10):
    search_string = "e filetype:%s" % (filetype)
    if results_desired <= 10:
        batches = 1  #the google api only supports retrieving
        #10 results per search so we have to batch
        #the requests
    if results_desired > 10:
        if ((results_desired % 10) != 0):  #if there is a remainder
            batches = (results_desired / 10) + 1  #then round up
        else:
            batches = (results_desired / 10)
    urls = []
    for inc in range(0, batches):
        googles = google.doGoogleSearch(search_string, (inc * 10), 10)
        rl = len(googles.results)
        for x in range(0, len(googles.results)):
            urls.append(uriobj(googles.results[x].URL))
    #pt(urls)
    print "Doing:", batches, "batches for", len(urls), "files found."
    return urls  #returns a list of uriobj's
Example #27
    def soap(self, engine, query, url, numresults=10):
        print "  %s query..." % engine.capitalize()
        search_request_retry = config.copyright_connection_tries
        query_success = False

        while search_request_retry:
            try:
                if engine == 'google':
                    import google
                    google.LICENSE_KEY = config.google_key
                    data = google.doGoogleSearch(
                        '%s "%s"' % (no_result_with_those_words, query))
                    for entry in data.results:
                        self.add_in_urllist(url, entry.URL, 'google',
                                            entry.cachedSize)

                    self.num_google_queries += 1

                elif engine == 'yahoo':
                    import yahoo.search.web
                    data = yahoo.search.web.WebSearch(
                        config.yahoo_appid,
                        query='"%s" %s' %
                        (query.encode('utf_8'), no_result_with_those_words),
                        results=numresults)
                    for entry in data.parse_results():
                        cacheurl = None
                        if entry.Cache:
                            cacheurl = entry.Cache.Url
                        self.add_in_urllist(url, entry.Url, 'yahoo', cacheurl)

                    self.num_yahoo_queries += 1

                elif engine == 'msn':
                    #max_query_len = 150?
                    from SOAPpy import WSDL

                    try:
                        server = WSDL.Proxy(
                            'http://soap.search.msn.com/webservices.asmx?wsdl')
                    except Exception, err:
                        error("Live Search Error: %s" % err)
                        raise

                    params = {
                        'AppID': config.msn_appid,
                        'Query':
                        '%s "%s"' % (no_result_with_those_words, query),
                        'CultureInfo': region_code,
                        'SafeSearch': 'Off',
                        'Requests': {
                            'SourceRequest': {
                                'Source': 'Web',
                                'Offset': 0,
                                'Count': 10,
                                'ResultFields': 'All',
                            }
                        }
                    }

                    results = ''

                    server_results = server.Search(Request=params)
                    if server_results.Responses[0].Results:
                        results = server_results.Responses[0].Results[0]
                    if results:
                        # list or instance?
                        if type(results) == list:
                            for entry in results:
                                cacheurl = None
                                if hasattr(entry, 'CacheUrl'):
                                    cacheurl = entry.CacheUrl
                                self.add_in_urllist(url, entry.Url, 'msn',
                                                    cacheurl)
                        else:
                            cacheurl = None
                            if hasattr(results, 'CacheUrl'):
                                cacheurl = results.CacheUrl
                            self.add_in_urllist(url, results.Url, 'msn',
                                                cacheurl)

                    self.num_msn_queries += 1

                search_request_retry = 0
                query_success = True
            except KeyboardInterrupt:
                raise
Example #28
 def search(args):
     return google.doGoogleSearch(*args).results
Example #29
 def testSearchDefault(self):
     """no options + search phrase should search"""
     google.main([self.q])
     commandLineAnswer = self.lastOutput()
     google._output(google.doGoogleSearch(self.q), self.searchparams)
     self.assertEqual(commandLineAnswer, self.lastOutput())
Example #30
 def testSearchLong(self):
     """--search should search"""
     google.main(["--search", self.q])
     commandLineAnswer = self.lastOutput()
     google._output(google.doGoogleSearch(self.q), self.searchparams)
     self.assertEqual(commandLineAnswer, self.lastOutput())
Example #31
            word2, index = self.parseWord(argument_list, index)
        except self.WordParseError, arg:
            raise arg
        return word1, word2

    def handleCommand(self, bot, nick, channel, argument=None):
        if argument is None: 
            bot.msg(channel, 'Syntax Fehler.')
            return
        try:
            word1, word2 = self.parseWords(argument)
        except self.WordParseError, arg:
            bot.msg(channel, arg)
            return
        try:
            word1_data = google.doGoogleSearch(word1)
            word2_data = google.doGoogleSearch(word2)
        except:
            bot.msg(channel, 'Exception in google.doGoogleSearch()')
            return
        
        if word1_data.meta.estimatedTotalResultsCount > word2_data.meta.estimatedTotalResultsCount:
            bot.msg(channel, word1 + ' hat gewonnen.')
        elif word2_data.meta.estimatedTotalResultsCount > word1_data.meta.estimatedTotalResultsCount:
            bot.msg(channel, word2 + ' hat gewonnen.')
        else:
            bot.msg(channel, 'Unentschieden.')


class RFCPlugin(Plugin):
    command = 'RFC'
Example #32
# Create the query 

termkeyword = "inurl:"
query = termkeyword + domain

#Start the query 

potentials = []

for i in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]:

	# Perform the query ten times, taking 10 results each time
	# and putting them into our potentials list

	data = google.doGoogleSearch(query, start=i)

	for result in data.results:
		potentials.append(result.URL)
	
# Initialize a second list to hold unique, cleaned entries

refined = []

# Do the cleaning work and populate the new list

for i in potentials: 
	i = i.replace('http://','') 
	i = i.replace('https://','')
	keep = i.split('/') 
	
Example #33
    def soap(self, engine, query, url, numresults = 10):
        print "  %s query..." % engine.capitalize()
        search_request_retry = config.copyright_connection_tries
        query_success = False

        while search_request_retry:
            try:
                if engine == 'google':
                    import google
                    google.LICENSE_KEY = config.google_key
                    data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
                    for entry in data.results:
                        self.add_in_urllist(url, entry.URL, 'google', entry.cachedSize)

                    self.num_google_queries += 1

                elif engine == 'yahoo':
                    import yahoo.search.web
                    data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
                                                      query.encode('utf_8'),
                                                      no_result_with_those_words
                                                     ), results = numresults)
                    for entry in data.parse_results():
                        cacheurl = None
                        if entry.Cache:
                            cacheurl = entry.Cache.Url
                        self.add_in_urllist(url, entry.Url, 'yahoo', cacheurl)

                    self.num_yahoo_queries += 1

                elif engine == 'msn':
                    #max_query_len = 150?
                    from SOAPpy import WSDL

                    try:
                        server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
                    except Exception, err:
                        error("Live Search Error: %s" % err)
                        raise

                    params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
                             'CultureInfo': region_code, 'SafeSearch': 'Off', 'Requests': {
                             'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}

                    results = ''

                    server_results = server.Search(Request = params)
                    if server_results.Responses[0].Results:
                        results = server_results.Responses[0].Results[0]
                    if results:
                        # list or instance?
                        if type(results) == list:
                            for entry in results:
                                cacheurl = None
                                if hasattr(entry, 'CacheUrl'):
                                    cacheurl = entry.CacheUrl
                                self.add_in_urllist(url, entry.Url, 'msn', cacheurl)
                        else:
                            cacheurl = None
                            if hasattr(results, 'CacheUrl'):
                                cacheurl = results.CacheUrl
                            self.add_in_urllist(url, results.Url, 'msn', cacheurl)

                    self.num_msn_queries += 1

                search_request_retry = 0
                query_success = True
            except KeyboardInterrupt:
                raise
Example #34
# Create the query

termkeyword = "inurl:"
query = termkeyword + domain

#Start the query

potentials = []

for i in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]:

    # Perform the query ten times, taking 10 results each time
    # and putting them into our potentials list

    data = google.doGoogleSearch(query, start=i)

    for result in data.results:
        potentials.append(result.URL)

# Initialize a second list to hold unique, cleaned entries

refined = []

# Do the cleaning work and populate the new list

for i in potentials:
    i = i.replace('http://', '')
    i = i.replace('https://', '')
    keep = i.split('/')
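
The cleanup loop is cut off here. A sketch of how it might finish, following the comments above; the host-only split and the dedup test are assumptions:

refined = []
for i in potentials:
    i = i.replace('http://', '').replace('https://', '')
    host = i.split('/')[0]   # keep just the host portion
    if host not in refined:  # hold unique, cleaned entries only
        refined.append(host)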
Example #35
#Failed

import google

# to search
query = "Geeksforgeeks"

# doGoogleSearch() returns a SearchReturnValue object, not an iterable,
# so this loop fails; the hits live in the object's .results list
# (the commented signature below is from a different "google" module)
for j in google.doGoogleSearch(query, 0, 10, 1):
    #(query, tld="co.in", num=10, stop=1, pause=2):
    print(j)
Example #36
 def _search_serie_api(self, searchterms):
     """Search for a series using the Google SOAP API"""
     
     d = google.doGoogleSearch("site:epguides.com %s" % searchterms)
     return d.results[0].URL
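
The site: operator restricts the query to epguides.com, so the first hit is normally the show's episode guide. A usage sketch; the class name and the series are made up:

finder = EpisodeFinder()  # hypothetical instance of the surrounding class
print finder._search_serie_api('Buffy the Vampire Slayer')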
Example #37
    def createSemanticWeb(self, term = '', language = 'english', languages = [], googleLicenseKey = ''):
        import helper, analyser, re, google
        from operator import itemgetter
    
        # initialise keyword dictionary
        keywords = {}
    
        # initialise query string
        query = term + ' site:en.wikipedia.org'
    
        # initialise error string
        error = 0
        errorMessage = ''
    
        # Google licence key
        google.LICENSE_KEY = googleLicenseKey
    
        # query Google
        try:
            data = google.doGoogleSearch(query)
            wikiPage = data.results[0]
        except:
            error = 1
            errorMessage = 'Sorry, an error occurred during this request, please try again'
            # wikiPage is undefined when the query fails, so return early
            return dict(error = error, errorMessage = errorMessage, debug = 1, term = term)

        # get corpus
        corpusHelper = helper.CorpusHelper()
        corpusInfo = corpusHelper.getCorpus(wikiPage.URL)
        error = corpusInfo['error']
        errorMessage = corpusInfo['errorMessage']
        corpus = corpusInfo['corpus']
        charset = corpusInfo['charset']

        # if retrieval was successful
        if error != 1:
            # tokenizer
            tokenizer = analyser.Tokenizer()

            # initialise corpus helper
            corpusHelper = helper.CorpusHelper()
            
            # get corpus information
            corpusInfo = corpusHelper.getLinkedTerms(tokenizer, corpus, language, languages)
            tokenizedCorpus = corpusInfo['tokenizedCorpus']
            linkedTerms = corpusInfo['linkedTerms']
    
            # calculate some average values
            tokenCount = len(tokenizedCorpus)
            
            # word structure
            wordStructure = analyser.WordStructure()
    
            # text structure
            textStructure = analyser.TextStructure()
    
            # get N-grams
            ngrams = textStructure.getNGrams(tokenizedCorpus, tokenCount)
            mostFrequentUnigrams = ngrams['mostFrequentUnigrams']
            mostFrequentBigrams = ngrams['mostFrequentBigrams']
            mostFrequentTrigrams = ngrams['mostFrequentTrigrams']
    
            # add unigrams to keyword dictionary
            for ngram in mostFrequentUnigrams:
                keywords[ngram] = mostFrequentUnigrams[ngram]
    
            # add bigrams to keyword dictionary
            for ngram in mostFrequentBigrams:
                keywords[ngram] = mostFrequentBigrams[ngram]
    
            # add trigrams to keyword dictionary
            for ngram in mostFrequentTrigrams:
                keywords[ngram] = mostFrequentTrigrams[ngram]
    
            # build return dictionary
            returnDict = dict(error = error,
                              errorMessage = errorMessage,
                              charset = charset,
                              tokenCount = tokenCount,
                              keywords = sorted(keywords.iteritems(), key = itemgetter(1), reverse = True),
                              debug = 0,
                              wikiPage = wikiPage,
                              term = term,
                              linkedTerms = linkedTerms)
    
        # if URL retrieval error
        else:
            # build return dictionary
            returnDict = dict(error = error,
                              errorMessage = errorMessage,
                              fallback = fallback,
                              debug = 1,
                              term = term)
    
        # return values
        return returnDict
Example #38
    def createSemanticWeb(self,
                          term='',
                          language='english',
                          languages=[],
                          googleLicenseKey=''):
        import helper, analyser, re, google
        from operator import itemgetter

        # initialise keyword dictionary
        keywords = {}

        # initialise query string
        query = term + ' site:en.wikipedia.org'

        # initialise error string
        error = 0
        errorMessage = ''

        # Google licence key
        google.LICENSE_KEY = googleLicenseKey

        # query Google
        try:
            data = google.doGoogleSearch(query)
            wikiPage = data.results[0]
        except:
            error = 1
            errorMessage = 'Sorry, an error occurred during this request, please try again'
            # wikiPage is undefined when the query fails, so return early
            return dict(error=error, errorMessage=errorMessage, debug=1, term=term)

        # get corpus
        corpusHelper = helper.CorpusHelper()
        corpusInfo = corpusHelper.getCorpus(wikiPage.URL)
        error = corpusInfo['error']
        errorMessage = corpusInfo['errorMessage']
        corpus = corpusInfo['corpus']
        charset = corpusInfo['charset']

        # if retrieval was successful
        if error != 1:
            # tokenizer
            tokenizer = analyser.Tokenizer()

            # initialise corpus helper
            corpusHelper = helper.CorpusHelper()

            # get corpus information
            corpusInfo = corpusHelper.getLinkedTerms(tokenizer, corpus,
                                                     language, languages)
            tokenizedCorpus = corpusInfo['tokenizedCorpus']
            linkedTerms = corpusInfo['linkedTerms']

            # calculate some average values
            tokenCount = len(tokenizedCorpus)

            # word structure
            wordStructure = analyser.WordStructure()

            # text structure
            textStructure = analyser.TextStructure()

            # get N-grams
            ngrams = textStructure.getNGrams(tokenizedCorpus, tokenCount)
            mostFrequentUnigrams = ngrams['mostFrequentUnigrams']
            mostFrequentBigrams = ngrams['mostFrequentBigrams']
            mostFrequentTrigrams = ngrams['mostFrequentTrigrams']

            # add unigrams to keyword dictionary
            for ngram in mostFrequentUnigrams:
                keywords[ngram] = mostFrequentUnigrams[ngram]

            # add bigrams to keyword dictionary
            for ngram in mostFrequentBigrams:
                keywords[ngram] = mostFrequentBigrams[ngram]

            # add trigrams to keyword dictionary
            for ngram in mostFrequentTrigrams:
                keywords[ngram] = mostFrequentTrigrams[ngram]

            # build return dictionary
            returnDict = dict(error=error,
                              errorMessage=errorMessage,
                              charset=charset,
                              tokenCount=tokenCount,
                              keywords=sorted(keywords.iteritems(),
                                              key=itemgetter(1),
                                              reverse=True),
                              debug=0,
                              wikiPage=wikiPage,
                              term=term,
                              linkedTerms=linkedTerms)

        # if URL retrieval error
        else:
            # build return dictionary
            returnDict = dict(error=error,
                              errorMessage=errorMessage,
                              fallback=fallback,
                              debug=1,
                              term=term)

        # return values
        return returnDict
Example #39
#!/usr/bin/python
# filename: google.py
# usage: python google.py <query>

import sys, string, codecs, re
import google

if sys.argv[1:]:
    query = sys.argv[1]
else:
    sys.exit('Usage: python google.py <query>')

google.LICENSE_KEY = 'insert key here'

data = google.doGoogleSearch(query)

sys.stdout = codecs.lookup('utf-8')[-1](sys.stdout)

for result in data.results:
    title = result.title
    URL = result.URL
    snippet = result.snippet

    regex = re.compile('<[^>]*>')
    title = regex.sub(r'', title)
    snippet = regex.sub(r'', snippet)
    
    print string.join( (title, URL, snippet), "\n"), "\n"
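
A quick check of the tag-stripping pattern used above:

import re
regex = re.compile('<[^>]*>')
print regex.sub(r'', '<b>Python</b> tutorial')  # prints: Python tutorial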

Example #40
 def testSearchDefault(self):
     """no options + search phrase should search"""
     google.main([self.q])
     commandLineAnswer = self.lastOutput()
     google._output(google.doGoogleSearch(self.q), self.searchparams)
     self.assertEqual(commandLineAnswer, self.lastOutput())
Example #41
 def testLuckyLong(self):
     """--lucky should return only first result"""
     google.main(["--lucky", "-s", self.q])
     commandLineAnswer = self.lastOutput()
     google._output(google.doGoogleSearch(self.q), self.luckyparams)
     self.assertEqual(commandLineAnswer, self.lastOutput())
Example #42
 def testSearchLong(self):
     """--search should search"""
     google.main(["--search", self.q])
     commandLineAnswer = self.lastOutput()
     google._output(google.doGoogleSearch(self.q), self.searchparams)
     self.assertEqual(commandLineAnswer, self.lastOutput())
Example #43
request = urllib2.Request(top_hindi_link)
response = urllib2.urlopen(request)
soup = BeautifulSoup(response, "html.parser")
type(soup)

for div in soup.findAll('div', {'class': 'info'}):
    for b in soup.findAll('b'):
        for a in b.findAll('a'):
            print a.text

links = []
titles = []

for div in soup.findAll('div', {'class': 'info'}):
    for b in div.findAll('b'):
        for a in b.findAll('a'):
            titles.append(a.text)
            links.append(a['href'])
            print a.text

print 'you collected ' + str(len(links)) + ' links.'
print
print 'you collected ' + str(len(titles)) + ' titles.'

str(links[0])

import google
g = google.doGoogleSearch('Titanic film wikipedia')
g.pages = 5
print '*Found %s results*' % (g.get_result_count())
g.get_urls()
Example #44
 def testReverseLong(self):
     """--reverse should reverse results"""
     google.main(["--reverse", "-s", self.q])
     commandLineAnswer = self.lastOutput()
     google._output(google.doGoogleSearch(self.q), self.reverseparams)
     self.assertEqual(commandLineAnswer, self.lastOutput())
Example #45
"""Relevant method: meta"""
import google
google.LICENSE_KEY = '...' # must get your own!
data = google.doGoogleSearch('python')
data.meta  # the relevant metadata lives on this attribute
Example #46
    def getSimilarDocuments(self, url = '', plainText = '', language = 'automatic', **kw):
        from core import analyser, helper
        import google

        # Google licence key
        google.LICENSE_KEY = self.googleLicenseKey

        # initialise error string
        error = 0
        errorMessage = ''

        # get corpus
        corpusHelper = helper.CorpusHelper()
        corpusInfo = corpusHelper.getCorpus(url, plainText)
        error = corpusInfo['error']
        errorMessage = corpusInfo['errorMessage']
        corpus = corpusInfo['corpus']
        charset = corpusInfo['charset']
        url = corpusInfo['url']

        # if retrieval was successful
        if error != 1:
            # tokenizer
            tokenizer = analyser.Tokenizer()

            # initialise corpus helper
            corpusHelper = helper.CorpusHelper()

            # get tokenized corpus
            corpusInfo = corpusHelper.getTokenizedCorpus(tokenizer, corpus, language, self.languages)
            tokenizedCorpus = corpusInfo['tokenizedCorpus']

            # get token count
            tokenCount = len(tokenizedCorpus)

            # analyse text structure
            textStructure = analyser.TextStructure()

            # get N-grams
            ngrams = textStructure.getNGrams(tokenizedCorpus, tokenCount)
            mostFrequentUnigrams = ngrams['mostFrequentUnigrams']
            mostFrequentBigrams = ngrams['mostFrequentBigrams']
            mostFrequentTrigrams = ngrams['mostFrequentTrigrams']
            mostFrequentBigramsWithStopWords = ngrams['mostFrequentBigramsWithStopWords']
            mostFrequentTrigramsWithStopWords = ngrams['mostFrequentTrigramsWithStopWords']

            # get keywords
            keywords = list(textStructure.getKeywords(mostFrequentUnigrams, mostFrequentBigrams, mostFrequentTrigrams, mostFrequentBigramsWithStopWords, mostFrequentTrigramsWithStopWords))

            # build string for Google query
            googleHelper = helper.GoogleHelper()
            query = googleHelper.getGoogleQuery(mostFrequentUnigrams)

            # query Google
            try:
                data = google.doGoogleSearch(query)
                similarDocuments = data.results
            except:
                if url != 'text':
                    url = '/getSimilarDocuments/?url=' + url
                else:
                    url = '/getSimilarDocuments/?plainText=' + plainText

                similarDocuments = []
                error = 1
                errorMessage = 'Sorry, an error occurred during this request, please try again'

            # build return dictionary
            returnDict = dict(error = error,
                              errorMessage = errorMessage,
                              languages = self.languages,
                              similarDocuments = similarDocuments,
                              debug = 0,
                              url = url)

        # if URL retrieval error
        else:
            # build return dictionary
            returnDict = dict(error = error,
                              errorMessage = errorMessage,
                              languages = self.languages,
                              debug = 1,
                              url = url)

        # return values
        return returnDict