Example #1

import logging
import urllib
import simplejson

def _get_results(self, position):
      #compose a Google search query and send the associated HTTP request
      self._returnedUrls = []
      if position >= self._max_results:
         return self._returnedUrls
      reader = ReadUrl()
      #build a Google-friendly parameter string, construct the URL and request header, and pass them to the opener routine
      params = urllib.urlencode({'q': self._query, 'v': '1.0', 'rsz': 'large', 'start': position})
      url = 'http://ajax.googleapis.com/ajax/services/search/web?%s' % (params)
      requestHeader = [("Referer", self._referer)]
      self._urlContents = reader.openUrl(url = url, socket_timeout = self._socket_timeout, header = requestHeader)

      #go through the return and load the urls into a list, first making sure we have *something* to look at
      if self._urlContents:
         try:
            #if we have urls, use simplejson to parse the return into a dict keyed by 'responseData'
            self._doc = simplejson.loads(self._urlContents)
            
            #...and add a cleaned-up version of each (i.e., no funky characters) to the list
            for element in self._doc['responseData']['results']:
               result = self._normalizeString(element['url'])
               if result:
                  self._returnedUrls.append(result)               
               
            logging.debug("GoogleSearcher finished: found %s matches" % len( self._returnedUrls))
         
         #if we ran into any issues, just log them; the return is the initial (empty) list
         except Exception as e:
            logging.debug("GoogleSearcher had a problem parsing results: %s" % e)

      return self._returnedUrls
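Neither example defines the ReadUrl opener or the _normalizeString cleaner it calls, so here is a minimal sketch of what they might look like, assuming urllib2 underneath and the call signatures visible in the examples; the bodies (and the return-the-error behavior of openUrl) are assumptions, not the original implementations.

import urllib2

class ReadUrl(object):
   #hedged sketch: the real ReadUrl isn't shown in these examples
   def openUrl(self, url, socket_timeout, header):
      try:
         request = urllib2.Request(url)
         #header is a list of (key, value) tuples, or '' / None for no headers
         for key, value in (header or []):
            request.add_header(key, value)
         return urllib2.urlopen(request, timeout = socket_timeout).read()
      except Exception as e:
         #Example #2 checks isinstance(return_obj, Exception), which suggests
         #the opener hands back the error object rather than raising it
         return e

def _normalizeString(self, raw):
   #hedged sketch of the searcher method: strip whitespace and drop non-ascii characters
   if not raw:
      return ''
   return raw.strip().encode('ascii', 'ignore')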
Example #2
def _get_results(self, position):
    self._returnedUrls = []
    #compose a Bing search query and send the associated HTTP request
    if position >= self._max_results:
       return self._returnedUrls
    reader = ReadUrl()
    
    #build a Bing-friendly parameter string, construct the URL and request header, and pass them to the opener routine
    params = urllib.urlencode({'Appid': self.key, 'query': self._query, 'sources': self._sources,
                               'Web.Count': self._results_per_query, 'Web.Offset': position})
    url = 'http://api.search.live.net/json.aspx?%s' % params
    #the request header is a list of (key, value) tuples of any length (or an empty string if none)
    requestHeader = ''
     
    #open the url; if we get a string version of the return, we'll convert it to a dict below
    try:
       return_obj = reader.openUrl(url = url, socket_timeout = self._socket_timeout, header = requestHeader)
    except Exception as e:
       msg = "Bing Searcher couldn't get a return object for url %s: %s" % (url, e)
       logging.debug(msg)
       return self._returnedUrls
    
    #if we got an exception back (most likely a UrlReadError from the reader), the connection's probably down
    if isinstance(return_obj, Exception):
       msg = "Bing Searcher didn't get a return.  The server is probably down."
       logging.debug(msg)
       return self._returnedUrls
    
    #try to convert the return to a dict; if we can't, we're done
    try:
       self._urlContents = simplejson.loads(return_obj)
    except ValueError:
       msg = "Bing Searcher couldn't convert the return to a dict; the JSON is possibly malformed"
       logging.info(msg)
       return self._returnedUrls
       
    #make sure we have a SearchResponse item in the return object
    try:
       search_response = self._urlContents['SearchResponse']
    except KeyError:
       msg = "Bing Searcher's return didn't include a SearchResponse object"
       logging.debug(msg)
       return self._returnedUrls
    
    #if the response reports errors, log them and return the initial (empty) list
    if 'Errors' in search_response:
       errors_list = [elem['Message'] for elem in search_response['Errors']]
       error_text = ','.join(errors_list)
       logging.info("Bing Error: %s" % error_text)
       return self._returnedUrls
       
    #go through the results and load the urls into the list
    try:
       for element in search_response['Web']['Results']:
          result = self._normalizeString(element.get('Url'))
          if result:
             self._returnedUrls.append(result)

       logging.debug("BingSearcher finished: found %s matches" % len(self._returnedUrls))
    
    #if we ran into any unanticipated issues, just log them and fall through to the return
    except Exception as e:
       logging.debug("BingSearcher had a problem parsing results: %s" % e)

    return self._returnedUrls
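Both methods fetch a single page of results and leave pagination to the caller. Here is a minimal sketch of a paging driver built on the attributes used above; the search name itself is hypothetical, and _results_per_query is only defined for the Bing searcher.

def search(self):
   #hedged sketch: page through results until the cap is hit or a page comes back empty
   all_urls = []
   position = 0
   while position < self._max_results:
      batch = self._get_results(position)
      if not batch:
         break
      all_urls.extend(batch)
      position += self._results_per_query
   return all_urls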