def _get_results(self, position):
    """Send one Google AJAX web-search request and collect the result URLs.

    Composes a search query from self._query, requests the page of results
    starting at ``position``, and stores the normalized URLs in
    self._returnedUrls.

    Returns self._returnedUrls: the list of normalized URLs, empty when
    ``position`` is at/past self._max_results or on any failure.
    (Fix: the original only returned a value on the early-exit path and
    fell off the end — returning None — on every other path; it also left
    self._returnedUrls unset when the HTTP fetch returned nothing.)
    """
    self._returnedUrls = []
    if position >= self._max_results:
        return self._returnedUrls
    reader = ReadUrl()
    # build a google-friendly parameter string, construct the URL and
    # request header, pass along to opener routine
    params = urllib.urlencode({'q': self._query,
                               'v': '1.0',
                               'rsz': 'large',
                               'start': position})
    url = 'http://ajax.googleapis.com/ajax/services/search/web?%s' % (params)
    # the request header is a list of (key, value) tuples
    requestHeader = [("Referer", self._referer)]
    self._urlContents = reader.openUrl(url = url,
                                       socket_timeout = self._socket_timeout,
                                       header = requestHeader)
    # go through the catch and load the urls into a list, first making sure
    # we have *something* to look at
    if self._urlContents:
        try:
            # if we have urls, use simplejson to parse them out into a dict
            # keyed by 'responseData'
            self._doc = simplejson.loads(self._urlContents)
            # ...and add a cleaned-up version of each url (i.e., no funky
            # characters) to the list
            for element in self._doc['responseData']['results']:
                result = self._normalizeString(element['url'])
                if result:
                    self._returnedUrls.append(result)
            logging.debug("GoogleSearcher finished: found %s matches" % len(self._returnedUrls))
        # if we ran into any issues, just log them; the (possibly empty)
        # list is still returned below
        except Exception as e:
            logging.debug("GoogleSearcher had problem counting results %s" % e)
    return self._returnedUrls
def _get_results(self, position):
    """Send one Bing (Live Search) web-search request and collect result URLs.

    Composes a search query from self._query, requests
    self._results_per_query results starting at ``position``, and stores the
    normalized URLs in self._returnedUrls.

    Returns self._returnedUrls: the list of normalized URLs, empty when
    ``position`` is at/past self._max_results or on any failure.
    (Fix: the original fell off the end — returning None — on the success
    path, and in the isinstance-Exception branch built a log message that
    was never actually logged.)
    """
    self._returnedUrls = []
    # composes a search query and sends the associated http request
    if position >= self._max_results:
        return self._returnedUrls
    reader = ReadUrl()
    # build a friendly parameter string, construct the URL and request
    # header, pass along to opener routine
    params = urllib.urlencode({'Appid': self.key,
                               "query": self._query,
                               'sources': self._sources,
                               'Web.Count': self._results_per_query,
                               'Web.Offset': position})
    url = 'http://api.search.live.net/json.aspx?%s' % (params)
    # the request header is a list of (key, value) tuples and can be of any
    # length (or a null string if none)
    requestHeader = ''
    # open the url; if we get a string version of the return, convert it to a dict
    try:
        return_obj = reader.openUrl(url = url,
                                    socket_timeout = self._socket_timeout,
                                    header = requestHeader)
    except Exception:
        msg = "Bing Searcher couldn't get return object for url, but didn't catch the error: %s. " % url
        logging.debug(msg)
        return self._returnedUrls
    # if we have an exception (most likely a UrlReadError from the reader)
    # the connection's probably down.
    if isinstance(return_obj, Exception):
        msg = "Bing Searcher didn't get a return. The server is probably down."
        # fix: msg was previously assigned but never emitted
        logging.debug(msg)
        return self._returnedUrls
    # try to convert the return to a dict; if we can't, we're done
    try:
        self._urlContents = simplejson.loads(return_obj)
    except Exception:
        msg = "Bing Searcher couldn't convert the return to a dict; it's possibly malformed"
        logging.info(msg)
        return self._returnedUrls
    # make sure we have a SearchResponse item in the return object
    try:
        search_response = self._urlContents['SearchResponse']
    except Exception:
        msg = "Bing searcher failed to return a SearchResponse object"
        logging.debug(msg)
        return self._returnedUrls
    # if we have errors, log them and return the (empty) list
    try:
        if 'Errors' in search_response:
            errors_list = [elem['Message'] for elem in search_response['Errors']]
            error_text = ','.join(errors_list)
            logging.info("Bing Error: %s" % error_text)
            return self._returnedUrls
    except Exception:
        # no errors: so far, so good
        pass
    # go through the catch and load the urls into a list, using simplejson's
    # parsed dict
    try:
        for element in self._urlContents['SearchResponse']['Web']['Results']:
            result = self._normalizeString(element.get('Url'))
            if result:
                self._returnedUrls.append(result)
        logging.debug("BingSearcher finished: found %s matches" % len(self._returnedUrls))
    # if we ran into any unanticipated issues, just log them and return the
    # initial (empty) list
    except Exception as e:
        logging.debug("BingSearcher had problem parsing results %s" % e)
    return self._returnedUrls