def get_new_access_token(client_id=FACEBOOK_APPLICATION_ID,
                         client_secret=FACEBOOK_SECRET_KEY,
                         access_token=FACEBOOK_ACCESS_KEY):
    '''
    Exchanges the given Facebook access token for a fresh one.

    @param client_id: the Facebook application id
    @param client_secret: the Facebook application secret
    @param access_token: the access token to renew
    @returns: the new access token, or the old one if it is still valid
    '''
    url = API_URL.format(client_id=client_id, client_secret=client_secret,
                         access_token=access_token)
    retrieve = Retrieve('fb')
    response = retrieve.open(url)
    result = response.read()
    new_access_token = access_token
    # items() instead of the Python2-only iteritems(); single-argument
    # print() calls work under both Python 2 and 3
    for key, param in urlparse.parse_qs(result).items():
        print('%s %s' % (key, param))
        if key == 'access_token':
            # parse_qs wraps values in lists; unwrap the single token
            if isinstance(param, list):
                param = param[0]
            if param == access_token:
                print('access token still the same')
            else:
                print('got new access_token %s' % param)
                new_access_token = param
    return new_access_token
class WikiPedia(object): """ returns an WikiPedia Object """ def __init__(self): self.r = Retrieve(WikiPedia.__name__) def getWikiPage(self, pageName, lang='en'): """ returns the given wikipedia page considering different spellings @param[in] pageName @param[in] language (determines which wikipedia to query) @returns the page's wikipedia text """ assert (len(lang) == 2) for pn in self._getPageNameAlterations(pageName): pageContent = self._retrievePage(pn, lang) if pageContent: return pageContent return None @staticmethod def _getPageNameAlterations(pageName): """ @returns a list of differnt names for the given page """ alt = [ pageName, ] if not ' ' in pageName: alt words = pageName.split(" ") alt.append( "%s %s" % (words[0].capitalize(), " ".join(map(str.lower, words[1:])))) return alt def _retrievePage(self, pageName, lang): """ retrieves the given Wiki page @param[in] pageName @param[in] language (determines which wikipedia to query) @returns the page's wikipedia text """ param = urlencode({ 'action': 'query', 'format': 'json', 'export': '', 'redirects': 'true', 'titles': pageName }) data = self.r.open(WIKIPEDIA_API_QUERY % lang, param).read() jsonData = eval(data)['query'] if '-1' in jsonData['pages']: return None xmlData = jsonData['export']['*'].replace("\/", "/") return parseString(xmlData).getElementsByTagName( 'text')[0].firstChild.data
def testRetrieval(self):
    ''' smoke test: fetch every url in TEST_URLS and read its body '''
    fetcher = Retrieve(self.__class__.__name__)
    for target_url in self.TEST_URLS:
        print(target_url)
        response = fetcher.open(target_url)
        response.read()
        response.close()
class WikiPedia(object): """ returns an WikiPedia Object """ def __init__(self): self.r = Retrieve( WikiPedia.__name__ ) def getWikiPage(self, pageName, lang='en'): """ returns the given wikipedia page considering different spellings @param[in] pageName @param[in] language (determines which wikipedia to query) @returns the page's wikipedia text """ assert( len(lang)==2 ) for pn in self._getPageNameAlterations( pageName ): pageContent = self._retrievePage( pn, lang ) if pageContent: return pageContent return None @staticmethod def _getPageNameAlterations(pageName): """ @returns a list of differnt names for the given page """ alt = [ pageName, ] if not ' ' in pageName: alt words = pageName.split(" ") alt.append( "%s %s" % (words[0].capitalize(), " ".join( map(str.lower, words[1:] ) )) ) return alt def _retrievePage(self, pageName, lang): """ retrieves the given Wiki page @param[in] pageName @param[in] language (determines which wikipedia to query) @returns the page's wikipedia text """ param = urlencode( {'action': 'query', 'format':'json', 'export':'', 'redirects':'true', 'titles':pageName }) data = self.r.open( WIKIPEDIA_API_QUERY % lang, param ).read() jsonData = eval( data )['query'] if '-1' in jsonData['pages']: return None xmlData = jsonData['export']['*'].replace("\/","/") return parseString( xmlData ).getElementsByTagName('text')[0].firstChild.data
class Yahoo(TagInfoService): """ interfaces with yahoo's search service * Search: Yahoo! BOSS (see http://developer.yahoo.com/search/boss) """ __slots__ = ('r', ) def __init__(self): self.r = Retrieve( Yahoo.__name__, sleep_time=0 ) def query(self, terms, count=0, queryParams={} ): """ returns search results for the given terms @param[in] terms ... a list of search terms @param[in] count ... number of results to return (0 if we are interested on the search meta data only). @param[in] queryParams ... a dictionary of query parameters to add to the request @returns the search results """ assert ( isinstance(terms, tuple) or isinstance(terms, list) ) queryParams.update( {'appid': YAHOO_APP_ID, 'count': count, 'format': 'json' } ) params = urlencode( queryParams ) url = YAHOO_SEARCH_URL % "%2B".join(map( quote, terms) ) +"?"+ params print url try: result = eval( self.r.open(url).read().replace("\\/", "/" )) return result['ysearchresponse'] except URLError: return "" @staticmethod def getSearchResults(query_result): """ returns a list of all search results returned by the given query result. @param[in] query_result Result of the query """ return [ YahooSearchResult(r) for r in query_result['resultset_web'] ] \ if 'resultset_web' in query_result else [] def getTagInfo(self, tag): """ @Override """ return int( self.query(tag)['totalhits'] )
class Yahoo(TagInfoService): """ interfaces with yahoo's search service * Search: Yahoo! BOSS (see http://developer.yahoo.com/search/boss) """ __slots__ = ('r', ) def __init__(self): self.r = Retrieve( Yahoo.__name__, sleep_time=0 ) def query(self, terms, count=0, queryParams={} ): """ returns search results for the given terms @param[in] terms ... a list of search terms @param[in] count ... number of results to return (0 if we are interested on the search meta data only). @param[in] queryParams ... a dictionary of query parameters to add to the request @returns the search results """ assert ( isinstance(terms, tuple) or isinstance(terms, list) ) queryParams.update( {'appid': YAHOO_APP_ID, 'count': count, 'format': 'json' } ) params = urlencode( queryParams ) url = YAHOO_SEARCH_URL % "%2B".join(map( quote, terms) ) +"?"+ params print(url) try: result = eval( self.r.open(url).read().replace("\\/", "/" )) return result['ysearchresponse'] except (timeout, URLError): return "" @staticmethod def getSearchResults(query_result): """ returns a list of all search results returned by the given query result. @param[in] query_result Result of the query """ return [ YahooSearchResult(r) for r in query_result['resultset_web'] ] \ if 'resultset_web' in query_result else [] def getTagInfo(self, tag): """ @Override """ return int( self.query(tag)['totalhits'] )
class YahooTermExtractor(object): """ interfaces with yahoo's search service * Term extraction: extract terms from yahoo search http://developer.yahoo.com/search/content/V1/termExtraction.html """ __slots__ = ('r', ) def __init__(self): self.r = Retrieve( YahooTermExtractor.__name__ ) def extractTerms(self, content): """ extract terms from yahoo search, see http://developer.yahoo.com/search/content/V1/termExtraction.html """ params = urlencode( {'appid': YAHOO_APP_ID, 'context': content, 'output': 'json' }) result = eval ( self.r.open(YAHOO_TERM_EXTRACTION_URI, params).read() ) return result['ResultSet']['Result']
def parse(url, last_modified=None): """ Parses the given RSS Feed an returns all articles and the content of the page referenced in the <link> tag. @param url: the url of the rss feed @param last_modified: a datetime object that specifies the last time the feed has been queried the last time (only newer entries are returned). """ feed = feedparser.parse(url, modified=last_modified) retrieve = Retrieve("rss", HTTP_FETCH_DELAY) result = [] for item in feed['items']: if datetime.fromtimestamp( mktime(item['updated_parsed'])) > last_modified: item['content'] = retrieve.open(item['link']).read() result.append(item) return result
def parse(url, last_modified=None): """ Parses the given RSS Feed an returns all articles and the content of the page referenced in the <link> tag. @param url: the url of the rss feed @param last_modified: a datetime object that specifies the last time the feed has been queried the last time (only newer entries are returned). """ feed = feedparser.parse(url, modified=last_modified) retrieve = Retrieve("rss", HTTP_FETCH_DELAY) result = [] for item in feed['items']: if datetime.fromtimestamp(mktime( item['updated_parsed'])) > last_modified: item['content'] = retrieve.open(item['link']).read() result.append(item) return result
class WikiPedia(object): """ returns a wikipedia article """ def __init__(self): self.r = Retrieve( WikiPedia.__name__ ) def getDescriptor(self, synonym, lang='en'): """ returns the descriptor for the given synonym in the diven language """ assert( len(lang)==2 ) try: result = self.getWikipediaSearchResults(synonym, lang) return result[0] except (HTTPError, IndexError): return None def getWikipediaSearchResults(self, term, lang): """ returns a list of wikipedia search results for the given term or None if nothing was found """ search_query = WIKIPEDIA_SEARCH_QUERY % (lang, quote(term) ) f=self.r.open(search_query) results = WikiPedia._parse_wikipedia_search_results( f.read() ) f.close() return results @staticmethod def _parse_wikipedia_search_results( text ): result = [] for line in text.split("\n"): # only consider lines containing search results if not "class='searchresult'" in line: continue (prefix, tmp) = line.split("title=\"", 1) (descriptor, suffix ) = tmp.split("\"", 1) result.append(descriptor) return result
class WikiPedia(object): """ returns a wikipedia article """ def __init__(self): self.r = Retrieve(WikiPedia.__name__) def getDescriptor(self, synonym, lang='en'): """ returns the descriptor for the given synonym in the diven language """ assert(len(lang) == 2) try: result = self.getWikipediaSearchResults(synonym, lang) return result[0] except (HTTPError, IndexError): return None def getWikipediaSearchResults(self, term, lang): """ returns a list of wikipedia search results for the given term or None if nothing was found """ search_query = WIKIPEDIA_SEARCH_QUERY % (lang, quote(term)) f = self.r.open(search_query) results = WikiPedia._parse_wikipedia_search_results(f.read()) f.close() return results @staticmethod def _parse_wikipedia_search_results(text): result = [] for line in text.split("\n"): # only consider lines containing search results if not "class='searchresult'" in line: continue (prefix, tmp) = line.split("title=\"", 1) (descriptor, suffix) = tmp.split("\"", 1) result.append(descriptor) return result
class WebOfTrust(object):
    """ Client for the Web of Trust (WOT) reputation API. """

    def __init__(self, api_key, service_url=SERVICE_URL):
        self.api_key = api_key
        self.service_url = service_url
        self.retrieve = Retrieve('eWRT.ws.wot')

    def get_reputation(self, hosts):
        """ queries the WOT service for the reputation of the given hosts
        :param hosts: a host name/url or a list thereof
        :returns: a dict mapping each host to its formatted reputation
        :raises Exception: if the service returns no result
        """
        query = {'hosts': self._encode_hosts(hosts), 'api_key': self.api_key}
        urlObj = self.retrieve.open(self.service_url % query)
        if not urlObj:
            raise Exception('got no result')
        return self._format_result(json.loads(urlObj.read()))

    @classmethod
    def _encode_hosts(cls, hosts):
        ''' encodes the given hosts in the WOT target format
        >>> WebOfTrust._encode_hosts(['http://wu.ac.at', 'https://wu.ac.at'])
        'wu.ac.at/'
        >>> WebOfTrust._encode_hosts(['wu.ac.at', 'https://modul.ac.at/'])
        'wu.ac.at/modul.ac.at/'
        '''
        if isinstance(hosts, string_types):
            hosts = [hosts]
        selected_hosts = []
        for host in hosts:
            if not host.startswith('http'):
                host = 'http://%s' % host
            netloc = '%s/' % quote(urlparse(host).netloc)
            if netloc not in selected_hosts:
                selected_hosts.append(netloc)
        assert len(hosts) <= MAX_HOSTS, 'too many hosts (max: %s)!' % MAX_HOSTS
        return ''.join(selected_hosts)

    @classmethod
    def _encode_url(cls, service_url, query):
        ''' encodes the url '''
        return service_url % query

    @classmethod
    def _format_result(cls, data):
        ''' Formats the result using MAPPING. The components for the
            reputation provide the reputation and confidence.
            See WOT Developer API for details
        '''
        result = {}
        # fix: items() instead of the Python2-only iteritems() -- the rest
        # of this class (string_types) already targets Python 2/3 compat
        for host, reputation in data.items():
            r = {}
            for attr_name, new_attr_name in MAPPING.items():
                if attr_name in reputation:
                    r[new_attr_name] = reputation[attr_name]
            r['wot_link'] = WOT_LINK % r['target']
            result[host] = r
        return result
class AmazonWS(object): """ This class provides low level amazon web service access """ def __init__(self, location='us', key=None): """ init """ assert (location in AMAZON_LOCATIONS) self.retrieve = Retrieve(self.__class__.__name__) self.wsBase = AMAZON_LOCATIONS[location] self.accessKey = key or AMAZON_ACCESS_KEY self.amazon_url = AmazonUrl() def generateWsUrl(self, arguments): """ generates a valid amazon webservice request url """ argList = ["%s&SubscriptionId=%s" % ( self.wsBase, self.accessKey)] + ["%s=%s" % (k, quote(v)) for k, v in arguments.items()] return "&".join(argList) def generateSignedWsUrl(self, **arguments): """ generates a valid amazon webservice request url """ #argList = [ "%s&SubscriptionId=%s" % (self.wsBase, self.accessKey) ] + [ "%s=%s" % (k,quote(v)) for k,v in arguments.items() ] # return "&".join(argList) return self.amazon_url.get_request_url(arguments) def query(self, arguments): """ retrieves a result from amazon webservice """ url = self.generateWsUrl(arguments) done = False while not done: try: f = self.retrieve.open(url) res = f.read() self._write_debug_data(res) f.close() done = True except ValueError: logging.warning( "Exception webservice query - waiting for %d seconds...\n" % ERROR_SLEEP_TIME) time.sleep(ERROR_SLEEP_TIME) return res @staticmethod def _write_debug_data(data): """ writes the given data to the debug file, if specified """ if not AMAZON_DEBUG_FILE: return d = open(AMAZON_DEBUG_FILE, "a") d.write(data) d.close() def searchItem(self, searchIndex='Books', **param): """ searches an item in the amazon product repository """ arguments = {'Operation': 'ItemSearch', 'SearchIndex': searchIndex, 'BrowseNode': '1000', 'Sort': 'salesrank', 'ResponseGroup': 'SalesRank,Small'} arguments.update(param) return self.query(arguments) def queryReview(self, itemId, **param): """ queries customers reviews to the selected Item """ arguments = {'Operation': 'ItemLookup', 'ResponseGroup': 'Reviews', 'ItemId': itemId} 
arguments.update(param) return self.query(arguments) def newReleases(self, **param): """ returns a list of asins of new releases """ arguments = {'Operation': 'BrowseNodeLookup', 'ResponseGroup': 'NewReleases', 'Marketplace': 'us'} arguments.update(param) return self.query(arguments) def itemAttributes(self, item_id, **param): """ returns all item attribues """ arguments = {'Operation': 'ItemLookup', 'ItemId': item_id, 'IdType': 'ASIN', 'ResponseGroup': 'ItemAttributes,SalesRank'} arguments.update(param) return self.query(arguments)
class WeblyzardDictionaries(object):
    """ Fetches weblyzard dictionary files from a remote server and caches
        them on the local file system. """

    def __init__(self, user, password, local_dir=LOCAL_DIR,
                 server_url=SERVER_URL, max_age_hours=MAX_AGE_HOURS):
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        # cutoff timestamp: files modified before this are considered stale
        self.max_file_age = datetime.now() - timedelta(hours=max_age_hours)
        self.local_dir = local_dir
        self.server_url = server_url
        self.retrieve = Retrieve(__file__)
        self.user = user
        self.password = password

    @staticmethod
    def is_online(server_url):
        ''' Checks, whether the given url is online.
        :param server_url: the url to check.
        :returns: True, if the dictionary server is online/reachable.
        '''
        hostname = urlparse.urlsplit(server_url).netloc
        try:
            gethostbyname(hostname)
            return True
        except gaierror:
            return False

    def get_dictionary(self, dictionary_uri):
        ''' tries to load the dictionary from the file-system. If the
        function cannot find the file or if the file is too old (see
        MAX_AGE_HOURS), the function will load the dictionary from
        the server.
        :param dictionary_uri: URI for the dictionary,
               e.g. people/de/titles/all.txt
        :returns: full file name of the dictionary
        '''
        if dictionary_uri.startswith('/'):
            dictionary_uri = dictionary_uri[1:]
        full_path = os.path.join(self.local_dir, dictionary_uri)
        # skip retrieval, if the server is not available
        # (fix: probe the server_url passed to the constructor instead of
        #  always probing the module-level SERVER_URL default)
        if not self.is_online(self.server_url):
            return full_path
        fetch_file = True
        if os.path.isfile(full_path):
            last_mod = datetime.fromtimestamp(os.path.getmtime(full_path))
            if last_mod < self.max_file_age:
                # local copy is stale -> only refetch if the server copy
                # is newer (fix: get_last_mod_date may return None when the
                # response has no Last-Modified header)
                last_mod_server = self.get_last_mod_date(dictionary_uri)
                if last_mod_server is not None and last_mod_server < last_mod:
                    fetch_file = False
            else:
                fetch_file = False
        if fetch_file:
            self.get_from_server(dictionary_uri, full_path)
        return full_path

    def get_last_mod_date(self, dictionary_uri):
        ''' Requests the URL with a HEAD request to retrieve the
        last_modified date of the file
        :param dictionary_uri: URI for the dictionary,
               e.g. people/de/titles/all.txt
        :returns: the server-side modification date, or None if the
                  response carries no Last-Modified header
        '''
        full_url = urlparse.urljoin(self.server_url, dictionary_uri)
        response = self.retrieve.open(full_url, user=self.user,
                                      pwd=self.password, accept_gzip=False,
                                      head_only=True)
        last_modified = response.headers.get('Last-Modified')
        if last_modified:
            return datetime.strptime(last_modified,
                                     '%a, %d %b %Y %H:%M:%S %Z')

    def get_from_server(self, dictionary_uri, target_path):
        ''' Fetches a dictionary from the server and stores it on the
        local FS.
        :param dictionary_uri: URI for the dictionary,
               e.g. people/de/titles/all.txt
        :param target_path: destination on local FS to store the file
        :returns: target_path if the file was saved
        '''
        full_url = urlparse.urljoin(self.server_url, dictionary_uri)
        response = self.retrieve.open(full_url, user=self.user,
                                      pwd=self.password)
        if response:
            target_directory = os.path.dirname(target_path)
            if not os.path.exists(target_directory):
                os.makedirs(target_directory)
            with open(target_path, 'w') as f:
                f.write(response.read())
            return target_path
class GooglePlus(object):
    ''' Client for the Google+ activities API. '''

    def __init__(self, api_key, api_url=API_URL):
        ''' Constructor
        :param api_key: the Google API key used to authenticate
        :param api_url: the API endpoint template
        '''
        # NOTE(review): GooglePlus does not inherit from WebDataSource --
        # confirm that calling its __init__ directly is intended.
        WebDataSource.__init__(self)
        self.api_key = api_key
        self.api_url = api_url
        self.retrieve = Retrieve('google-plus')

    def search(self, search_terms, max_results=DEFAULT_MAX_RESULTS):
        ''' searches Google+ for the given search_terms
        :param search_terms: search terms
        :type search_terms: list
        :param max_results: maximum number of result
        :type max_results: int
        :returns: generator with the result
        '''
        for search_term in search_terms:
            if isinstance(search_term, str):
                search_term = search_term.encode('utf-8')
            params = {'query': '"%s"' % search_term,
                      'orderBy': DEFAULT_ORDER_BY,
                      'maxResults': max_results}
            fetched = self.make_request(params, 'activities')
            for item in fetched['items']:
                try:
                    yield self.convert_item(item)
                except Exception as e:
                    # convert_item signals skipped items via exceptions;
                    # log and continue with the next one
                    logger.info('Error %s occured' % e)
                    continue

    def get_activity(self, activity_id):
        ''' returns the activity with the given ID
        :param activity_id: GooglePlus activity ID
        :type activity_id: string
        :returns: mapped result
        :rtype: dict
        '''
        item = self.make_request(path='activities/%s' % activity_id)
        return self.convert_item(item)

    def make_request(self, params=None, path='activities'):
        ''' executes the request to GooglePlus
        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus result
        :rtype: dict
        '''
        url = self.get_request_url(params, path)
        data = self.retrieve.open(url)
        return json.load(data)

    def get_request_url(self, params=None, path='activities'):
        ''' returns a correctly parsed request URL
        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus request URL
        :rtype: str

        Usage:
        >>> plus = GooglePlus('abcd')
        >>> plus.get_request_url()
        'https://www.googleapis.com/plus/v1/activities?key=abcd'
        '''
        params = params if params else {}
        if 'key' not in params:
            params['key'] = self.api_key
        # cap the result count at the API's documented maximum
        if 'maxResults' in params and params[
                'maxResults'] > DEFAULT_MAX_RESULTS:
            params['maxResults'] = DEFAULT_MAX_RESULTS
        return self.api_url.format(path=path, query=urlencode(params))

    @classmethod
    def convert_item(cls, item):
        ''' applies a mapping to convert the result to the required format
        :param item: GooglePlus Activity
        :type item: dict
        :rtype: dict
        :raises Exception: for activities that have to be skipped
                           (non-post verbs, empty content, bad attachments)
        '''
        last_modified = datetime.strptime(item['updated'],
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        # NOTE(review): 'published' is also derived from item['updated'] --
        # confirm whether item['published'] was intended here.
        published = datetime.strptime(item['updated'],
                                      '%Y-%m-%dT%H:%M:%S.%fZ')
        content = cls.convert_content(item['object']['content'])
        if not item['verb'] == 'post':
            raise Exception('Skipping activity of type "%s"' % item['verb'])
        if not len(content):
            logger.info('Skipping "%s" -> content is empty' % item['title'])
            raise Exception('content is empty')
        if 'attachments' in item['object']:
            for attachment in item['object']['attachments']:
                if attachment['objectType'] == 'article':
                    if 'content' not in attachment:
                        raise Exception('no content in attachment')
                    content = '%s\n"%s" (%s)' % (
                        content,
                        cls.convert_content(attachment['content']),
                        attachment['url'])
        activity = {
            'content': content,
            'title': item['actor']['displayName'],
            'url': item['url'],
            'last_modified': last_modified,
            'user_id': item['actor']['id'],
            'user_img_url': item['actor']['image']['url'],
            'screen_name': item['actor']['displayName'],
            'encoding': u'utf-8',
            'user_url': item['actor']['url'],
            'valid_from': published,
            'reshares': item['object']['resharers']['totalItems'],
            'plusoners': item['object']['plusoners']['totalItems'],
            'activity_id': item['id'],
        }
        # fix: the geocode has to be looked up in the *source* item -- the
        # freshly built activity dict can never contain this key, so the
        # original condition was always False
        if 'geocode' in item:
            activity['geocode'] = item['geocode']
        return activity
class AmazonWS(object): """ This class provides low level amazon web service access """ def __init__(self, location='us', key=None): """ init """ assert (location in AMAZON_LOCATIONS) self.retrieve = Retrieve(self.__class__.__name__) self.wsBase = AMAZON_LOCATIONS[location] self.accessKey = key or AMAZON_ACCESS_KEY self.amazon_url = AmazonUrl() def generateWsUrl(self, arguments): """ generates a valid amazon webservice request url """ argList = ["%s&SubscriptionId=%s" % (self.wsBase, self.accessKey)] + [ "%s=%s" % (k, quote(v)) for k, v in list(arguments.items()) ] return "&".join(argList) def generateSignedWsUrl(self, **arguments): """ generates a valid amazon webservice request url """ #argList = [ "%s&SubscriptionId=%s" % (self.wsBase, self.accessKey) ] + [ "%s=%s" % (k,quote(v)) for k,v in arguments.items() ] # return "&".join(argList) return self.amazon_url.get_request_url(arguments) def query(self, arguments): """ retrieves a result from amazon webservice """ url = self.generateWsUrl(arguments) done = False while not done: try: f = self.retrieve.open(url) res = f.read() self._write_debug_data(res) f.close() done = True except ValueError: logging.warning( "Exception webservice query - waiting for %d seconds...\n" % ERROR_SLEEP_TIME) time.sleep(ERROR_SLEEP_TIME) return res @staticmethod def _write_debug_data(data): """ writes the given data to the debug file, if specified """ if not AMAZON_DEBUG_FILE: return d = open(AMAZON_DEBUG_FILE, "a") d.write(data) d.close() def searchItem(self, searchIndex='Books', **param): """ searches an item in the amazon product repository """ arguments = { 'Operation': 'ItemSearch', 'SearchIndex': searchIndex, 'BrowseNode': '1000', 'Sort': 'salesrank', 'ResponseGroup': 'SalesRank,Small' } arguments.update(param) return self.query(arguments) def queryReview(self, itemId, **param): """ queries customers reviews to the selected Item """ arguments = { 'Operation': 'ItemLookup', 'ResponseGroup': 'Reviews', 'ItemId': itemId } 
arguments.update(param) return self.query(arguments) def newReleases(self, **param): """ returns a list of asins of new releases """ arguments = { 'Operation': 'BrowseNodeLookup', 'ResponseGroup': 'NewReleases', 'Marketplace': 'us' } arguments.update(param) return self.query(arguments) def itemAttributes(self, item_id, **param): """ returns all item attribues """ arguments = { 'Operation': 'ItemLookup', 'ItemId': item_id, 'IdType': 'ASIN', 'ResponseGroup': 'ItemAttributes,SalesRank' } arguments.update(param) return self.query(arguments)
class WebOfTrust(object):
    """ Client for the Web of Trust (WOT) reputation API. """

    def __init__(self, api_key, service_url=SERVICE_URL):
        self.api_key = api_key
        self.service_url = service_url
        self.retrieve = Retrieve('eWRT.ws.wot')

    def get_reputation(self, hosts):
        """ queries the WOT service for the reputation of the given hosts
        :param hosts: a host name/url or a list thereof
        :returns: a dict mapping each host to its formatted reputation
        :raises Exception: if the service returns no result
        """
        query = {'hosts': self._encode_hosts(hosts), 'api_key': self.api_key}
        urlObj = self.retrieve.open(self.service_url % query)
        if not urlObj:
            raise Exception('got no result')
        return self._format_result(json.loads(urlObj.read()))

    @classmethod
    def _encode_hosts(cls, hosts):
        ''' encodes the given hosts in the WOT target format
        >>> WebOfTrust._encode_hosts(['http://wu.ac.at', 'https://wu.ac.at'])
        'wu.ac.at/'
        >>> WebOfTrust._encode_hosts(['wu.ac.at', 'https://modul.ac.at/'])
        'wu.ac.at/modul.ac.at/'
        '''
        if isinstance(hosts, string_types):
            hosts = [hosts]
        selected_hosts = []
        for host in hosts:
            if not host.startswith('http'):
                host = 'http://%s' % host
            netloc = '%s/' % quote(urlparse(host).netloc)
            if netloc not in selected_hosts:
                selected_hosts.append(netloc)
        assert len(hosts) <= MAX_HOSTS, 'too many hosts (max: %s)!' % MAX_HOSTS
        return ''.join(selected_hosts)

    @classmethod
    def _encode_url(cls, service_url, query):
        ''' encodes the url '''
        return service_url % query

    @classmethod
    def _format_result(cls, data):
        ''' Formats the result using MAPPING. The components for the
            reputation provide the reputation and confidence.
            See WOT Developer API for details
        '''
        result = {}
        # fix: items() instead of the Python2-only iteritems() -- the rest
        # of this class (string_types) already targets Python 2/3 compat
        for host, reputation in data.items():
            r = {}
            for attr_name, new_attr_name in MAPPING.items():
                if attr_name in reputation:
                    r[new_attr_name] = reputation[attr_name]
            r['wot_link'] = WOT_LINK % r['target']
            result[host] = r
        return result
class GooglePlus(object):
    ''' Client for the Google+ activities API. '''

    def __init__(self, api_key, api_url=API_URL):
        ''' Constructor
        :param api_key: the Google API key used to authenticate
        :param api_url: the API endpoint template
        '''
        # NOTE(review): GooglePlus does not inherit from WebDataSource --
        # confirm that calling its __init__ directly is intended.
        WebDataSource.__init__(self)
        self.api_key = api_key
        self.api_url = api_url
        self.retrieve = Retrieve('google-plus')

    def search(self, search_terms, max_results=DEFAULT_MAX_RESULTS):
        ''' searches Google+ for the given search_terms
        :param search_terms: search terms
        :type search_terms: list
        :param max_results: maximum number of result
        :type max_results: int
        :returns: generator with the result
        '''
        for search_term in search_terms:
            if isinstance(search_term, unicode):
                search_term = search_term.encode('utf-8')
            params = {'query': '"%s"' % search_term,
                      'orderBy': DEFAULT_ORDER_BY,
                      'maxResults': max_results}
            fetched = self.make_request(params, 'activities')
            for item in fetched['items']:
                try:
                    yield self.convert_item(item)
                except Exception as e:
                    # convert_item signals skipped items via exceptions;
                    # log and continue with the next one
                    logger.info('Error %s occured' % e)
                    continue

    def get_activity(self, activity_id):
        ''' returns the activity with the given ID
        :param activity_id: GooglePlus activity ID
        :type activity_id: string
        :returns: mapped result
        :rtype: dict
        '''
        item = self.make_request(path='activities/%s' % activity_id)
        return self.convert_item(item)

    def make_request(self, params=None, path='activities'):
        ''' executes the request to GooglePlus
        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus result
        :rtype: dict
        '''
        url = self.get_request_url(params, path)
        data = self.retrieve.open(url)
        return json.load(data)

    def get_request_url(self, params=None, path='activities'):
        ''' returns a correctly parsed request URL
        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus request URL
        :rtype: str

        Usage:
        >>> plus = GooglePlus('abcd')
        >>> plus.get_request_url()
        'https://www.googleapis.com/plus/v1/activities?key=abcd'
        '''
        params = params if params else {}
        if 'key' not in params:
            params['key'] = self.api_key
        # cap the result count at the API's documented maximum
        if 'maxResults' in params and \
                params['maxResults'] > DEFAULT_MAX_RESULTS:
            params['maxResults'] = DEFAULT_MAX_RESULTS
        return self.api_url.format(path=path, query=urlencode(params))

    @classmethod
    def convert_item(cls, item):
        ''' applies a mapping to convert the result to the required format
        :param item: GooglePlus Activity
        :type item: dict
        :rtype: dict
        :raises Exception: for activities that have to be skipped
                           (non-post verbs, empty content, bad attachments)
        '''
        last_modified = datetime.strptime(item['updated'],
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        # NOTE(review): 'published' is also derived from item['updated'] --
        # confirm whether item['published'] was intended here.
        published = datetime.strptime(item['updated'],
                                      '%Y-%m-%dT%H:%M:%S.%fZ')
        content = cls.convert_content(item['object']['content'])
        if not item['verb'] == 'post':
            raise Exception('Skipping activity of type "%s"' % item['verb'])
        if not len(content):
            logger.info('Skipping "%s" -> content is empty' % item['title'])
            raise Exception('content is empty')
        if 'attachments' in item['object']:
            for attachment in item['object']['attachments']:
                if attachment['objectType'] == 'article':
                    if 'content' not in attachment:
                        raise Exception('no content in attachment')
                    content = '%s\n"%s" (%s)' % (
                        content,
                        cls.convert_content(attachment['content']),
                        attachment['url'])
        activity = {'content': content,
                    'title': item['actor']['displayName'],
                    'url': item['url'],
                    'last_modified': last_modified,
                    'user_id': item['actor']['id'],
                    'user_img_url': item['actor']['image']['url'],
                    'screen_name': item['actor']['displayName'],
                    'encoding': u'utf-8',
                    'user_url': item['actor']['url'],
                    'valid_from': published,
                    'reshares': item['object']['resharers']['totalItems'],
                    'plusoners': item['object']['plusoners']['totalItems'],
                    'activity_id': item['id'],
                    }
        # fix: the geocode has to be looked up in the *source* item -- the
        # freshly built activity dict can never contain this key, so the
        # original condition was always False
        if 'geocode' in item:
            activity['geocode'] = item['geocode']
        return activity