def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    The auth token is added to a copy of ``payload`` so the caller's dict is
    left untouched. A timed-out request is retried up to three times before
    the timeout is reported.

    :param endpoint: endpoint passed to ``_get_absolute_url`` to build the request URL
    :type endpoint: str
    :param payload: query string parameters for the request
    :type payload: dict
    :return: XML tree parsed from the response body by ``etree.fromstring``
    :raises IngestApiError: on timeout (after the retries), too many redirects,
        any other request failure, or a response that can not be parsed
    :raises LookupError: when the API answers with HTTP 404
    """
    # Copy so that the token does not leak into the dict owned by the caller.
    params = dict(payload) if payload else {}
    params['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)

    # The session is created lazily and kept on the instance for later calls.
    if not self.session:
        self.session = requests.Session()

    max_retries = 3
    retries = 0
    while True:
        try:
            response = self.session.get(url, params=params, timeout=(30, 15))
        except requests.exceptions.Timeout as ex:
            if retries < max_retries:
                logger.warning(
                    'Reuters API timeout retrying, retries {}'.format(retries))
                retries += 1
                continue
            raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider) from ex
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider) from ex
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error
        break

    if response.status_code == 404:
        raise LookupError(_('Not found {payload}').format(payload=params))

    try:
        return etree.fromstring(response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider) from error
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider) from error
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider) from error
def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    The auth token is added to a copy of ``payload`` so the caller's dict is
    left untouched. A timed-out request is retried up to three times before
    the timeout is reported.

    :param endpoint: endpoint passed to ``_get_absolute_url`` to build the request URL
    :type endpoint: str
    :param payload: query string parameters for the request
    :type payload: dict
    :return: XML tree parsed from the response body by ``etree.fromstring``
    :raises IngestApiError: on timeout (after the retries), too many redirects,
        any other request failure, or a response that can not be parsed
    :raises LookupError: when the API answers with HTTP 404
    """
    # Copy so that the token does not leak into the dict owned by the caller.
    params = dict(payload) if payload else {}
    params['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)

    # The session is created lazily and kept on the instance for later calls.
    if not self.session:
        self.session = requests.Session()

    max_retries = 3
    retries = 0
    while True:
        try:
            response = self.session.get(url, params=params, timeout=(30, 15))
        except requests.exceptions.Timeout as ex:
            if retries < max_retries:
                logger.warning('Reuters API timeout retrying, retries {}'.format(retries))
                retries += 1
                continue
            raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider) from ex
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider) from ex
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error
        break

    if response.status_code == 404:
        raise LookupError('Not found %s' % params)

    try:
        return etree.fromstring(response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider) from error
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider) from error
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider) from error
def test_raise_apiUnicodeError(self):
    """Check code, message, wrapped exception and log output of ``apiUnicodeError``."""
    with assert_raises(IngestApiError) as error_context:
        ex = Exception("Testing apiUnicodeError")
        raise IngestApiError.apiUnicodeError(ex, self.provider)

    exception = error_context.exception
    # assertEqual (instead of assertTrue(a == b)) reports both values on failure.
    self.assertEqual(exception.code, 4004)
    self.assertEqual(exception.message, "API ingest Unicode Encode Error")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing apiUnicodeError")

    # Raising the error is expected to emit exactly one error log record.
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(
        self.mock_logger_handler.messages['error'][0],
        "IngestApiError Error 4004 - API ingest Unicode Encode Error: "
        "Testing apiUnicodeError on channel TestProvider")
def test_raise_apiUnicodeError(self):
    """Check code, message, wrapped exception and log output of ``apiUnicodeError``."""
    with assert_raises(IngestApiError) as error_context:
        ex = Exception("Testing apiUnicodeError")
        raise IngestApiError.apiUnicodeError(ex, self.provider)

    exception = error_context.exception
    # assertEqual (instead of assertTrue(a == b)) reports both values on failure.
    self.assertEqual(exception.code, 4004)
    self.assertEqual(exception.message, "API ingest Unicode Encode Error")
    self.assertIsNotNone(exception.system_exception)
    # assertEquals is a deprecated alias that newer unittest versions no longer provide.
    self.assertEqual(exception.system_exception.args[0], "Testing apiUnicodeError")

    # Raising the error is expected to emit exactly one error log record.
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(
        self.mock_logger_handler.messages['error'][0],
        "IngestApiError Error 4004 - API ingest Unicode Encode Error: "
        "Testing apiUnicodeError on channel TestProvider")
class EventHTTPFeedingService(HTTPFeedingServiceBase):
    """Feeding Service class which can read events using HTTP."""

    NAME = 'event_http'
    label = 'Event HTTP feed'
    service = 'events'

    # Single configuration field: the URL the feed is fetched from.
    fields = [
        {
            'id': 'url',
            'type': 'text',
            'label': 'Feed URL',
            'placeholder': 'Feed URL',
            'required': True,
        },
    ]

    # Descriptions of the API errors this service declares.
    ERRORS = [
        make_error().get_error_description()
        for make_error in (
            IngestApiError.apiTimeoutError,
            IngestApiError.apiRedirectError,
            IngestApiError.apiRequestError,
            IngestApiError.apiUnicodeError,
            IngestApiError.apiParseError,
            IngestApiError.apiGeneralError,
        )
    ]

    HTTP_AUTH = False

    def _update(self, provider, update):
        """Fetch events from external API.

        :param provider: Ingest Provider Details.
        :type provider: dict
        :param update: Any update that is required on provider.
        :type update: dict
        :return: a list of events which can be saved.
        """
        feed_response = self.get_url(self.config['url'])
        feed_parser = self.get_feed_parser(provider)

        logger.info('Ingesting events with {} parser'.format(feed_parser.__class__.__name__))
        logger.info('Ingesting content: {} ...'.format(str(feed_response.content)[:4000]))

        # Parsers offering an HTTP specific entry point also receive the provider.
        if hasattr(feed_parser, 'parse_http'):
            parsed = feed_parser.parse_http(feed_response.content, provider)
        else:
            parsed = feed_parser.parse(feed_response.content)

        # Always hand back a list, wrapping a single parsed item when needed.
        yield parsed if isinstance(parsed, list) else [parsed]
def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    The auth token is added to a copy of ``payload`` so the caller's dict is
    left untouched.

    :param endpoint: endpoint passed to ``_get_absolute_url`` to build the request URL
    :type endpoint: str
    :param payload: query string parameters for the request
    :type payload: dict
    :return: XML tree parsed from the response body by ``etree.fromstring``
    :raises IngestApiError: on timeout, too many redirects, any other request
        failure, or a response that can not be parsed
    :raises LookupError: when the API answers with HTTP 404
    """
    # Copy so that the token does not leak into the dict owned by the caller.
    params = dict(payload) if payload else {}
    params['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)

    try:
        response = requests.get(url, params=params, timeout=15)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider) from ex
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider) from ex
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider) from error

    if response.status_code == 404:
        raise LookupError('Not found %s' % params)

    try:
        return etree.fromstring(response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider) from error
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider) from error
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider) from error
def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    The auth token is added to a copy of ``payload`` so the caller's dict is
    left untouched.

    :param endpoint: endpoint passed to ``_get_absolute_url`` to build the request URL
    :type endpoint: str
    :param payload: query string parameters for the request
    :type payload: dict
    :return: XML tree parsed from the response body by ``etree.fromstring``
    :raises IngestApiError: on timeout, too many redirects, any other request
        failure, or a response that can not be parsed
    :raises LookupError: when the API answers with HTTP 404
    """
    # Copy so that the token does not leak into the dict owned by the caller.
    params = dict(payload) if payload else {}
    params['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)

    try:
        response = requests.get(url, params=params, timeout=15)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider) from ex
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider) from ex
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider) from error

    if response.status_code == 404:
        raise LookupError('Not found %s' % params)

    try:
        return etree.fromstring(response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider) from error
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider) from error
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider) from error
def get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    The token from ``get_token`` is added to a copy of ``payload`` so the
    caller's dict is left untouched.

    :param endpoint: endpoint passed to ``get_url`` to build the request URL
    :type endpoint: str
    :param payload: query string parameters for the request
    :type payload: dict
    :return: XML tree parsed from the response body by ``etree.fromstring``
    :raises IngestApiError: on timeout, too many redirects, any other request
        failure, HTTP 404, or a response that can not be parsed
    """
    # Copy so that the token does not leak into the dict owned by the caller.
    params = dict(payload) if payload else {}
    params['token'] = self.get_token()
    url = self.get_url(endpoint)

    try:
        response = requests.get(url, params=params, timeout=21.0)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider) from ex
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider) from ex
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError(error, self.provider) from error

    if response.status_code == 404:
        raise IngestApiError.apiNotFoundError(
            LookupError('Not found %s' % params), self.provider)

    try:
        # workaround for httmock lib
        # return etree.fromstring(response.text.encode('utf-8'))
        return etree.fromstring(response.content)
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider) from error
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider) from error
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError(error, self.provider) from error
def get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    The token from ``get_token`` is added to a copy of ``payload`` so the
    caller's dict is left untouched.

    :param endpoint: endpoint passed to ``get_url`` to build the request URL
    :type endpoint: str
    :param payload: query string parameters for the request
    :type payload: dict
    :return: XML tree parsed from the response body by ``etree.fromstring``
    :raises IngestApiError: on timeout, too many redirects, any other request
        failure, HTTP 404, or a response that can not be parsed
    """
    # Copy so that the token does not leak into the dict owned by the caller.
    params = dict(payload) if payload else {}
    params['token'] = self.get_token()
    url = self.get_url(endpoint)

    try:
        response = requests.get(url, params=params, timeout=21.0)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider) from ex
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider) from ex
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError(error, self.provider) from error

    if response.status_code == 404:
        raise IngestApiError.apiNotFoundError(LookupError('Not found %s' % params), self.provider)

    try:
        # workaround for httmock lib
        # return etree.fromstring(response.text.encode('utf-8'))
        return etree.fromstring(response.content)
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider) from error
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider) from error
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError(error, self.provider) from error
class ReutersHTTPFeedingService(HTTPFeedingService):
    """Feeding Service class which can read article(s) using HTTP provided by Reuters."""

    NAME = 'reuters_http'

    ERRORS = [IngestApiError.apiTimeoutError().get_error_description(),
              IngestApiError.apiRedirectError().get_error_description(),
              IngestApiError.apiRequestError().get_error_description(),
              IngestApiError.apiUnicodeError().get_error_description(),
              IngestApiError.apiParseError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description()]

    # Format used for both ends of the ``dateRange`` request parameter.
    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def _update(self, provider):
        """Yield the items of every article updated since the provider's last update.

        The start of the requested range is clamped to at most
        ``INGEST_EXPIRY_MINUTES`` in the past. Missing ``url`` / ``auth_url``
        config values are filled in with the Reuters defaults.

        :param provider: Ingest Provider Details.
        :type provider: dict
        """
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        oldest_allowed = updated - datetime.timedelta(minutes=ttl_minutes)
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config

        provider_config.setdefault('url', 'http://rmb.reuters.com/rmd/rest/xml')
        provider_config.setdefault('auth_url', 'https://commerce.reuters.com/rmd/rest/xml/login')
        self.URL = provider_config.get('url')

        for channel in self._get_channels():
            for guid in self._get_article_ids(channel, last_updated, updated):
                items = self.fetch_ingest(guid)
                if items:
                    yield items

    def _get_channels(self):
        """Get subscribed channels."""
        tree = self._get_tree('channels')
        return [channel.find('alias').text for channel in tree.findall('channelInformation')]

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The auth token is added to a copy of ``payload`` so the caller's dict
        is left untouched.

        :param endpoint: endpoint appended to the provider URL
        :type endpoint: str
        :param payload: query string parameters for the request
        :type payload: dict
        :return: XML tree parsed from the response body by ``etree.fromstring``
        :raises IngestApiError: on timeout, too many redirects, any other
            request failure, or a response that can not be parsed
        :raises LookupError: when the API answers with HTTP 404
        """
        # Copy so that the token does not leak into the dict owned by the caller.
        params = dict(payload) if payload else {}
        params['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        try:
            response = requests.get(url, params=params, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider) from ex
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider) from ex
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error

        if response.status_code == 404:
            raise LookupError('Not found %s' % params)

        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider) from error
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider) from error
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error

    def _get_absolute_url(self, endpoint):
        """Get absolute URL for given endpoint.

        :param endpoint: endpoint appended to ``self.URL``
        :type endpoint: str
        """
        return '/'.join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """Get article ids which should be upserted.

        :param channel: channel alias to query
        :param last_updated: start of the requested date range
        :param updated: end of the requested date range
        :return: set of the ``guid`` texts found in the response
        """
        payload = {'channel': channel, 'fieldsRef': 'id',
                   'dateRange': "%s-%s" % (self._format_date(last_updated), self._format_date(updated))}
        logger.info('Reuters requesting Date Range |{}| for channel {}'.format(payload['dateRange'], channel))
        tree = self._get_tree('items', payload)
        return {result.find('guid').text for result in tree.findall('result')}

    def _format_date(self, date):
        """Format ``date`` using ``DATE_FORMAT``."""
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, guid):
        """Fetch the item identified by ``guid`` together with the items it references.

        :param guid: id of the item to fetch
        :return: list of fetched items, or an empty list when any of them
            could not be found
        """
        items = self._parse_items(guid)
        result_items = []
        # ``items`` acts as a work list: packages push their referenced items onto it.
        while items:
            item = items.pop()
            self.add_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                # One missing reference discards the whole result.
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, guid):
        """Parse item message and return given items."""
        payload = {'id': guid}
        tree = self._get_tree('item', payload)
        parser = self.get_feed_parser(self.provider, tree)
        return parser.parse(tree, self.provider)

    def _fetch_items_in_package(self, item):
        """Fetch remote assets for given item."""
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self._parse_items(ref.get('residRef')))
        return items
class HTTPFeedingService(FeedingService, metaclass=ABCMeta):
    """Feeding Service class which can read article(s) using HTTP."""

    ERRORS = [IngestApiError.apiTimeoutError().get_error_description(),
              IngestApiError.apiRedirectError().get_error_description(),
              IngestApiError.apiRequestError().get_error_description(),
              IngestApiError.apiUnicodeError().get_error_description(),
              IngestApiError.apiParseError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description()]

    label = 'HTTP'

    def __init__(self):
        super().__init__()
        self.token = None

    def _generate_token_and_update_provider(self, provider):
        """Generate an Authentication Token and store it on the given provider.

        The token is saved together with its creation time, both in the
        ``ingest_providers`` resource and on the passed ``provider`` dict.

        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: Authentication Token
        :rtype: str
        """
        token = {'auth_token': self._generate_auth_token(provider), 'created': utcnow()}
        get_resource_service('ingest_providers').system_update(provider[config.ID_FIELD], updates={'tokens': token},
                                                               original=provider)
        provider['tokens'] = token
        return token['auth_token']

    def _generate_auth_token(self, provider):
        """Generate an Authentication Token as per the configuration in Ingest Provider.

        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: token details if successfully authenticated
        :rtype: str
        :raises: IngestApiError.apiGeneralError() if auth_url is missing in the Ingest Provider configuration
        :raises: IngestApiError.apiAuthError() if the authentication request is answered with an error status
        """
        # ``or {}`` also covers a provider whose config is explicitly None.
        provider_config = provider.get('config') or {}
        auth_url = provider_config.get('auth_url', None)
        if not auth_url:
            raise IngestApiError.apiGeneralError(provider=provider, exception=KeyError(
                '''
                Ingest Provider {} is missing Authentication URL.
                Please check the configuration.
                '''.format(provider['name']))
            )

        payload = {
            'username': provider_config.get('username', ''),
            'password': provider_config.get('password', ''),
        }

        # The session is only needed for this one request; close it afterwards
        # so its pooled connections are released.
        with requests.Session() as session:
            session.mount('https://', SSLAdapter())
            # NOTE(review): verify=False disables TLS certificate verification
            # for a request carrying credentials - confirm this is still required.
            response = session.get(auth_url, params=payload, verify=False, timeout=30)

        if response.status_code < 200 or response.status_code >= 300:
            try:
                response.raise_for_status()
            except Exception:
                err = IngestApiError.apiAuthError(provider=provider)
                self.close_provider(provider, err, force=True)
                raise err

        tree = etree.fromstring(response.content)  # workaround for http mock lib
        return tree.text

    def _is_valid_token(self, token):
        """Check if the given token is still valid.

        Most of authentication tokens issued by Ingest Providers are valid for 12 hours.

        :param token: Token information
        :type token: dict
        :return: True if valid, False otherwise
        :rtype: bool
        """
        ttl = timedelta(hours=12)
        created = arrow.get(token.get('created')).datetime
        # bool() so the token string itself is never returned in place of the documented bool.
        return bool(created + ttl >= utcnow() and token.get('auth_token'))

    def _get_auth_token(self, provider, update=False):
        """Get the authentication token for the given provider.

        A stored token that is still valid is reused. Otherwise a new one is
        generated and saved when ``update`` is True, and an empty string is
        returned when it is False.

        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :param update: a flag which dictates whether to save the authentication token in Ingest Provider record or not.
                       Saves if the value is True, defaults to False.
        :type update: bool
        :return: Authentication Token
        :rtype: str
        """
        token = provider.get('tokens')

        if token and self._is_valid_token(token):
            return token.get('auth_token')

        return self._generate_token_and_update_provider(provider) if update else ''
class EventHTTPFeedingService(HTTPFeedingService):
    """Feeding Service class which can read events using HTTP."""

    NAME = 'event_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    label = 'Event HTTP feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    fields = [{
        'id': 'url',
        'type': 'text',
        'label': 'Feed URL',
        'placeholder': 'Feed URL',
        'required': True
    }]

    def _update(self, provider, update):
        """Fetch the feed configured on the provider and yield the parsed events.

        :param provider: Ingest Provider Details.
        :type provider: dict
        :param update: Any update that is required on provider (not used here).
        :type update: dict
        :return: yields a single list of parsed items
        :raises IngestApiError: on timeout, too many redirects or any other request failure
        :raises LookupError: when the feed URL answers with HTTP 404
        """
        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config
        self.URL = provider_config.get('url')
        payload = {}

        parser = self.get_feed_parser(provider)

        try:
            response = requests.get(self.URL, params=payload, timeout=15)
            # TODO: check if file has been updated since provider last_updated
            # although some providers do not include 'Last-Modified' in headers
            # so unsure how to do this
            logger.info('Http Headers: %s', response.headers)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider) from ex
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider) from ex
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        logger.info('Ingesting: %s', str(response.content))

        # Each parser type expects the content in a different pre-parsed form.
        if isinstance(parser, NTBEventXMLFeedParser):
            xml = ET.fromstring(response.content)
            items = parser.parse(xml, provider)
        elif isinstance(parser, IcsTwoFeedParser):
            cal = Calendar.from_ical(response.content)
            items = parser.parse(cal, provider)
        else:
            items = parser.parse(response.content)

        # Always yield a list, wrapping a single parsed item when needed.
        if isinstance(items, list):
            yield items
        else:
            yield [items]
class AAPSportsHTTPFeedingService(HTTPFeedingService):
    """Feeding Service which reads sports fixtures over HTTP.

    It logs in, then walks sports, competitions and seasons through the
    configured fixtures URL and hands each matching season to the feed parser.
    """

    label = 'AAP Sports Results Feed'
    NAME = 'aap_sports_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    fields = [
        {
            'id': 'login_url',
            'type': 'text',
            'label': 'Login Url',
            'placeholder': 'Login Url',
            'required': True,
            'errors': {
                4006: 'Server not found.',
                4000: 'Unexpected server response'
            }
        },
        {
            'id': 'fixtures_url',
            'type': 'text',
            'label': 'Fixtures Url',
            'placeholder': 'Fixtures Url',
            'required': True
        },
        {
            'id': 'username',
            'type': 'text',
            'label': 'Username',
            'placeholder': 'Username',
            'required': True
        },
        {
            'id': 'password',
            'type': 'password',
            'label': 'Password',
            'placeholder': 'Password',
            'required': True,
            'errors': {
                4007: 'Authentication error.'
            }
        },
        {
            'id': 'sports',
            'type': 'text',
            'label': 'Sports',
            'placeholder': 'Comma separate list of sports ids',
            'required': True,
            'default': '1,2,3,4,10'
        },
    ]

    def _update(self, provider, update):
        """Yield the parsed fixtures of every configured sport for this and next year.

        :param provider: Ingest Provider Details.
        :type provider: dict
        :param update: Any update that is required on provider (not used here).
        :type update: dict
        :return: yields one non-empty list of parsed items per matching season
        """
        self.provider = provider
        parser = self.get_feed_parser(provider)

        # get the current year, it is used to filter fixtures for this year and next
        year = int(utcnow().year) % 100
        wanted_years = (str(year), str(year + 1))

        config = provider.get('config', {})
        fixtures_url = config.get('fixtures_url')
        content = self._request(
            config.get('login_url').format(config.get('username'), config.get('password')))

        # get the configured sports; tolerate spaces around the commas
        configured_sports = {sport.strip() for sport in config.get('sports').split(',')}

        xml = ET.fromstring(content)
        status = xml.attrib['Status_Code']
        if status != 'OK':
            # Nothing can be fetched without a session; leave a trace instead of stopping silently.
            logger.warning('AAP Sports login failed with status %s', status)
            return

        session = xml.attrib['Status_Session']
        xml = ET.fromstring(self._request(fixtures_url.format(session, '', '', '')))

        for sport in xml.findall('.//Sports/Sport'):
            sport_id = sport.attrib['SportID']
            if sport_id not in configured_sports:
                continue
            sport_name = sport.attrib['SportName']

            sport_xml = ET.fromstring(self._request(fixtures_url.format(session, sport_id, '', '')))
            for competition in sport_xml.findall('.//Competition'):
                comp_id = competition.attrib.get('Comp_ID')
                comp_name = competition.attrib.get('Comp_Name')

                comp_xml = ET.fromstring(
                    self._request(fixtures_url.format(session, sport_id, comp_id, '')))
                for season in comp_xml.findall('.//Season'):
                    season_id = season.attrib.get('SeasonID')
                    # A season without an id can not be matched against the years.
                    if not season_id:
                        continue
                    if not any(wanted in season_id for wanted in wanted_years):
                        continue

                    fixture_xml = ET.fromstring(
                        self._request(fixtures_url.format(session, sport_id, comp_id, season_id)))
                    logger.info('Parsing {}/{} {}/{}'.format(sport_id, sport_name, comp_id, comp_name))
                    items = parser.parse(
                        {
                            'fixture_xml': fixture_xml,
                            'sport_id': sport_id,
                            'sport_name': sport_name,
                            'comp_name': comp_name,
                            'comp_id': comp_id
                        },
                        provider)
                    if items:
                        yield items

    def _request(self, url):
        """Fetch ``url`` and return the raw response body.

        :param url: fully formatted URL to request
        :type url: str
        :return: the response content
        :raises IngestApiError: on timeout, too many redirects or any other request failure
        :raises LookupError: when the server answers with HTTP 404
        """
        try:
            response = requests.get(url, params={}, timeout=120)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider) from ex
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider) from ex
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error

        if response.status_code == 404:
            raise LookupError('Not found')

        return response.content
from superdesk.io.ingest_service import IngestService from superdesk.utc import utcnow from superdesk.etree import etree, ParseError from superdesk.io import register_provider from .newsml_2_0 import NewsMLTwoParser from .reuters_token import get_token from superdesk.errors import IngestApiError from flask import current_app as app PROVIDER = 'reuters' errors = [IngestApiError.apiTimeoutError().get_error_description(), IngestApiError.apiRedirectError().get_error_description(), IngestApiError.apiRequestError().get_error_description(), IngestApiError.apiUnicodeError().get_error_description(), IngestApiError.apiParseError().get_error_description(), IngestApiError.apiGeneralError().get_error_description()] class ReutersIngestService(IngestService): """Reuters ingest service.""" DATE_FORMAT = '%Y.%m.%d.%H.%M' URL = 'http://rmb.reuters.com/rmd/rest/xml' token = None def __init__(self): self.parser = NewsMLTwoParser() def get_token(self):
class ReutersHTTPFeedingService(HTTPFeedingService):
    """Feeding Service class which can read article(s) using HTTP provided by Reuters."""

    NAME = 'reuters_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    # Format used for both ends of the ``dateRange`` request parameter.
    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def _update(self, provider):
        """Yield the items of every article reported as new on the subscribed channels.

        A failure while fetching one article is logged and does not stop the
        remaining articles from being processed.

        :param provider: Ingest Provider Details.
        :type provider: dict
        """
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        oldest_allowed = updated - datetime.timedelta(minutes=ttl_minutes)
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config

        provider_config.setdefault('url', 'http://rmb.reuters.com/rmd/rest/xml')
        provider_config.setdefault('auth_url', 'https://commerce.reuters.com/rmd/rest/xml/login')
        self.URL = provider_config.get('url')

        for channel in self._get_channels():
            for item_id in self._get_article_ids(channel, last_updated, updated):
                try:
                    items = self.fetch_ingest(item_id)
                    if items:
                        yield items
                # if there was an exception processing the one of the bunch log it and continue
                except Exception as ex:
                    logger.warning('Reuters item {} has not been retrieved'.format(item_id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get subscribed channels."""
        tree = self._get_tree('channels')
        return [channel.find('alias').text for channel in tree.findall('channelInformation')]

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The auth token is added to a copy of ``payload`` so the caller's dict
        is left untouched.

        :param endpoint: endpoint appended to the provider URL
        :type endpoint: str
        :param payload: query string parameters for the request
        :type payload: dict
        :return: XML tree parsed from the response body by ``etree.fromstring``
        :raises IngestApiError: on timeout, too many redirects, any other
            request failure, or a response that can not be parsed
        :raises LookupError: when the API answers with HTTP 404
        """
        # Copy so that the token does not leak into the dict owned by the caller.
        params = dict(payload) if payload else {}
        params['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        try:
            response = requests.get(url, params=params, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider) from ex
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider) from ex
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider) from ex
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error

        if response.status_code == 404:
            raise LookupError('Not found %s' % params)

        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider) from error
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider) from error
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider) from error

    def _get_absolute_url(self, endpoint):
        """Get absolute URL for given endpoint.

        :param endpoint: endpoint appended to ``self.URL``
        :type endpoint: str
        """
        return '/'.join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """Get article ids which should be upserted, also save the poll token that is returned.

        The channel is queried with its stored poll token when there is one,
        otherwise with a date range. Ids are only returned when the response
        carries a poll token different from the stored one.

        :param channel: channel alias to query
        :param last_updated: start of the date range used without a poll token
        :param updated: end of the date range used without a poll token
        :return: set of article ids, empty when there is nothing new or the request failed
        """
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id'}

        # check if the channel has a pollToken if not fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info("Reuters requesting channel {} with poll token {}".format(channel, last_poll_token))
            payload['pollToken'] = last_poll_token
        else:
            payload['dateRange'] = "%s-%s" % (self._format_date(last_updated), self._format_date(updated))
            logger.info("Reuters requesting channel {} with dateRange {}".format(channel, payload['dateRange']))

        tree = self._get_tree('items', payload)

        # The status sits on a child element for a 'results' document, on the root otherwise.
        status_code = tree.find('status').get('code') if tree.tag == 'results' else tree.get('code')
        # check the returned status
        if status_code != '10':
            logger.warning("Reuters channel request returned status code {}".format(status_code))
            # status code 30 indicates failure
            if status_code == '30':
                # invalid token
                logger.warning("Reuters error on channel {} code {} {}".format(
                    channel, tree.find('error').get('code'), tree.find('error').text))
                if tree.find('error').get('code') == '2100':
                    # Drop the stored poll token so the next request falls back to dateRange.
                    self._save_poll_token(channel, None)
                    logger.warning("Reuters channel invalid token reseting {}".format(status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find('pollToken')
        if poll_token is not None:
            # a new token indicated new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info("Reuters channel {} retrieved no token".format(channel))
            return ids

        for result in tree.findall('result'):
            item_id = result.find('id').text
            ids.add(item_id)
            logger.info("Reuters id : {}".format(item_id))

        return ids

    def _save_poll_token(self, channel, poll_token):
        """Save the poll token for the passed channel in the ``tokens`` of the provider record.

        :param channel: channel alias the token belongs to
        :param poll_token: token to store, None to reset it
        :return: None
        """
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service('ingest_providers')
        provider = ingest_provider_service.find_one(req=None, _id=self.provider[superdesk.config.ID_FIELD])

        # Tolerate a provider record that has no tokens yet.
        provider_token = provider.get('tokens') or {}
        provider_token.setdefault('poll_tokens', {})[channel] = poll_token

        upd_provider = {'tokens': provider_token}
        ingest_provider_service.system_update(self.provider[superdesk.config.ID_FIELD], upd_provider, self.provider)

    def _get_poll_token(self, channel):
        """Get the poll token from the provider tokens if it is available.

        :param channel: channel alias to look up
        :return: token, or None when no token is stored for the channel
        """
        if 'tokens' in self.provider and 'poll_tokens' in self.provider['tokens']:
            return self.provider.get('tokens').get('poll_tokens').get(channel, None)
        return None

    def _format_date(self, date):
        """Format ``date`` using ``DATE_FORMAT``."""
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        """Fetch the item identified by ``id`` together with the items it references.

        :param id: id of the item to fetch
        :return: list of fetched items, or an empty list when any of them
            could not be found
        """
        items = self._parse_items(id)
        result_items = []
        # ``items`` acts as a work list: packages push their referenced items onto it.
        while items:
            item = items.pop()
            self.add_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                # One missing reference discards the whole result.
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, id):
        """Parse item message and return given items."""
        payload = {'id': id}
        tree = self._get_tree('item', payload)
        parser = self.get_feed_parser(self.provider, tree)
        return parser.parse(tree, self.provider)

    def _fetch_items_in_package(self, item):
        """Fetch remote assets for given item."""
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self._parse_items(ref.get('residRef')))
        return items

    def prepare_href(self, href, mimetype=None):
        """Return ``href`` stripped of params, query and fragment, with the auth token appended.

        :param href: original link
        :param mimetype: not used here
        :return: link of the form ``<scheme>://<netloc><path>?auth_token=<token>``
        """
        (scheme, netloc, path, params, query, fragment) = urlparse(href)
        new_href = urlunparse((scheme, netloc, path, '', '', ''))
        return '%s?auth_token=%s' % (new_href, self._get_auth_token(self.provider, update=True))
class ReutersHTTPFeedingService(HTTPFeedingService):
    """Feeding service which reads article(s) over HTTP from the Reuters API.

    For every subscribed channel the service asks the ``items`` endpoint for
    article ids (using the stored poll token when there is one, a date range
    otherwise), then fetches and parses each item together with the items
    referenced by its package groups.
    """

    NAME = "reuters_http"

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    # strftime pattern used to build the ``dateRange`` request parameter
    DATE_FORMAT = "%Y.%m.%d.%H.%M"

    label = "Reuters feed API"

    fields = [
        {
            "id": "url",
            "type": "text",
            "label": "Feed URL",
            "placeholder": "Feed URL",
            "required": True,
            "default": "http://rmb.reuters.com/rmd/rest/xml",
        },
        {
            "id": "auth_url",
            "type": "text",
            "label": "URL for Authentication",
            "placeholder": "authentication url",
            "required": True,
            "default": "https://commerce.reuters.com/rmd/rest/xml/login",
        },
        {"id": "username", "type": "text", "label": "Username", "placeholder": "Username", "required": True},
        {"id": "password", "type": "password", "label": "Password", "placeholder": "Password", "required": True},
    ]

    # requests session, created lazily on the first API call and then reused
    session = None

    def _update(self, provider, update):
        """Yield the parsed items of every article found on the subscribed channels.

        :param provider: ingest provider document; its ``config`` gets default
            ``url`` and ``auth_url`` values filled in when they are missing
        :type provider: dict
        :param update: not used by this method
        :return: generator yielding one non-empty list of items per article id
        """
        updated = utcnow()
        last_updated = provider.get("last_updated")
        ttl_minutes = app.config["INGEST_EXPIRY_MINUTES"]
        oldest_allowed = updated - datetime.timedelta(minutes=ttl_minutes)
        # never look further back than the ingest expiry window
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        self.provider = provider
        provider_config = provider.get("config")
        if not provider_config:
            provider_config = {}
            provider["config"] = provider_config
        provider_config.setdefault("url", "http://rmb.reuters.com/rmd/rest/xml")
        provider_config.setdefault("auth_url", "https://commerce.reuters.com/rmd/rest/xml/login")
        self.URL = provider_config.get("url")

        for channel in self._get_channels():
            for item_id in self._get_article_ids(channel, last_updated, updated):
                try:
                    items = self.fetch_ingest(item_id)
                    if items:
                        yield items
                except Exception as ex:
                    # a failure on one article of the bunch is logged and the
                    # remaining articles are still processed
                    logger.warning("Reuters item {} has not been retrieved".format(item_id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get subscribed channels.

        :return: alias text of every ``channelInformation`` element returned
            by the ``channels`` endpoint
        :rtype: list
        """
        tree = self._get_tree("channels")
        return [channel.find("alias").text for channel in tree.findall("channelInformation")]

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The caller's ``payload`` is left untouched: the auth token is added to
        a copy, and the token is kept out of the ``LookupError`` message.

        :param endpoint: endpoint name, appended to the feed URL
        :type endpoint: str
        :param payload: query parameters for the request
        :type payload: dict
        :return: root element of the parsed response
        :raises LookupError: when the API answers with HTTP 404
        :raises IngestApiError: on timeout (after the retries are used up),
            redirect, request, unicode, parse or any other error
        """
        payload = dict(payload) if payload else {}
        params = dict(payload, token=self._get_auth_token(self.provider, update=True))
        url = self._get_absolute_url(endpoint)

        if not self.session:
            self.session = requests.Session()

        retries = 0
        while True:
            try:
                # timeout is (connect, read) in seconds
                response = self.session.get(url, params=params, timeout=(30, 15))
            except requests.exceptions.Timeout as ex:
                if retries < 3:
                    logger.warning("Reuters API timeout retrying, retries {}".format(retries))
                    retries += 1
                    continue
                raise IngestApiError.apiTimeoutError(ex, self.provider)
            except requests.exceptions.TooManyRedirects as ex:
                # Tell the user their URL was bad and try a different one
                raise IngestApiError.apiRedirectError(ex, self.provider)
            except requests.exceptions.RequestException as ex:
                # catastrophic error. bail.
                raise IngestApiError.apiRequestError(ex, self.provider)
            except Exception as error:
                traceback.print_exc()
                raise IngestApiError.apiGeneralError(error, self.provider)

            if response.status_code == 404:
                raise LookupError(_("Not found {payload}").format(payload=payload))

            break

        try:
            # bytes content rather than text: workaround for http mock lib
            return etree.fromstring(response.content)
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """Get absolute URL for given endpoint.

        :param endpoint: endpoint name
        :type endpoint: str
        :return: ``self.URL`` and ``endpoint`` joined with a slash
        :rtype: str
        """
        return "/".join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """Get article ids which should be upserted, also save the poll token that is returned.

        :param channel: channel alias to query
        :param last_updated: start of the date range used when no poll token is stored
        :param updated: end of the date range used when no poll token is stored
        :return: ids of the articles to fetch; empty on a failure status, when
            the poll token did not change, or when no poll token was returned
        :rtype: set
        """
        ids = set()
        payload = {"channel": channel, "fieldsRef": "id"}

        # check if the channel has a pollToken, if not fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info("Reuters requesting channel {} with poll token {}".format(channel, last_poll_token))
            payload["pollToken"] = last_poll_token
        else:
            payload["dateRange"] = "%s-%s" % (self._format_date(last_updated), self._format_date(updated))
            logger.info("Reuters requesting channel {} with dateRange {}".format(channel, payload["dateRange"]))

        tree = self._get_tree("items", payload)

        # the status code sits on a child element of a "results" root and on
        # the root element itself otherwise
        status_code = tree.find("status").get("code") if tree.tag == "results" else tree.get("code")
        # check the returned status
        if status_code != "10":
            logger.warning("Reuters channel request returned status code {}".format(status_code))
            # status code 30 indicates failure
            if status_code == "30":
                # look the error element up once and tolerate its absence
                error = tree.find("error")
                error_code = error.get("code") if error is not None else None
                error_text = error.text if error is not None else None
                logger.warning("Reuters error on channel {} code {} {}".format(channel, error_code, error_text))
                if error_code == "2100":
                    # invalid token: drop it so the next request uses dateRange
                    self._save_poll_token(channel, None)
                    logger.warning("Reuters channel invalid token reseting {}".format(status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find("pollToken")
        if poll_token is None:
            logger.info("Reuters channel {} retrieved no token".format(channel))
            return ids

        if poll_token.text == last_poll_token:
            # the token has not changed, so nothing new
            logger.info("Reuters channel {} nothing new".format(channel))
            return ids

        # a new token indicates new content
        logger.info("Reuters channel {} new token {}".format(channel, poll_token.text))
        self._save_poll_token(channel, poll_token.text)

        for result in tree.findall("result"):
            item_id = result.find("id").text
            ids.add(item_id)
            logger.info("Reuters id : {}".format(item_id))

        return ids

    def _save_poll_token(self, channel, poll_token):
        """Save the poll token for the passed channel in the ``tokens`` section of the provider.

        :param channel: channel alias the token belongs to
        :param poll_token: token to store, ``None`` to reset it
        :return: None
        """
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service("ingest_providers")
        provider = ingest_provider_service.find_one(req=None, _id=self.provider[superdesk.config.ID_FIELD])

        # a missing or empty ``tokens`` / ``poll_tokens`` entry is treated as
        # an empty mapping instead of failing on ``None``
        provider_token = provider.get("tokens") or {}
        poll_tokens = provider_token.get("poll_tokens") or {}
        poll_tokens[channel] = poll_token
        provider_token["poll_tokens"] = poll_tokens

        upd_provider = {"tokens": provider_token}
        ingest_provider_service.system_update(self.provider[superdesk.config.ID_FIELD], upd_provider, self.provider)

    def _get_poll_token(self, channel):
        """Get the poll token from the provider ``tokens`` if it is available.

        :param channel: channel alias
        :return: stored token, or ``None`` when there is none for the channel
        """
        tokens = self.provider.get("tokens") or {}
        poll_tokens = tokens.get("poll_tokens") or {}
        return poll_tokens.get(channel)

    def _format_date(self, date):
        """Format ``date`` using ``DATE_FORMAT``."""
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        """Fetch the item with the given id plus every item its packages reference.

        :param id: id of the item to fetch
        :return: list of parsed items; empty list when a referenced item
            raised ``LookupError`` (the error is logged)
        :rtype: list
        """
        pending = self._parse_items(id)
        result_items = []
        while pending:
            item = pending.pop()
            self.localize_timestamps(item)
            try:
                pending.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                # one missing reference discards the whole result
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, id):
        """Parse item message and return given items.

        :param id: id sent to the ``item`` endpoint
        :return: whatever the feed parser returns for the response tree
        """
        tree = self._get_tree("item", {"id": id})
        parser = self.get_feed_parser(self.provider, tree)
        return parser.parse(tree, self.provider)

    def _fetch_items_in_package(self, item):
        """Fetch remote assets for given item.

        :param item: parsed item; every ``residRef`` found under its
            ``groups`` / ``refs`` is fetched and parsed
        :type item: dict
        :return: items parsed from all referenced ids
        :rtype: list
        """
        items = []
        for group in item.get("groups", []):
            for ref in group.get("refs", []):
                if "residRef" in ref:
                    items.extend(self._parse_items(ref.get("residRef")))
        return items

    def prepare_href(self, href, mimetype=None):
        """Return ``href`` stripped of params, query and fragment, with the auth token appended.

        :param href: original URL
        :type href: str
        :param mimetype: not used by this method
        :return: ``<scheme>://<netloc><path>?auth_token=<token>``
        :rtype: str
        """
        parts = urlparse(href)
        new_href = urlunparse((parts.scheme, parts.netloc, parts.path, "", "", ""))
        return "%s?auth_token=%s" % (new_href, self._get_auth_token(self.provider, update=True))
class ReutersIngestService(IngestService):
    """Reuters ingest service.

    Lists the subscribed channels, asks each one for the ids changed within
    a date range, then fetches and parses every item together with the items
    referenced by its package groups.
    """

    PROVIDER = 'reuters'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    # strftime pattern used to build the ``dateRange`` request parameter
    DATE_FORMAT = '%Y.%m.%d.%H.%M'
    URL = 'http://rmb.reuters.com/rmd/rest/xml'

    # auth token cached by ``get_token``
    token = None

    def __init__(self):
        self.parser = NewsMLTwoParser()

    def get_token(self):
        """Get reuters token once for an update run.

        :return: the cached token, fetched on first use
        """
        if not self.token:
            # module-level ``get_token`` helper, not this method
            self.token = get_token(self.provider, update=True)
        return self.token

    def _update(self, provider):
        """Service update call.

        :param provider: ingest provider document
        :type provider: dict
        :return: generator yielding one non-empty list of items per guid
        """
        self.provider = provider
        # drop the token cached by an earlier run so that it really is
        # fetched once per update run, as ``get_token`` documents
        self.token = None

        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        oldest_allowed = updated - datetime.timedelta(minutes=ttl_minutes)
        # never look further back than the ingest expiry window
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        for channel in self.get_channels():
            for guid in self.get_ids(channel, last_updated, updated):
                items = self.fetch_ingest(guid)
                if items:
                    yield items

    def fetch_ingest(self, guid):
        """Fetch the item with the given guid plus every item its packages reference.

        :param guid: id of the item to fetch
        :return: list of parsed items; empty list when a referenced item
            raised ``LookupError`` (the error is logged)
        :rtype: list
        """
        pending = self.get_items(guid)
        result_items = []
        while pending:
            item = pending.pop()
            self.add_timestamps(item)
            try:
                pending.extend(self.fetch_assets(item))
                result_items.append(item)
            except LookupError as err:
                # one missing reference discards the whole result
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def fetch_assets(self, item):
        """Fetch remote assets for given item.

        :param item: parsed item; every ``residRef`` found under its
            ``groups`` / ``refs`` is fetched and parsed
        :type item: dict
        :return: items parsed from all referenced ids
        :rtype: list
        """
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self.get_items(ref.get('residRef')))
        return items

    def get_items(self, guid):
        """Parse item message and return given items.

        :param guid: id sent to the ``item`` endpoint
        :return: whatever the parser returns for the response tree
        """
        tree = self.get_tree('item', {'id': guid})
        return self.parser.parse_message(tree, self.provider)

    def get_ids(self, channel, last_updated, updated):
        """Get ids of documents which should be updated.

        :param channel: channel alias to query
        :param last_updated: start of the requested date range
        :param updated: end of the requested date range
        :return: ``guid`` text of every ``result`` element in the response
        :rtype: list
        """
        payload = {
            'channel': channel,
            'fieldsRef': 'id',
            'dateRange': "%s-%s" % (self.format_date(last_updated), self.format_date(updated)),
        }
        tree = self.get_tree('items', payload)
        return [result.find('guid').text for result in tree.findall('result')]

    def get_channels(self):
        """Get subscribed channels.

        :return: alias text of every ``channelInformation`` element returned
            by the ``channels`` endpoint
        :rtype: list
        """
        tree = self.get_tree('channels')
        return [channel.find('alias').text for channel in tree.findall('channelInformation')]

    def get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The caller's ``payload`` is left untouched: the auth token is added to
        a copy, and the token is kept out of the ``LookupError`` message.

        :param endpoint: endpoint name, appended to ``URL``
        :type endpoint: str
        :param payload: query parameters for the request
        :type payload: dict
        :return: root element of the parsed response
        :raises LookupError: when the API answers with HTTP 404
        :raises IngestApiError: on timeout, redirect, request, unicode, parse
            or any other error
        """
        payload = dict(payload) if payload else {}
        params = dict(payload, token=self.get_token())
        url = self.get_url(endpoint)

        try:
            response = requests.get(url, params=params, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % (payload,))

        try:
            # bytes content rather than re-encoded text: workaround for httmock lib
            return etree.fromstring(response.content)
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def get_url(self, endpoint):
        """Get API url for given endpoint.

        :return: ``URL`` and ``endpoint`` joined with a slash
        :rtype: str
        """
        return '/'.join([self.URL, endpoint])

    def format_date(self, date):
        """Format date for API usage, using ``DATE_FORMAT``."""
        return date.strftime(self.DATE_FORMAT)

    def prepare_href(self, href):
        """Return ``href`` stripped of params, query and fragment, with the auth token appended.

        :param href: original URL
        :type href: str
        :return: ``<scheme>://<netloc><path>?auth_token=<token>``
        :rtype: str
        """
        parts = urlparse(href)
        new_href = urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))
        return '%s?auth_token=%s' % (new_href, self.get_token())