def _update(self, provider, update):
    """Fetch items from the remote IPTC-style API and parse them.

    :param provider: ingest provider configuration
    :param update: dict updated in place with ``private`` resume markers
    :return: list containing one list of parsed items
    :raises SuperdeskIngestError: when mandatory config keys are missing
    :raises IngestApiError: on request/parsing failures
    """
    try:
        config = provider['config']
        user = config['username']
        password = config['password']
        id_list = config['idList']
    except KeyError:
        # the error must be raised, otherwise execution would continue
        # with undefined credentials and crash later with a NameError
        raise SuperdeskIngestError.notConfiguredError(
            Exception('username, password and idList are needed'))

    # we remove spaces and empty values from id_list to do a clean list
    id_list = ','.join(
        [id_.strip() for id_ in id_list.split(',') if id_.strip()])

    params = {
        'idList': id_list,
        'idListType': 'products',
        'format': '5',
        'maxItems': '25',
        'sortOrder': 'chronological'
    }

    # resume from the last recorded position when one is stored
    try:
        min_date_time = provider['private']['min_date_time']
        sequence_number = provider['private']['sequence_number']
    except KeyError:
        pass
    else:
        params['minDateTime'] = min_date_time
        params['sequenceNumber'] = sequence_number

    try:
        r = requests.get(URL, auth=(user, password), params=params)
    except Exception:
        raise IngestApiError.apiRequestError(
            Exception('error while doing the request'))

    try:
        root_elt = etree.fromstring(r.content)
    except Exception:
        raise IngestApiError.apiRequestError(
            Exception('error while doing the request'))

    parser = self.get_feed_parser(provider)
    items = parser.parse(root_elt, provider)

    # store the new resume markers for the next run
    try:
        min_date_time = root_elt.xpath(
            '//iptc:timestamp[@role="minDateTime"]/text()',
            namespaces=NS)[0].strip()
        sequence_number = root_elt.xpath('//iptc:transmitId/text()',
                                         namespaces=NS)[0].strip()
    except IndexError:
        raise IngestApiError.apiRequestError(
            Exception('missing minDateTime or transmitId'))
    else:
        update.setdefault('private', {})
        update['private']['min_date_time'] = min_date_time
        update['private']['sequence_number'] = sequence_number

    return [items]
def _update(self, provider, update):
    """Fetch items from the Ritzau-style queue API and acknowledge them.

    :param provider: ingest provider configuration
    :param update: updates to apply on the provider (unused here)
    :return: list containing one list of parsed items
    :raises SuperdeskIngestError: when the configuration is invalid
    :raises IngestApiError: on request or XML errors
    """
    config = self.config
    try:
        user, password = self.config["username"], self.config["password"]
    except KeyError:
        # the error must be raised, otherwise execution would continue
        # with undefined credentials
        raise SuperdeskIngestError.notConfiguredError(
            Exception("username and password are needed"))

    url_override = config.get("url", "").strip()
    # an empty value means "use the default URL"; only validate the
    # override when one is actually configured
    if url_override and not url_override.startswith("http"):
        raise SuperdeskIngestError.notConfiguredError(
            Exception("if URL is set, it must be a valid http link"))

    if url_override:
        params = {"user": user, "password": password, "maksAntal": 50}
    else:
        # without an override we use the queue endpoint, which requires
        # each item to be acknowledged after ingestion
        params = {
            "user": user,
            "password": password,
            "maksAntal": 50,
            "waitAcknowledge": "true",
        }

    # get_url presumably falls back to the default URL when the override
    # is empty -- TODO confirm against HTTPFeedingServiceBase.get_url
    r = self.get_url(url_override, params=params)

    try:
        root_elt = etree.fromstring(r.text)
    except Exception:
        raise IngestApiError.apiRequestError(
            Exception("error while parsing the request answer"))

    # the API reports failures in-band via an <error> element
    try:
        if root_elt.xpath("(//error/text())[1]")[0] != "0":
            err_msg = root_elt.xpath("(//errormsg/text())[1]")[0]
            raise IngestApiError.apiRequestError(
                Exception("error code returned by API: {msg}".format(
                    msg=err_msg)))
    except IndexError:
        raise IngestApiError.apiRequestError(
            Exception("Invalid XML, <error> element not found"))

    parser = self.get_feed_parser(provider)
    items = []
    for elt in root_elt.xpath("//RBNews"):
        item = parser.parse(elt, provider)
        items.append(item)
        if not url_override:
            # acknowledge the item so it is removed from the remote queue
            try:
                queue_id = elt.xpath(".//ServiceQueueId/text()")[0]
            except IndexError:
                raise IngestApiError.apiRequestError(
                    Exception("missing ServiceQueueId element"))
            ack_params = {
                "user": user,
                "password": password,
                "servicequeueid": queue_id,
            }
            self.get_url(URL_ACK, params=ack_params)

    return [items]
def _update(self, provider, update):
    """Fetch items from the remote IPTC-style API and parse them.

    :param provider: ingest provider configuration
    :param update: dict updated in place with ``private`` resume markers
    :return: list containing one list of parsed items
    :raises SuperdeskIngestError: when mandatory config values are missing or blank
    :raises IngestApiError: on request/parsing failures
    """
    try:
        config = provider['config']
        user = config['username']
        password = config['password']
        id_list = config['idList']
        # before "products" was hardcoded as value for "idListType"
        id_list_type = config.get('idListType', 'products')
        # blank values are as bad as missing ones; reuse the KeyError path
        if not user.strip() or not password.strip() or not id_list.strip():
            raise KeyError
    except KeyError:
        raise SuperdeskIngestError.notConfiguredError(Exception('username, password and idList are needed'))
    # we remove spaces and empty values from id_list to do a clean list
    id_list = ','.join([id_.strip() for id_ in id_list.split(',') if id_.strip()])
    params = {'idList': id_list, 'idListType': id_list_type, 'format': '5', 'maxItems': '25',
              'sortOrder': 'chronological'}
    # resume from the last recorded position when one is stored
    try:
        min_date_time = provider['private']['min_date_time']
        sequence_number = provider['private']['sequence_number']
    except KeyError:
        pass
    else:
        params['minDateTime'] = min_date_time
        params['sequenceNumber'] = sequence_number
    try:
        r = requests.get(URL, auth=(user, password), params=params)
    except Exception:
        raise IngestApiError.apiRequestError(Exception('error while doing the request'))
    try:
        root_elt = etree.fromstring(r.content)
    except Exception:
        raise IngestApiError.apiRequestError(Exception('error while doing the request'))
    parser = self.get_feed_parser(provider)
    items = parser.parse(root_elt, provider)
    # store the new resume markers for the next run
    try:
        min_date_time = root_elt.xpath('//iptc:timestamp[@role="minDateTime"]/text()', namespaces=NS)[0].strip()
        sequence_number = root_elt.xpath('//iptc:transmitId/text()', namespaces=NS)[0].strip()
    except IndexError:
        raise IngestApiError.apiRequestError(Exception('missing minDateTime or transmitId'))
    else:
        update.setdefault('private', {})
        update['private']['min_date_time'] = min_date_time
        update['private']['sequence_number'] = sequence_number
    return [items]
def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    Retries up to 3 times on timeout before giving up.

    :param endpoint: API endpoint to query
    :type endpoint: str
    :param payload: query parameters, mutated in place to add the auth token
    :type payload: dict
    :raises IngestApiError: on request, unicode or parse failures
    :raises LookupError: when the server answers 404
    """
    if payload is None:
        payload = {}
    payload['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)
    if not self.session:
        self.session = requests.Session()
    retries = 0
    while True:
        try:
            response = self.session.get(url, params=payload, timeout=(30, 15))
        except requests.exceptions.Timeout as ex:
            if retries < 3:
                # logging.Logger.warn is deprecated; use warning()
                logger.warning(
                    'Reuters API timeout retrying, retries {}'.format(
                        retries))
                retries += 1
                continue
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError(
                _('Not found {payload}').format(payload=payload))
        break
    try:
        return etree.fromstring(
            response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
class WufooFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) using Wufoo API
    """

    NAME = "wufoo"

    # error descriptions this service can report, shown in the UI
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    label = "Wufoo feed API"

    # configuration fields rendered in the provider settings form
    fields = [
        {
            "id": "wufoo_username",
            "type": "text",
            "label": "Login",
            "placeholder": "Wufoo login",
            "required": True
        },
        {
            "id": "wufoo_api_key",
            "type": "password",
            "label": "API key",
            "placeholder": "Wufoo API Key",
            "required": True,
        },
    ]

    def __init__(self):
        super().__init__()
        # cache keyed per form -- presumably filled by the feed parser;
        # TODO confirm against the wufoo parser implementation
        self.fields_cache = {}

    def _update(self, provider, update):
        """Build the parser input from provider config and parse the feed.

        :param provider: ingest provider configuration
        :param update: provider updates, forwarded to the parser
        :return: list containing one list of parsed items
        :raises IngestApiError: on request failures while preparing the parser
        """
        user = provider["config"]["wufoo_username"]
        wufoo_data = {
            "url": WUFOO_URL.format(subdomain=user),
            "user": user,
            "api_key": provider["config"]["wufoo_api_key"],
            "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
            "update": update,
        }
        try:
            parser = self.get_feed_parser(provider, None)
        except requests.exceptions.Timeout as ex:
            raise IngestApiError.apiTimeoutError(ex, provider)
        except requests.exceptions.TooManyRedirects as ex:
            raise IngestApiError.apiRedirectError(ex, provider)
        except requests.exceptions.RequestException as ex:
            raise IngestApiError.apiRequestError(ex, provider)
        except Exception as error:
            traceback.print_exc()
            # NOTE(review): sibling handlers use the ``provider`` argument,
            # this one uses ``self.provider`` -- confirm it is always set
            raise IngestApiError.apiGeneralError(error, self.provider)
        items = parser.parse(wufoo_data, provider)
        return [items]
def _update(self, provider, update):
    """Fetch the configured feed over HTTP and parse it.

    :param provider: ingest provider configuration
    :param update: updates to apply on the provider (unused here)
    :return: generator yielding lists of parsed items
    :raises IngestApiError: on request failures
    :raises LookupError: when the server answers 404
    """
    updated = utcnow()
    last_updated = provider.get('last_updated')
    ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
    # never look back further than the ingest expiry window
    if not last_updated or last_updated < updated - datetime.timedelta(
            minutes=ttl_minutes):
        last_updated = updated - datetime.timedelta(minutes=ttl_minutes)
    self.provider = provider
    provider_config = provider.get('config')
    if not provider_config:
        provider_config = {}
        provider['config'] = provider_config
    self.URL = provider_config.get('url')
    payload = {}
    parser = self.get_feed_parser(provider)
    try:
        response = requests.get(self.URL, params=payload, timeout=15)
        # TODO: check if file has been updated since provider last_updated
        # although some provider do not include 'Last-Modified' in headers
        # so unsure how to do this
        logger.info('Http Headers: %s', response.headers)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
    if response.status_code == 404:
        raise LookupError('Not found %s' % payload)
    logger.info('Ingesting: %s', str(response.content))
    # dispatch on parser type: XML events, iCalendar, or raw content
    if isinstance(parser, NTBEventXMLFeedParser):
        xml = ET.fromstring(response.content)
        items = parser.parse(xml, provider)
    elif isinstance(parser, IcsTwoFeedParser):
        cal = Calendar.from_ical(response.content)
        items = parser.parse(cal, provider)
    else:
        items = parser.parse(response.content)
    # always yield a list of items
    if isinstance(items, list):
        yield items
    else:
        yield [items]
class WufooFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) using Wufoo API
    """

    NAME = 'wufoo'

    # error descriptions this service can report, shown in the UI
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    label = 'Wufoo feed API'

    # configuration fields rendered in the provider settings form
    fields = [{
        'id': 'wufoo_username',
        'type': 'text',
        'label': 'Login',
        'placeholder': 'Wufoo login',
        'required': True
    }, {
        'id': 'wufoo_api_key',
        'type': 'password',
        'label': 'API key',
        'placeholder': 'Wufoo API Key',
        'required': True
    }]

    parser_restricted_values = ['wufoo']

    def __init__(self):
        # the base class initializer must run as well, otherwise any state
        # set up by FeedingService.__init__ is silently skipped
        super().__init__()
        # cache keyed per form, filled while parsing
        self.fields_cache = {}

    def _update(self, provider, update):
        """Build the parser input from provider config and parse the feed.

        :param provider: ingest provider configuration
        :param update: provider updates, forwarded to the parser
        :return: list containing one list of parsed items
        :raises IngestApiError: on request failures while preparing the parser
        """
        user = provider['config']['wufoo_username']
        wufoo_data = {
            "url": WUFOO_URL.format(subdomain=user),
            "user": user,
            "api_key": provider['config']['wufoo_api_key'],
            "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
            "update": update
        }
        try:
            parser = self.get_feed_parser(provider, None)
        except requests.exceptions.Timeout as ex:
            raise IngestApiError.apiTimeoutError(ex, provider)
        except requests.exceptions.TooManyRedirects as ex:
            raise IngestApiError.apiRedirectError(ex, provider)
        except requests.exceptions.RequestException as ex:
            raise IngestApiError.apiRequestError(ex, provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        items = parser.parse(wufoo_data, provider)
        return [items]
def _update(self, provider, update):
    """Fetch items from the Ritzau-style queue API and acknowledge them.

    :param provider: ingest provider configuration
    :param update: updates to apply on the provider (unused here)
    :return: list containing one list of parsed items
    :raises SuperdeskIngestError: when the configuration is invalid
    :raises IngestApiError: on request or XML errors
    """
    config = self.config
    try:
        user, password = self.config['username'], self.config['password']
    except KeyError:
        # the error must be raised, otherwise execution would continue
        # with undefined credentials
        raise SuperdeskIngestError.notConfiguredError(Exception('username and password are needed'))
    url_override = config.get('url', '').strip()
    # an empty value means "use the default URL"; only validate the
    # override when one is actually configured
    if url_override and not url_override.startswith('http'):
        raise SuperdeskIngestError.notConfiguredError(Exception('if URL is set, it must be a valid http link'))
    if url_override:
        params = {'user': user, 'password': password, 'maksAntal': 50}
    else:
        # the queue endpoint requires each item to be acknowledged
        params = {'user': user, 'password': password, 'maksAntal': 50, 'waitAcknowledge': 'true'}
    # get_url presumably falls back to the default URL when the override
    # is empty -- TODO confirm against HTTPFeedingServiceBase.get_url
    r = self.get_url(url_override, params=params)
    try:
        root_elt = etree.fromstring(r.text)
    except Exception:
        raise IngestApiError.apiRequestError(Exception('error while parsing the request answer'))
    # the API reports failures in-band via an <error> element
    try:
        if root_elt.xpath('(//error/text())[1]')[0] != '0':
            err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
            raise IngestApiError.apiRequestError(Exception('error code returned by API: {msg}'.format(msg=err_msg)))
    except IndexError:
        raise IngestApiError.apiRequestError(Exception('Invalid XML, <error> element not found'))
    parser = self.get_feed_parser(provider)
    items = []
    for elt in root_elt.xpath('//RBNews'):
        item = parser.parse(elt, provider)
        items.append(item)
        if not url_override:
            # acknowledge the item so it is removed from the remote queue
            try:
                queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
            except IndexError:
                raise IngestApiError.apiRequestError(Exception('missing ServiceQueueId element'))
            ack_params = {'user': user, 'password': password, 'servicequeueid': queue_id}
            self.get_url(URL_ACK, params=ack_params)
    return [items]
def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    Retries up to 3 times on timeout before giving up.

    :param endpoint: API endpoint to query
    :type endpoint: str
    :param payload: query parameters, mutated in place to add the auth token
    :type payload: dict
    :raises IngestApiError: on request, unicode or parse failures
    :raises LookupError: when the server answers 404
    """
    if payload is None:
        payload = {}
    payload['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)
    if not self.session:
        self.session = requests.Session()
    retries = 0
    while True:
        try:
            response = self.session.get(url, params=payload, timeout=(30, 15))
        except requests.exceptions.Timeout as ex:
            if retries < 3:
                # logging.Logger.warn is deprecated; use warning()
                logger.warning('Reuters API timeout retrying, retries {}'.format(retries))
                retries += 1
                continue
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)
        break
    try:
        return etree.fromstring(response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
def test_raise_apiRequestError(self):
    """apiRequestError carries code 4003, its message, and the cause, and logs once."""
    with assert_raises(IngestApiError) as error_context:
        ex = Exception("Testing apiRequestError")
        raise IngestApiError.apiRequestError(ex, self.provider)
    exception = error_context.exception
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(exception.code, 4003)
    self.assertEqual(exception.message, "API ingest has request error")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0],
                     "Testing apiRequestError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "IngestApiError Error 4003 - API ingest has request error: "
                     "Testing apiRequestError on channel TestProvider")
class NewsworthyFeedingService(FeedingService):
    """
    Feeding Service class which can retrieve articles from Newsworthy web service
    """

    NAME = 'newsworthy'

    # error descriptions this service can report, shown in the UI
    ERRORS = [IngestApiError.apiRequestError().get_error_description(),
              SuperdeskIngestError.notConfiguredError().get_error_description()]

    label = 'Newsworthy'

    # configuration fields rendered in the provider settings form
    fields = [
        {
            'id': 'url',
            'type': 'text',
            'label': 'Use this URL for webhook',
            'default_value': '',
            'readonly': True,
        },
        {
            'id': 'username',
            'type': 'text',
            'label': 'Username',
            'required': True
        },
        {
            'id': 'password',
            'type': 'password',
            'label': 'Password',
            'required': True
        },
        {
            'id': 'secret',
            'type': 'password',
            'label': 'Shared Secret',
            'placeholder': 'Shared Secret',
            'required': False
        },
    ]

    def _update(self, provider, update):
        """Parse webhook data previously attached to the provider.

        :param provider: ingest provider configuration, expected to carry
            ``newsworthy_data`` set by the webhook endpoint
        :param update: updates to apply on the provider (unused here)
        :return: list containing one list of parsed items
        """
        try:
            data = provider['newsworthy_data']
        except KeyError:
            # dict access raises KeyError, not IndexError; catching the
            # wrong type would let the KeyError crash the update instead
            # of treating "no webhook data" as an empty run
            return [[]]
        if data['hook']['event'] == EVENT_UNPUBLISHED:
            logger.info("ignoring unpublish event on following data:\n{data}".format(data=data))
            return [[]]
        # we have to write to a temporary file because feed parser expect a file path
        # FIXME: it would be better to use the data directly
        with NamedTemporaryFile('w') as f:
            json.dump(data['data'], f)
            f.seek(0)
            parser = self.get_feed_parser(provider, f.name)
            items = parser.parse(f.name, provider)
        return [items]
def test_raise_apiRequestError(self):
    """apiRequestError carries code 4003, its message, and the cause, and logs once."""
    with assert_raises(IngestApiError) as error_context:
        ex = Exception("Testing apiRequestError")
        raise IngestApiError.apiRequestError(ex, self.provider)
    exception = error_context.exception
    # assertEqual reports both values on failure, unlike assertTrue(a == b);
    # assertEquals is a deprecated alias removed in recent Python versions
    self.assertEqual(exception.code, 4003)
    self.assertEqual(exception.message, "API ingest has request error")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0],
                     "Testing apiRequestError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(
        self.mock_logger_handler.messages['error'][0],
        "IngestApiError Error 4003 - API ingest has request error: "
        "Testing apiRequestError on channel TestProvider")
class EventHTTPFeedingService(HTTPFeedingServiceBase):
    """
    Feeding Service class which can read events using HTTP
    """

    NAME = 'event_http'
    label = 'Event HTTP feed'
    service = 'events'

    # configuration fields rendered in the provider settings form
    fields = [
        {
            'id': 'url',
            'type': 'text',
            'label': 'Feed URL',
            'placeholder': 'Feed URL',
            'required': True
        }
    ]

    # error descriptions this service can report, shown in the UI
    ERRORS = [IngestApiError.apiTimeoutError().get_error_description(),
              IngestApiError.apiRedirectError().get_error_description(),
              IngestApiError.apiRequestError().get_error_description(),
              IngestApiError.apiUnicodeError().get_error_description(),
              IngestApiError.apiParseError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description()]

    # this feed needs no HTTP authentication
    HTTP_AUTH = False

    def _update(self, provider, update):
        """
        Fetch events from external API.

        :param provider: Ingest Provider Details.
        :type provider: dict
        :param update: Any update that is required on provider.
        :type update: dict
        :return: a list of events which can be saved.
        """
        response = self.get_url(self.config['url'])
        parser = self.get_feed_parser(provider)
        logger.info('Ingesting events with {} parser'.format(parser.__class__.__name__))
        # content is truncated in the log to avoid flooding it
        logger.info('Ingesting content: {} ...'.format(str(response.content)[:4000]))
        # prefer the parser's HTTP-specific entry point when it has one
        if hasattr(parser, 'parse_http'):
            items = parser.parse_http(response.content, provider)
        else:
            items = parser.parse(response.content)
        # always yield a list of items
        if isinstance(items, list):
            yield items
        else:
            yield [items]
def _get_tree(self, endpoint, payload=None):
    """
    Get xml response for given API endpoint and payload.

    :param endpoint: API endpoint to query
    :type endpoint: str
    :param payload: query parameters, mutated in place to add the auth token
    :type payload: dict
    :raises IngestApiError: on request, unicode or parse failures
    :raises LookupError: when the server answers 404
    """
    if payload is None:
        payload = {}
    payload['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)
    try:
        response = requests.get(url, params=payload, timeout=15)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
    if response.status_code == 404:
        raise LookupError('Not found %s' % payload)
    try:
        return etree.fromstring(
            response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
def _get_tree(self, endpoint, payload=None):
    """Fetch an API endpoint and return the parsed XML root element.

    :param endpoint: endpoint path appended to the provider base URL
    :type endpoint: str
    :param payload: query parameters, mutated in place to add the auth token
    :type payload: dict
    :raises IngestApiError: on request, unicode or parse failures
    :raises LookupError: when the server answers 404
    """
    if payload is None:
        payload = {}
    payload['token'] = self._get_auth_token(self.provider, update=True)
    target_url = self._get_absolute_url(endpoint)
    try:
        api_response = requests.get(target_url, params=payload, timeout=15)
    except requests.exceptions.Timeout as ex:
        # a retry loop could be set up here
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # the URL redirects endlessly, report it as a bad URL
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # any other transport level failure is fatal for this request
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
    if api_response.status_code == 404:
        raise LookupError('Not found %s' % payload)
    try:
        # parsing bytes instead of text is a workaround for the http mock lib
        return etree.fromstring(api_response.content)
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
def _update(self, provider, update):
    """Fetch items from the Ritzau-style queue API and acknowledge them.

    :param provider: ingest provider configuration
    :param update: updates to apply on the provider (unused here)
    :return: list containing one list of parsed items
    :raises SuperdeskIngestError: when the configuration is invalid
    :raises IngestApiError: on request or XML errors
    """
    try:
        config = provider['config']
        user = config['username']
        password = config['password']
    except KeyError:
        # the error must be raised, otherwise execution would continue
        # with undefined config/credentials and crash with a NameError
        raise SuperdeskIngestError.notConfiguredError(Exception('username and password are needed'))
    url_override = config.get('url', '').strip()
    # an empty value means "use the default URL"; only validate the
    # override when one is actually configured
    if url_override and not url_override.startswith('http'):
        raise SuperdeskIngestError.notConfiguredError(Exception('if URL is set, it must be a valid http link'))
    if url_override:
        params = {'user': user, 'password': password, 'maksAntal': 50}
    else:
        # the queue endpoint requires each item to be acknowledged
        params = {'user': user, 'password': password, 'maksAntal': 50, 'waitAcknowledge': 'true'}
    try:
        r = requests.get(url_override or URL, params=params)
    except Exception:
        raise IngestApiError.apiRequestError(Exception('error while doing the request'))
    try:
        root_elt = etree.fromstring(r.text)
    except Exception:
        raise IngestApiError.apiRequestError(Exception('error while parsing the request answer'))
    # the API reports failures in-band via an <error> element
    try:
        if root_elt.xpath('(//error/text())[1]')[0] != '0':
            err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
            raise IngestApiError.apiRequestError(Exception('error code returned by API: {msg}'.format(msg=err_msg)))
    except IndexError:
        raise IngestApiError.apiRequestError(Exception('Invalid XML, <error> element not found'))
    parser = self.get_feed_parser(provider)
    items = []
    for elt in root_elt.xpath('//RBNews'):
        item = parser.parse(elt, provider)
        items.append(item)
        if not url_override:
            # acknowledge the item so it is removed from the remote queue
            try:
                queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
            except IndexError:
                raise IngestApiError.apiRequestError(Exception('missing ServiceQueueId element'))
            ack_params = {'user': user, 'password': password, 'servicequeueid': queue_id}
            try:
                requests.get(URL_ACK, params=ack_params)
            except Exception:
                raise IngestApiError.apiRequestError(Exception('error while doing the request'))
    return [items]
def _request(self, url):
    """GET the given URL and return the raw response body.

    :param url: absolute URL to fetch
    :type url: str
    :return: raw response content (bytes)
    :raises IngestApiError: on timeout, redirect loop or transport error
    :raises LookupError: when the server answers 404
    """
    try:
        response = requests.get(url, params={}, timeout=120)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
    if response.status_code == 404:
        raise LookupError('Not found')
    return response.content
def _request(self, url):
    """Fetch *url* with a plain GET and hand back the response bytes.

    :param url: absolute URL to fetch
    :type url: str
    :return: raw response content (bytes)
    :raises IngestApiError: on timeout, redirect loop or transport error
    :raises LookupError: when the server answers 404
    """
    try:
        resp = requests.get(url, params={}, timeout=120)
    except requests.exceptions.Timeout as ex:
        # a retry loop could be set up here
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # the URL redirects endlessly, report it as a bad URL
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # any other transport level failure is fatal for this request
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
    if resp.status_code == 404:
        raise LookupError('Not found')
    return resp.content
def _update(self, provider, update):
    """Build the parser input from provider config and parse the Wufoo feed.

    :param provider: ingest provider configuration
    :param update: provider updates, forwarded to the parser
    :return: list containing one list of parsed items
    :raises IngestApiError: on request failures while preparing the parser
    """
    user = provider['config']['wufoo_username']
    wufoo_data = {
        "url": WUFOO_URL.format(subdomain=user),
        "user": user,
        "api_key": provider['config']['wufoo_api_key'],
        "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
        "update": update}
    try:
        parser = self.get_feed_parser(provider, None)
    except requests.exceptions.Timeout as ex:
        raise IngestApiError.apiTimeoutError(ex, provider)
    except requests.exceptions.TooManyRedirects as ex:
        raise IngestApiError.apiRedirectError(ex, provider)
    except requests.exceptions.RequestException as ex:
        raise IngestApiError.apiRequestError(ex, provider)
    except Exception as error:
        traceback.print_exc()
        # use the ``provider`` argument like the other handlers do;
        # ``self.provider`` may not be set when this service runs
        raise IngestApiError.apiGeneralError(error, provider)
    items = parser.parse(wufoo_data, provider)
    return [items]
def _update(self, provider, update):
    """Assemble the Wufoo parser input and return the parsed items.

    :param provider: ingest provider configuration
    :param update: provider updates, forwarded to the parser
    :return: list containing one list of parsed items
    :raises IngestApiError: on request failures while preparing the parser
    """
    login = provider['config']['wufoo_username']
    parser_input = {
        "url": WUFOO_URL.format(subdomain=login),
        "user": login,
        "api_key": provider['config']['wufoo_api_key'],
        "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
        "update": update,
    }
    try:
        feed_parser = self.get_feed_parser(provider, None)
    except requests.exceptions.Timeout as ex:
        raise IngestApiError.apiTimeoutError(ex, provider)
    except requests.exceptions.TooManyRedirects as ex:
        raise IngestApiError.apiRedirectError(ex, provider)
    except requests.exceptions.RequestException as ex:
        raise IngestApiError.apiRequestError(ex, provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
    return [feed_parser.parse(parser_input, provider)]
def get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    :param endpoint: API endpoint to query
    :type endpoint: str
    :param payload: query parameters, mutated in place to add the auth token
    :type payload: dict
    :raises IngestApiError: on request, unicode or parse failures
    """
    if payload is None:
        payload = {}
    payload['token'] = self.get_token()
    url = self.get_url(endpoint)
    try:
        response = requests.get(url, params=payload, timeout=21.0)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        # use the apiGeneralError factory: calling the bare constructor
        # passes the exception where an error code is expected
        raise IngestApiError.apiGeneralError(error, self.provider)
    if response.status_code == 404:
        raise IngestApiError.apiNotFoundError(
            LookupError('Not found %s' % payload), self.provider)
    try:
        # workaround for httmock lib
        # return etree.fromstring(response.text.encode('utf-8'))
        return etree.fromstring(response.content)
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
def get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    :param endpoint: API endpoint to query
    :type endpoint: str
    :param payload: query parameters, mutated in place to add the auth token
    :type payload: dict
    :raises IngestApiError: on request, unicode or parse failures
    """
    if payload is None:
        payload = {}
    payload['token'] = self.get_token()
    url = self.get_url(endpoint)
    try:
        response = requests.get(url, params=payload, timeout=21.0)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        # use the apiGeneralError factory: calling the bare constructor
        # passes the exception where an error code is expected
        raise IngestApiError.apiGeneralError(error, self.provider)
    if response.status_code == 404:
        raise IngestApiError.apiNotFoundError(LookupError('Not found %s' % payload), self.provider)
    try:
        # workaround for httmock lib
        # return etree.fromstring(response.text.encode('utf-8'))
        return etree.fromstring(response.content)
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by Reuters.
    """

    NAME = 'reuters_http'

    # error descriptions this service can report, shown in the UI
    ERRORS = [IngestApiError.apiTimeoutError().get_error_description(),
              IngestApiError.apiRedirectError().get_error_description(),
              IngestApiError.apiRequestError().get_error_description(),
              IngestApiError.apiUnicodeError().get_error_description(),
              IngestApiError.apiParseError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description()]

    # date format used by the Reuters dateRange query parameter
    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def _update(self, provider):
        """Poll every subscribed channel and yield newly published items.

        :param provider: ingest provider configuration
        :return: generator yielding lists of fetched items
        """
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        # never look back further than the ingest expiry window
        if not last_updated or last_updated < updated - datetime.timedelta(minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)
        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config
        # fall back to the default Reuters endpoints when not configured
        if 'url' not in provider_config:
            provider_config['url'] = 'http://rmb.reuters.com/rmd/rest/xml'
        if 'auth_url' not in provider_config:
            provider_config['auth_url'] = 'https://commerce.reuters.com/rmd/rest/xml/login'
        self.URL = provider_config.get('url')
        for channel in self._get_channels():
            for guid in self._get_article_ids(channel, last_updated, updated):
                items = self.fetch_ingest(guid)
                if items:
                    yield items

    def _get_channels(self):
        """Get subscribed channels."""
        channels = []
        tree = self._get_tree('channels')
        for channel in tree.findall('channelInformation'):
            channels.append(channel.find('alias').text)
        return channels

    def _get_tree(self, endpoint, payload=None):
        """
        Get xml response for given API endpoint and payload.

        :param endpoint: API endpoint to query
        :type endpoint: str
        :param payload: query parameters, mutated in place to add the auth token
        :type payload: dict
        :raises IngestApiError: on request, unicode or parse failures
        :raises LookupError: when the server answers 404
        """
        if payload is None:
            payload = {}
        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)
        try:
            response = requests.get(url, params=payload, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)
        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """
        Get absolute URL for given endpoint.

        :param endpoint: endpoint path appended to the provider base URL
        :type endpoint: str
        """
        return '/'.join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """
        Get article ids which should be upserted.

        :param channel: channel alias to query
        :param last_updated: start of the date range
        :param updated: end of the date range
        :return: set of article guids
        """
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id',
                   'dateRange': "%s-%s" % (self._format_date(last_updated), self._format_date(updated))}
        logger.info('Reuters requesting Date Range |{}| for channel {}'.format(payload['dateRange'], channel))
        tree = self._get_tree('items', payload)
        for result in tree.findall('result'):
            ids.add(result.find('guid').text)
        return ids

    def _format_date(self, date):
        # render a datetime in the Reuters dateRange format
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, guid):
        """Fetch the item for *guid* plus everything referenced by its packages.

        Returns an empty list as soon as any referenced item cannot be found.
        """
        items = self._parse_items(guid)
        result_items = []
        while items:
            item = items.pop()
            self.add_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, guid):
        """
        Parse item message and return given items.
        """
        payload = {'id': guid}
        tree = self._get_tree('item', payload)
        parser = self.get_feed_parser(self.provider, tree)
        items = parser.parse(tree, self.provider)
        return items

    def _fetch_items_in_package(self, item):
        """
        Fetch remote assets for given item.
        """
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self._parse_items(ref.get('residRef')))
        return items
class HTTPFeedingService(FeedingService, metaclass=ABCMeta):
    """
    Feeding Service class which can read article(s) using HTTP.

    Handles authentication against the provider's auth endpoint and caches
    the resulting token in the ingest provider record.
    """

    ERRORS = [IngestApiError.apiTimeoutError().get_error_description(),
              IngestApiError.apiRedirectError().get_error_description(),
              IngestApiError.apiRequestError().get_error_description(),
              IngestApiError.apiUnicodeError().get_error_description(),
              IngestApiError.apiParseError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description()]

    label = 'HTTP'

    def __init__(self):
        super().__init__()
        self.token = None

    def _generate_token_and_update_provider(self, provider):
        """
        Generates Authentication Token and updates the given provider
        with the authentication token.

        :param provider: dict - Ingest provider details to which the current
            directory has been configured
        :type provider: dict
            :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: Authentication Token
        :rtype: str
        """
        token = {'auth_token': self._generate_auth_token(provider),
                 'created': utcnow()}
        # persist the token on the provider record so later polls can reuse it
        get_resource_service('ingest_providers').system_update(
            provider[config.ID_FIELD],
            updates={'tokens': token},
            original=provider)
        provider['tokens'] = token
        return token['auth_token']

    def _generate_auth_token(self, provider):
        """
        Generates Authentication Token as per the configuration in Ingest
        Provider.

        :param provider: dict - Ingest provider details to which the current
            directory has been configured
        :type provider: dict
            :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: token details if successfully authenticated
        :rtype: str
        :raises: IngestApiError.apiGeneralError() if auth_url is missing in
            the Ingest Provider configuration
        """
        session = requests.Session()
        session.mount('https://', SSLAdapter())
        auth_url = provider.get('config', {}).get('auth_url', None)
        if not auth_url:
            raise IngestApiError.apiGeneralError(
                provider=provider,
                exception=KeyError(
                    '''
                Ingest Provider {} is missing Authentication URL.
                Please check the configuration.
                '''.format(provider['name']))
            )
        payload = {
            'username': provider.get('config', {}).get('username', ''),
            'password': provider.get('config', {}).get('password', ''),
        }
        # NOTE(review): verify=False disables TLS certificate validation and the
        # credentials travel in the query string — confirm this is intended.
        response = session.get(auth_url, params=payload, verify=False,
                               timeout=30)
        if response.status_code < 200 or response.status_code >= 300:
            try:
                response.raise_for_status()
            except Exception:
                # any non-2xx response is treated as an auth failure and the
                # provider is force-closed to stop further polling
                err = IngestApiError.apiAuthError(provider=provider)
                self.close_provider(provider, err, force=True)
                raise err
        # parse raw bytes, not .text: workaround for http mock lib
        tree = etree.fromstring(response.content)
        return tree.text

    def _is_valid_token(self, token):
        """Check if the given token is still valid.

        Most of authentication tokens issued by Ingest Providers are valid
        for 12 hours.

        :param token: Token information
        :type token: dict
        :return: True if valid, False otherwise
        :rtype: bool
        """
        ttl = timedelta(hours=12)
        created = arrow.get(token.get('created')).datetime
        return created + ttl >= utcnow() and token.get('auth_token')

    def _get_auth_token(self, provider, update=False):
        """
        Gets authentication token for given provider instance and save it in
        db based on the given update flag.

        :param provider: dict - Ingest provider details to which the current
            directory has been configured
        :type provider: dict
            :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :param update: a flag which dictates whether to save the
            authentication token in Ingest Provider record or not. Saves if
            the value is True, defaults to False.
        :type update: bool
        :return: Authentication Token
        :rtype: str
        """
        token = provider.get('tokens')
        if token and self._is_valid_token(token):
            return token.get('auth_token')
        # no valid cached token: either mint a new one (update=True) or
        # return an empty string, leaving the stored record untouched
        return self._generate_token_and_update_provider(provider) if update else ''
class AAPSportsHTTPFeedingService(HTTPFeedingService):
    """Feeding service that walks the AAP sports-results API.

    Logs in, then drills down sport -> competition -> season and yields
    parsed fixture batches for the current and next year.
    """

    label = 'AAP Sports Results Feed'
    NAME = 'aap_sports_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    """
    Defines the collection service to be used with this ingest feeding service.
    """
    service = 'events'

    fields = [
        {
            'id': 'login_url',
            'type': 'text',
            'label': 'Login Url',
            'placeholder': 'Login Url',
            'required': True,
            'errors': {
                4006: 'Server not found.',
                4000: 'Unexpected server response'
            }
        },
        {
            'id': 'fixtures_url',
            'type': 'text',
            'label': 'Fixtures Url',
            'placeholder': 'Fixtures Url',
            'required': True
        },
        {
            'id': 'username',
            'type': 'text',
            'label': 'Username',
            'placeholder': 'Username',
            'required': True
        },
        {
            'id': 'password',
            'type': 'password',
            'label': 'Password',
            'placeholder': 'Password',
            'required': True,
            'errors': {
                4007: 'Authentication error.'
            }
        },
        {
            'id': 'sports',
            'type': 'text',
            'label': 'Sports',
            'placeholder': 'Comma separate list of sports ids',
            'required': True,
            'default': '1,2,3,4,10'
        },
    ]

    def _update(self, provider, update):
        """Generator: yields lists of parsed fixture items per season.

        :param provider: ingest provider record (config holds urls/credentials)
        :param update: provider update dict (not written to here)
        """
        self.provider = provider
        parser = self.get_feed_parser(provider)
        # get the current year, it is used to filter fixtures for this year and next
        year = int(utcnow().year) % 100
        config = provider.get('config', {})
        # login_url template embeds username/password as positional args
        content = self._request(
            config.get('login_url').format(config.get('username'),
                                           config.get('password')))
        # get the configured sports
        configured_sports = config.get('sports').split(',')
        xml = ET.fromstring(content)
        if xml.attrib['Status_Code'] == 'OK':
            # session id returned by the login call is required by all
            # subsequent fixtures_url requests
            session = xml.attrib['Status_Session']
            content = self._request(
                config.get('fixtures_url').format(session, '', '', ''))
            xml = ET.fromstring(content)
            for s in xml.findall('.//Sports/Sport'):
                sport_id = s.attrib['SportID']
                if sport_id not in configured_sports:
                    continue
                sport_name = s.attrib['SportName']
                content = self._request(
                    config.get('fixtures_url').format(session, sport_id, '', ''))
                sport_xml = ET.fromstring(content)
                for c in sport_xml.findall('.//Competition'):
                    comp_id = c.attrib.get('Comp_ID')
                    comp_name = c.attrib.get('Comp_Name')
                    content = self._request(
                        config.get('fixtures_url').format(
                            session, sport_id, comp_id, ''))
                    comp_xml = ET.fromstring(content)
                    for season in comp_xml.findall('.//Season'):
                        season_id = season.attrib.get('SeasonID')
                        # only seasons mentioning the current or next 2-digit year
                        if str(year) in season_id or str(year + 1) in season_id:
                            content = self._request(
                                config.get('fixtures_url').format(
                                    session, sport_id, comp_id, season_id))
                            fixture_xml = ET.fromstring(content)
                            logger.info('Parsing {}/{} {}/{}'.format(
                                sport_id, sport_name, comp_id, comp_name))
                            items = parser.parse(
                                {
                                    'fixture_xml': fixture_xml,
                                    'sport_id': sport_id,
                                    'sport_name': sport_name,
                                    'comp_name': comp_name,
                                    'comp_id': comp_id
                                }, provider)
                            if len(items) > 0:
                                yield items

    def _request(self, url):
        """GET the url, mapping request failures to IngestApiError.

        :return: raw response body (bytes)
        :raises LookupError: on HTTP 404
        """
        try:
            response = requests.get(url, params={}, timeout=120)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError('Not found')
        return response.content
class RitzauFeedingService(FeedingService):
    """
    Feeding Service class which can retrieve articles from Ritzau web service
    """

    NAME = 'ritzau'

    ERRORS = [
        IngestApiError.apiRequestError().get_error_description(),
        SuperdeskIngestError.notConfiguredError().get_error_description()
    ]

    def _update(self, provider, update):
        """Poll the Ritzau web service and return newly published items.

        When no URL override is configured, the default queue endpoint is
        used and every received item is acknowledged via URL_ACK so it is
        not delivered again.

        :param provider: ingest provider record (config holds credentials)
        :param update: provider update dict (not written to here)
        :return: a list containing one list of parsed items
        :raises SuperdeskIngestError: when credentials or URL are misconfigured
        :raises IngestApiError: on request/parse/API errors
        """
        try:
            config = provider['config']
            user = config['username']
            password = config['password']
        except KeyError:
            # BUGFIX: the error was previously constructed but never raised,
            # so execution continued and hit a NameError on user/password below
            raise SuperdeskIngestError.notConfiguredError(
                Exception('username and password are needed'))
        url_override = config.get('url', '').strip()
        # BUGFIX: only validate when a URL is actually set; an empty value is
        # legitimate and means "use the default URL" (see request below)
        if url_override and not url_override.startswith('http'):
            raise SuperdeskIngestError.notConfiguredError(
                Exception('if URL is set, it must be a valid http link'))
        if url_override:
            params = {'user': user, 'password': password, 'maksAntal': 50}
        else:
            # the default queue endpoint requires explicit acknowledgement
            params = {
                'user': user,
                'password': password,
                'maksAntal': 50,
                'waitAcknowledge': 'true'
            }
        try:
            r = requests.get(url_override or URL, params=params)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while doing the request'))
        try:
            root_elt = etree.fromstring(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while parsing the request answer'))
        try:
            # the service reports failures inside the payload: <error> != "0"
            if root_elt.xpath('(//error/text())[1]')[0] != '0':
                err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
                raise IngestApiError.apiRequestError(
                    Exception('error code returned by API: {msg}'.format(
                        msg=err_msg)))
        except IndexError:
            raise IngestApiError.apiRequestError(
                Exception('Invalid XML, <error> element not found'))
        parser = self.get_feed_parser(provider)
        items = []
        for elt in root_elt.xpath('//RBNews'):
            item = parser.parse(elt, provider)
            items.append(item)
            if not url_override:
                # acknowledge reception of this item so it is not re-sent
                try:
                    queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
                except IndexError:
                    raise IngestApiError.apiRequestError(
                        Exception('missing ServiceQueueId element'))
                ack_params = {
                    'user': user,
                    'password': password,
                    'servicequeueid': queue_id
                }
                try:
                    requests.get(URL_ACK, params=ack_params)
                except Exception:
                    raise IngestApiError.apiRequestError(
                        Exception('error while doing the request'))
        return [items]
def _update(self, provider, update):
    """Poll the AP Media API and transfer new pictures to a configured FTP server.

    Uses the stored ``next_link`` for incremental polling when available,
    otherwise builds a query from the configured product list and recovery
    window. Pictures found in the response are downloaded and stored on the
    configured FTP server; per-item failures are logged and swallowed.

    :param provider: ingest provider record (config holds api/ftp settings)
    :param update: dict updated with the new provider config (next_link,
        sequence counter) for the next poll
    :return: [] when nothing new, otherwise None (items are not ingested here)
    """
    self.HTTP_URL = provider.get('config', {}).get('api_url', '')
    self.provider = provider

    # Set the apikey parameter we're going to use it on all calls
    params = dict()
    params['apikey'] = provider.get('config', {}).get('apikey')

    # Use the next link if one is available in the config
    if provider.get('config', {}).get('next_link'):
        r = self.get_url(url=provider.get('config', {}).get('next_link'),
                         params=params,
                         verify=False,
                         allow_redirects=True)
        r.raise_for_status()
    else:
        id_list = provider.get('config', {}).get('productList', '').strip()
        recovery_time = provider.get('config', {}).get('recoverytime',
                                                       '1').strip()
        if recovery_time == '':
            recovery_time = '1'
        start = (utcnow() -
                 timedelta(hours=int(recovery_time))).isoformat()[:19] + 'Z'
        # If there has been a list of products defined then we format them for
        # the request, if not all allowed products will be returned.
        if id_list:
            # we remove spaces and empty values from id_list to do a clean list
            id_list = ' OR '.join(
                [id_.strip() for id_ in id_list.split(',') if id_.strip()])
            params['q'] = 'productid:(' + id_list + ') AND mindate:>{}'.format(
                start)
        else:
            params['q'] = 'mindate:>{}'.format(start)
        params['page_size'] = '100'
        r = self.get_url(params=params, verify=False, allow_redirects=True)
        r.raise_for_status()
    try:
        response = json.loads(r.text)
    except Exception:
        raise IngestApiError.apiRequestError(
            Exception('error parsing response'))

    nextLink = response.get('data', {}).get('next_page')
    # Got the same next link as last time so nothing new
    if nextLink == provider.get('config', {}).get('next_link'):
        logger.info('Nothing new from AP Media')
        return []

    # BUGFIX: initialise the sequence counter up front. It was previously
    # only assigned inside the "new items" branch, so an empty item list
    # caused a NameError at the str(sequence_number) save below.
    try:
        sequence_number = int(
            provider.get('config', {}).get('sequence', 0) or 0)
    except (TypeError, ValueError):
        sequence_number = 0

    if len(response.get('data', {}).get('items', [])) > 0:
        try:
            with ftp_connect({
                    'username': provider.get('config', {}).get('ftp_user', ''),
                    'password': provider.get('config', {}).get('ftp_password', ''),
                    'host': provider.get('config', {}).get('ftp_server', ''),
                    'path': provider.get('config', {}).get('ftp_path', '')
            }) as ftp:
                for item in response.get('data', {}).get('items', []):
                    try:
                        if item['item']['type'] == 'picture':
                            image_ref = item['item']['renditions']['main'][
                                'href']
                            if provider.get('config', {}).get(
                                    'filenametemplate', '') == '':
                                filename = to_ascii(
                                    item['item']['renditions']['main']
                                    ['originalfilename'])
                            else:
                                # The filename is generated by applying the
                                # date format string in the template
                                filename = datetime.now().strftime(
                                    provider.get('config', {}).get(
                                        'filenametemplate', ''))
                                # and appending the sequence number
                                filename += '-' + str(
                                    sequence_number).zfill(4) + '.jpg'
                                sequence_number = (sequence_number + 1) % 10000
                            logger.info(
                                'file: {} versioncreated: {}'.format(
                                    filename,
                                    item['item']['versioncreated']))
                            r = requests.get(url=image_ref,
                                             params={
                                                 'apikey': provider.get(
                                                     'config',
                                                     {}).get('apikey')
                                             })
                            r.raise_for_status()
                            try:
                                ftp.storbinary('STOR {}'.format(filename),
                                               BytesIO(r.content))
                            except ftplib.all_errors as e:
                                logger.error(e)
                    # Any exception processing an individual item is swallowed
                    except Exception as ex:
                        logger.exception(ex)
        except Exception as ex:
            logger.exception(ex)

    # Save the link for next time
    upd_provider = provider.get('config')
    upd_provider['next_link'] = nextLink
    upd_provider['recoverytime'] = ''
    upd_provider['sequence'] = str(sequence_number)
    update['config'] = upd_provider
    return None
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by
    Reuters.

    Uses per-channel poll tokens (persisted on the provider record) when
    available, falling back to a date-range query.
    """

    NAME = 'reuters_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    # strftime pattern expected by the Reuters REST API for date ranges
    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    # NOTE(review): signature lacks the `update` parameter the other services'
    # _update methods take — confirm the caller supports this form.
    def _update(self, provider):
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        # never look further back than the ingest expiry window
        if not last_updated or last_updated < updated - datetime.timedelta(
                minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)
        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config
        if 'url' not in provider_config:
            provider_config['url'] = 'http://rmb.reuters.com/rmd/rest/xml'
        if 'auth_url' not in provider_config:
            provider_config[
                'auth_url'] = 'https://commerce.reuters.com/rmd/rest/xml/login'
        self.URL = provider_config.get('url')

        for channel in self._get_channels():
            ids = self._get_article_ids(channel, last_updated, updated)
            for id in ids:
                try:
                    items = self.fetch_ingest(id)
                    if items:
                        yield items
                # if there was an exception processing the one of the bunch
                # log it and continue
                except Exception as ex:
                    logger.warn(
                        'Reuters item {} has not been retrieved'.format(id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get subscribed channels."""
        channels = []
        tree = self._get_tree('channels')
        for channel in tree.findall('channelInformation'):
            channels.append(channel.find('alias').text)
        return channels

    def _get_tree(self, endpoint, payload=None):
        """
        Get xml response for given API endpoint and payload.

        :param: endpoint
        :type endpoint: str
        :param: payload
        :type payload: str
        """
        if payload is None:
            payload = {}
        # every request carries a fresh auth token
        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)
        try:
            response = requests.get(url, params=payload, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)
        try:
            return etree.fromstring(
                response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """
        Get absolute URL for given endpoint.

        :param: endpoint
        :type endpoint: str
        """
        return '/'.join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """
        Get article ids which should be upserted also save the poll token
        that is returned.
        """
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id'}
        # check if the channel has a pollToken if not fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info(
                "Reuters requesting channel {} with poll token {}".format(
                    channel, last_poll_token))
            payload['pollToken'] = last_poll_token
        else:
            payload['dateRange'] = "%s-%s" % (self._format_date(last_updated),
                                              self._format_date(updated))
            logger.info(
                "Reuters requesting channel {} with dateRange {}".format(
                    channel, payload['dateRange']))
        tree = self._get_tree('items', payload)
        # status element location depends on the root tag returned by the API
        status_code = tree.find('status').get(
            'code') if tree.tag == 'results' else tree.get('code')
        # check the returned status
        if status_code != '10':
            logger.warn(
                "Reuters channel request returned status code {}".format(
                    status_code))
            # status code 30 indicates failure
            if status_code == '30':
                # invalid token
                logger.warn("Reuters error on channel {} code {} {}".format(
                    channel,
                    tree.find('error').get('code'),
                    tree.find('error').text))
                if tree.find('error').get('code') == '2100':
                    # error 2100: the stored poll token is no longer valid,
                    # drop it so the next poll falls back to dateRange
                    self._save_poll_token(channel, None)
                    logger.warn(
                        "Reuters channel invalid token reseting {}".format(
                            status_code))
                return ids
        # extract the returned poll token if there is one
        poll_token = tree.find('pollToken')
        if poll_token is not None:
            # a new token indicated new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(
                    channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info(
                "Reuters channel {} retrieved no token".format(channel))
            return ids
        for result in tree.findall('result'):
            id = result.find('id').text
            ids.add(id)
            logger.info("Reuters id : {}".format(id))
        return ids

    def _save_poll_token(self, channel, poll_token):
        """
        Saves the poll token for the passed channel in the config section of
        the provider record.

        :param channel:
        :param poll_token:
        :return:
        """
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service(
            'ingest_providers')
        provider = ingest_provider_service.find_one(
            req=None, _id=self.provider[superdesk.config.ID_FIELD])
        provider_token = provider.get('tokens')
        if 'poll_tokens' not in provider_token:
            provider_token['poll_tokens'] = {channel: poll_token}
        else:
            provider_token['poll_tokens'][channel] = poll_token
        upd_provider = {'tokens': provider_token}
        ingest_provider_service.system_update(
            self.provider[superdesk.config.ID_FIELD], upd_provider,
            self.provider)

    def _get_poll_token(self, channel):
        """
        Get the poll token from provider config if it is available.

        :param channel:
        :return: token
        """
        if 'tokens' in self.provider and 'poll_tokens' in self.provider[
                'tokens']:
            return self.provider.get('tokens').get('poll_tokens').get(
                channel, None)

    def _format_date(self, date):
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        """Fetch the item for id plus everything referenced by its packages."""
        items = self._parse_items(id)
        result_items = []
        while items:
            item = items.pop()
            self.add_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                # a missing package member invalidates the whole batch
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, id):
        """
        Parse item message and return given items.
        """
        payload = {'id': id}
        tree = self._get_tree('item', payload)
        parser = self.get_feed_parser(self.provider, tree)
        items = parser.parse(tree, self.provider)
        return items

    def _fetch_items_in_package(self, item):
        """
        Fetch remote assets for given item.
        """
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self._parse_items(ref.get('residRef')))
        return items

    def prepare_href(self, href, mimetype=None):
        """Strip params/query/fragment from href and append a fresh auth token."""
        (scheme, netloc, path, params, query, fragment) = urlparse(href)
        new_href = urlunparse((scheme, netloc, path, '', '', ''))
        return '%s?auth_token=%s' % (
            new_href, self._get_auth_token(self.provider, update=True))
def _update(self, provider, update):
    """Poll the IPTC-style feed for items of the configured product id list.

    Keeps ``min_date_time``/``sequence_number`` in the provider's private
    data for incremental polling; when the provider is new or was re-opened,
    only the latest page is fetched (in reverse-chronological order, then
    reversed back).

    :param provider: ingest provider record
    :param update: dict updated with the new private polling state
    :return: a list containing one list of parsed items
    :raises SuperdeskIngestError: if idList is missing/empty
    :raises IngestApiError: on request or response errors
    """
    try:
        config = provider["config"]
        id_list = config["idList"]
        # before "products" was hardcoded as value for "idListType"
        id_list_type = config.get("idListType", "products")
        if not id_list.strip():
            # treat a blank idList the same as a missing one
            raise KeyError
    except KeyError:
        raise SuperdeskIngestError.notConfiguredError(
            Exception("idList is needed"))
    # we check if the provider has been closed since the last update
    try:
        last_closed = provider["last_closed"]["closed_at"]
        last_updated = provider["last_updated"]
    except KeyError:
        pass
    else:
        if last_closed > last_updated and "private" in provider:
            # we reset the private data so only last page of items will be
            # retrieved (cf. SDESK-4372)
            logger.info("reseting private data for provider {source}".format(
                source=provider.get("source")))
            del provider["private"]
    # we remove spaces and empty values from id_list to do a clean list
    id_list = ",".join(
        [id_.strip() for id_ in id_list.split(",") if id_.strip()])
    params = {
        "idList": id_list,
        "idListType": id_list_type,
        "format": "5",
        "maxItems": "25",
    }
    try:
        min_date_time = provider["private"]["min_date_time"]
        sequence_number = provider["private"]["sequence_number"]
    except KeyError:
        # the provider is new or re-opened, we want last items
        # so we need reverse-chronological order
        chronological = False
    else:
        params["minDateTime"] = min_date_time
        params["sequenceNumber"] = sequence_number
        params["sortOrder"] = "chronological"
        chronological = True
    r = self.get_url(params=params)
    try:
        root_elt = etree.fromstring(r.content)
    except Exception:
        raise IngestApiError.apiRequestError(
            Exception("error while doing the request"))
    parser = self.get_feed_parser(provider)
    items = parser.parse(root_elt, provider)
    if not chronological:
        # restore chronological order for ingestion
        items.reverse()
    try:
        # remember where this poll ended so the next one can resume from there
        min_date_time = root_elt.xpath(
            '//iptc:timestamp[@role="minDateTime"]/text()',
            namespaces=NS)[0].strip()
        sequence_number = root_elt.xpath("//iptc:transmitId/text()",
                                         namespaces=NS)[0].strip()
    except IndexError:
        raise IngestApiError.apiRequestError(
            Exception("missing minDateTime or transmitId"))
    else:
        update.setdefault("private", {})
        update["private"]["min_date_time"] = min_date_time
        update["private"]["sequence_number"] = sequence_number
    return [items]
class HTTPFeedingServiceBase(FeedingService):
    """
    Base class for feeding services using HTTP.

    This class contains helpers to make the creation of HTTP based feeding
    services easier.

    There are a couple of class attributes you can use:

    =======================  ===========
    Attribute                Explanation
    =======================  ===========
    HTTP_URL                 Main URL of your service, will be used by
                             default in get_url
    HTTP_TIMEOUT             Timeout of requests in seconds
    HTTP_DEFAULT_PARAMETERS  Parameters used in every ``get`` requests.
                             Will be updated with params set in arguments
    HTTP_AUTH                Indicate if HTTP authentication is needed for
                             your service. If None, the authentication will
                             be determined by the existence of user and
                             password. Will be overridden by auth_required
                             config if it exists.
    =======================  ===========

    In addition, you have some pre-filled fields:

    ===============  ===========
    Field            Explanation
    ===============  ===========
    AUTH_FIELDS      username and password fields
    AUTH_REQ_FIELDS  username and password fields + auth_required field to
                     indicate if they are needed
    ===============  ===========

    When ingest is updated, the provider is automatically saved to
    ``self.provider``.

    ``config`` property allows to access easily the user configuration.

    ``auth_info`` property returns a dictionary with ``username`` and
    ``password``

    ``get_url`` method do a HTTP Get request. url can be omitted in which
    case HTTP_URL will be used. Authentication parameters are set
    automatically, and errors are caught appropriately. Extra arguments are
    used directly in *requests* call.
    """

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
        SuperdeskIngestError.notConfiguredError().get_error_description()
    ]

    # override this parameter with the main URL to use
    HTTP_URL = None
    # timeout in seconds
    HTTP_TIMEOUT = 30
    # if some parameters are used in every request, put them here
    HTTP_DEFAULT_PARAMETERS = None
    # Set to True if authentication is mandatory, False if there is no
    # authentication and None to add authentication if user and password are
    # defined. If auth_required is defined in config fields, it will override
    # this value.
    HTTP_AUTH = True

    # use this when auth is always required
    AUTH_FIELDS = [{
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required': True
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required': True
    }]

    # use this when auth depends of a "auth_required" flag (set by user)
    AUTH_REQ_FIELDS = [{
        'id': 'auth_required',
        'type': 'boolean',
        'label': 'Requires Authentication',
        'placeholder': 'Requires Authentication',
        'required': False
    }, {
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required_expression': '{auth_required}',
        'show_expression': '{auth_required}'
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required_expression': '{auth_required}',
        'show_expression': '{auth_required}'
    }]

    def __init__(self):
        # BUGFIX: the parent initialiser was not called before, unlike the
        # sibling HTTPFeedingService class
        super().__init__()
        self.token = None

    @property
    def auth_info(self):
        """Helper method to retrieve a dict with username and password when set"""
        username = self.config.get('username', '')
        password = self.config.get('password', '')
        if not username or not password:
            return None
        return {'username': username, 'password': password}

    @property
    def config(self):
        # lazily create the config dict on the current provider
        return self.provider.setdefault('config', {})

    def validate_config(self):
        """
        Validate provider config according to `cls.fields`

        :raises SuperdeskIngestError: if a required field is missing or the
            URL is invalid
        :return:
        """
        # validate required config fields
        required_keys = [
            field['id'] for field in self.fields
            if field.get('required', False)
        ]
        if not set(self.config.keys()).issuperset(required_keys):
            raise SuperdeskIngestError.notConfiguredError(
                Exception('{} are required.'.format(', '.join(required_keys))))

        # BUGFIX: default to '' so a missing "url" key doesn't crash with
        # AttributeError (None.strip()) before the checks below can run
        url = self.config.get('url', '').strip()
        if not url:
            # URL is not set; this is only an error if the field is required.
            # BUGFIX: next() needs an iterator — the original passed a set
            # comprehension, which raised TypeError instead of StopIteration.
            try:
                url_field = next(f for f in self.fields if f['id'] == 'url')
            except StopIteration:
                url_required = False
            else:
                url_required = url_field.get('required', False)
            if url_required:
                raise SuperdeskIngestError.notConfiguredError(
                    Exception('URL is a required field.'))
        else:
            # validate url
            if not url.startswith('http'):
                raise SuperdeskIngestError.notConfiguredError(
                    Exception('URL must be a valid HTTP link.'))

    def get_url(self, url=None, **kwargs):
        """Do an HTTP Get on URL

        :param string url: url to use (None to use self.HTTP_URL)
        :param **kwargs: extra parameter for requests
        :return requests.Response: response
        """
        if not url:
            url = self.HTTP_URL
        config = self.config
        user = config.get('username')
        password = config.get('password')
        if user:
            user = user.strip()
        if password:
            password = password.strip()
        auth_required = config.get('auth_required', self.HTTP_AUTH)
        if auth_required is None:
            # auth_required may not be used in the feeding service;
            # in this case we use authentication only if user
            # and password are set.
            auth_required = bool(user and password)
        if auth_required:
            if not user:
                raise SuperdeskIngestError.notConfiguredError(
                    "user is not configured")
            if not password:
                raise SuperdeskIngestError.notConfiguredError(
                    "password is not configured")
            kwargs.setdefault('auth', (user, password))

        params = kwargs.pop("params", {})
        if params or self.HTTP_DEFAULT_PARAMETERS:
            # if we have default parameters, we want them to be overridden
            # by conflicting params given in arguments.
            # BUGFIX: the merge order was reversed (defaults clobbered the
            # caller's params); build from defaults, then apply the arguments
            merged = dict(self.HTTP_DEFAULT_PARAMETERS or {})
            merged.update(params)
            kwargs["params"] = merged

        try:
            response = requests.get(url, timeout=self.HTTP_TIMEOUT, **kwargs)
        except requests.exceptions.Timeout as exception:
            raise IngestApiError.apiTimeoutError(exception, self.provider)
        except requests.exceptions.ConnectionError as exception:
            raise IngestApiError.apiConnectionError(exception, self.provider)
        except requests.exceptions.RequestException as exception:
            raise IngestApiError.apiRequestError(exception, self.provider)
        except Exception as exception:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(exception, self.provider)
        if not response.ok:
            exception = Exception(response.reason)
            if response.status_code in (401, 403):
                raise IngestApiError.apiAuthError(exception, self.provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(exception, self.provider)
            else:
                raise IngestApiError.apiGeneralError(exception, self.provider)
        return response

    def update(self, provider, update):
        """Remember the provider, validate its config, then delegate."""
        self.provider = provider
        self.validate_config()
        return super().update(provider, update)
def get_url(self, url=None, **kwargs):
    """Do an HTTP Get on URL

    :param string url: url to use (None to use self.HTTP_URL)
    :param **kwargs: extra parameter for requests
    :return requests.Response: response
    """
    if not url:
        url = self.HTTP_URL
    config = self.config
    user = config.get('username')
    password = config.get('password')
    if user:
        user = user.strip()
    if password:
        password = password.strip()
    auth_required = config.get('auth_required', self.HTTP_AUTH)
    if auth_required is None:
        # auth_required may not be used in the feeding service;
        # in this case we use authentication only if user
        # and password are set.
        auth_required = bool(user and password)
    if auth_required:
        if not user:
            raise SuperdeskIngestError.notConfiguredError(
                "user is not configured")
        if not password:
            raise SuperdeskIngestError.notConfiguredError(
                "password is not configured")
        kwargs.setdefault('auth', (user, password))

    params = kwargs.pop("params", {})
    if params or self.HTTP_DEFAULT_PARAMETERS:
        # if we have default parameters, we want them to be overridden
        # by conflicting params given in arguments.
        # BUGFIX: the merge order was reversed (defaults clobbered the
        # caller's params); build from defaults, then apply the arguments
        merged = dict(self.HTTP_DEFAULT_PARAMETERS or {})
        merged.update(params)
        kwargs["params"] = merged

    try:
        response = requests.get(url, timeout=self.HTTP_TIMEOUT, **kwargs)
    except requests.exceptions.Timeout as exception:
        raise IngestApiError.apiTimeoutError(exception, self.provider)
    except requests.exceptions.ConnectionError as exception:
        raise IngestApiError.apiConnectionError(exception, self.provider)
    except requests.exceptions.RequestException as exception:
        raise IngestApiError.apiRequestError(exception, self.provider)
    except Exception as exception:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(exception, self.provider)
    if not response.ok:
        exception = Exception(response.reason)
        if response.status_code in (401, 403):
            raise IngestApiError.apiAuthError(exception, self.provider)
        elif response.status_code == 404:
            raise IngestApiError.apiNotFoundError(exception, self.provider)
        else:
            raise IngestApiError.apiGeneralError(exception, self.provider)
    return response
class EventHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read events using HTTP
    """

    NAME = 'event_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    label = 'Event HTTP feed'

    """
    Defines the collection service to be used with this ingest feeding service.
    """
    service = 'events'

    fields = [{
        'id': 'url',
        'type': 'text',
        'label': 'Feed URL',
        'placeholder': 'Feed URL',
        'required': True
    }]

    def _update(self, provider, update):
        """Fetch the configured feed URL and yield parsed event items."""
        now = utcnow()
        expiry_minutes = app.config['INGEST_EXPIRY_MINUTES']
        floor = now - datetime.timedelta(minutes=expiry_minutes)
        last_updated = provider.get('last_updated')
        if not last_updated or last_updated < floor:
            last_updated = floor

        self.provider = provider
        config = provider.get('config')
        if not config:
            config = {}
            provider['config'] = config
        self.URL = config.get('url')
        payload = {}
        parser = self.get_feed_parser(provider)

        try:
            response = requests.get(self.URL, params=payload, timeout=15)
            # TODO: check if file has been updated since provider last_updated
            # although some provider do not include 'Last-Modified' in headers
            # so unsure how to do this
            logger.info('Http Headers: %s', response.headers)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        logger.info('Ingesting: %s', str(response.content))

        # dispatch on the configured parser type: NTB XML and ICS feeds
        # need the payload pre-parsed before handing it to the parser
        if isinstance(parser, NTBEventXMLFeedParser):
            items = parser.parse(ET.fromstring(response.content), provider)
        elif isinstance(parser, IcsTwoFeedParser):
            items = parser.parse(Calendar.from_ical(response.content), provider)
        else:
            items = parser.parse(response.content)

        yield items if isinstance(items, list) else [items]
def _update(self, provider, update):
    """Poll the AP Media API and return the parsed items.

    Resumes from the stored ``next_link`` when one is present in the
    provider config, otherwise builds a fresh query from the configured
    product list and recovery window.

    :param provider: ingest provider document (its config is read and mutated)
    :param update: dict that receives the updated provider config
    :return: a list containing one list of parsed items
    """
    cfg = provider.get("config", {})
    self.HTTP_URL = cfg.get("api_url", "")
    self.provider = provider

    # the apikey parameter is sent with every call
    params = {"apikey": cfg.get("apikey")}

    next_link = cfg.get("next_link")
    if next_link:
        # continue from where the previous poll stopped
        r = self.get_url(url=next_link,
                         params=params,
                         verify=False,
                         allow_redirects=True)
        r.raise_for_status()
    else:
        id_list = cfg.get("productList", "").strip()
        recovery_time = cfg.get("recoverytime", "1")
        recovery_time = recovery_time.strip() if recovery_time else ""
        if recovery_time == "":
            recovery_time = "1"
        start = datetime.strftime(
            utcnow() - timedelta(hours=int(recovery_time)),
            "%Y-%m-%dT%H:%M:%SZ")
        # If there has been a list of products defined then we format them
        # for the request, if not all allowed products will be returned.
        if id_list:
            # spaces and empty values are dropped to get a clean list
            clean_ids = " OR ".join(
                pid.strip() for pid in id_list.split(",") if pid.strip())
            params[
                "q"] = "productid:(" + clean_ids + ") AND mindate:>{}".format(
                    start)
        else:
            params["q"] = "mindate:>{}".format(start)
        params["page_size"] = "100"
        params["versions"] = "all"
        logger.info("AP Media Start/Recovery time: {} params {}".format(
            recovery_time, params))
        r = self.get_url(params=params, verify=False, allow_redirects=True)
        r.raise_for_status()

    try:
        response = json.loads(r.text)
    except Exception:
        raise IngestApiError.apiRequestError(
            Exception("error parsing response"))

    nextLink = response.get("data", {}).get("next_page")
    if nextLink == cfg.get("next_link"):
        # got the same next link as last time so nothing new
        logger.info("Nothing new from AP Media")
        return []

    parser = self.get_feed_parser(provider)
    parsed_items = []
    for entry in response.get("data", {}).get("items", []):
        try:
            meta = entry.get("item", {})
            logger.info('Get AP meta data for "{}" uri: {}'.format(
                meta.get("headline"), meta.get("uri")))
            complete_item = json.loads(
                self.api_get(meta.get("uri"), provider).text)

            # attach the nitf rendition when the item provides one
            nitf_ref = (complete_item.get("data", {}).get("item", {}).get(
                "renditions", {}).get("nitf", {}).get("href"))
            if nitf_ref:
                logger.info("Get AP nitf : {}".format(nitf_ref))
                nitf_response = self.api_get(nitf_ref, provider)
                complete_item["nitf"] = nitf.NITFFeedParser().parse(
                    etree.fromstring(nitf_response.content))
            elif meta.get("type") == "text":
                logger.warning("No NITF for story {}".format(meta.get("uri")))

            # fetch any associated items; a failed association is logged
            # but does not fail the parent item
            associations = complete_item["data"]["item"].get("associations")
            if associations:
                complete_item["associations"] = {}
                for key, assoc in associations.items():
                    logger.info('Get AP association "%s"',
                                assoc.get("headline"))
                    try:
                        complete_item["associations"][key] = self.api_get(
                            assoc["uri"], provider).json()
                    except IngestApiError:
                        logger.warning("Could not fetch AP association",
                                       extra=assoc)

            parsed_items.append(parser.parse(complete_item, provider))
        # any exception processing an individual item is swallowed
        except Exception as ex:
            logger.exception(ex)

    # remember where to resume on the next poll
    upd_provider = provider.get("config")
    upd_provider["next_link"] = nextLink
    upd_provider["recoverytime"] = None
    update["config"] = upd_provider

    return [parsed_items]
def _update(self, provider, update):
    """Poll the AP Media API and return parsed items.

    Resumes from the stored ``next_link`` when one is present in the
    provider config, otherwise queries by product list and recovery time.

    :param provider: ingest provider document (its config is read and mutated)
    :param update: dict that receives the updated provider config
    :return: a list containing one list of parsed items
    """
    self.HTTP_URL = provider.get('config', {}).get('api_url', '')
    self.provider = provider

    # Set the apikey parameter we're going to use it on all calls
    params = dict()
    params['apikey'] = provider.get('config', {}).get('apikey')

    # Use the next link if one is available in the config
    if provider.get('config', {}).get('next_link'):
        r = self.get_url(url=provider.get('config', {}).get('next_link'),
                         params=params, verify=False, allow_redirects=True)
        r.raise_for_status()
    else:
        id_list = provider.get('config', {}).get('productList', '').strip()
        # recovery time is how many hours back from now to query;
        # blank/missing values fall back to 1 hour
        recovery_time = provider.get('config', {}).get('recoverytime', '1')
        recovery_time = recovery_time.strip() if recovery_time else ''
        if recovery_time == '':
            recovery_time = '1'
        start = datetime.strftime(utcnow() - timedelta(hours=int(recovery_time)), '%Y-%m-%dT%H:%M:%SZ')
        # If there has been a list of products defined then we format them for the request, if not all
        # allowed products will be returned.
        if id_list:
            # we remove spaces and empty values from id_list to do a clean list
            id_list = ' OR '.join([id_.strip() for id_ in id_list.split(',') if id_.strip()])
            params['q'] = 'productid:(' + id_list + ') AND mindate:>{}'.format(start)
        else:
            params['q'] = 'mindate:>{}'.format(start)
        params['page_size'] = '100'
        params['versions'] = 'all'
        logger.info('AP Media Start/Recovery time: {} params {}'.format(recovery_time, params))
        r = self.get_url(params=params, verify=False, allow_redirects=True)
        r.raise_for_status()
    try:
        response = json.loads(r.text)
    except Exception:
        raise IngestApiError.apiRequestError(Exception('error parsing response'))

    nextLink = response.get('data', {}).get('next_page')
    # Got the same next link as last time so nothing new
    if nextLink == provider.get('config', {}).get('next_link'):
        logger.info('Nothing new from AP Media')
        return []

    parser = self.get_feed_parser(provider)
    parsed_items = []
    for item in response.get('data', {}).get('items', []):
        try:
            # Get the item meta data
            r = self.get_url(url=item.get('item', {}).get('uri'),
                             params={'apikey': provider.get('config', {}).get('apikey')},
                             verify=False, allow_redirects=True)
            logger.info('Get AP meta data for "{}" uri: {}'.format(item.get('item', {}).get('headline'),
                                                                   item.get('item', {}).get('uri')))
            r.raise_for_status()
            complete_item = json.loads(r.text)

            # Get the nitf rendition of the item
            nitf_ref = complete_item.get('data', {}).get('item', {}).get('renditions', {}).get('nitf', {}).get(
                'href')
            if nitf_ref:
                logger.info('Get AP nitf : {}'.format(nitf_ref))
                r = self.get_url(url=nitf_ref,
                                 params={'apikey': provider.get('config', {}).get('apikey')},
                                 verify=False, allow_redirects=True)
                r.raise_for_status()
                root_elt = etree.fromstring(r.content)
                nitf_item = nitf.NITFFeedParser().parse(root_elt)
                complete_item['nitf'] = nitf_item
            else:
                # only warn for text stories; other item types have no nitf
                if item.get('item', {}).get('type') == 'text':
                    logger.warning('No NITF for story {}'.format(item.get('item', {}).get('uri')))
            parsed_items.append(parser.parse(complete_item, provider))
        # Any exception processing an individual item is swallowed
        except Exception as ex:
            logger.exception(ex)

    # Save the link for next time
    upd_provider = provider.get('config')
    upd_provider['next_link'] = nextLink
    upd_provider['recoverytime'] = None
    update['config'] = upd_provider

    return [parsed_items]
class RitzauFeedingService(HTTPFeedingServiceBase):
    """
    Feeding Service class which can retrieve articles from Ritzau web service
    """

    NAME = 'ritzau'
    ERRORS = [IngestApiError.apiRequestError().get_error_description(),
              SuperdeskIngestError.notConfiguredError().get_error_description()]
    label = 'Ritzau feed API'
    fields = HTTPFeedingServiceBase.AUTH_FIELDS + [
        {
            'id': 'url',
            'type': 'text',
            'label': 'URL',
            'placeholder': 'fill this field only for advanced uses',
            'required': False
        }
    ]
    HTTP_URL = 'https://services.ritzau.dk/ritzaurest/Services.svc/xml/news/NewsQueue'
    # auth is done with params
    HTTP_AUTH = False

    def _update(self, provider, update):
        """Retrieve and parse pending news items from the Ritzau queue.

        Items fetched from the default queue endpoint are acknowledged one
        by one so the server removes them from the queue.

        :param provider: ingest provider document
        :param update: provider update dict (not modified here)
        :return: a list containing one list of parsed items
        :raises SuperdeskIngestError: when credentials or URL are misconfigured
        :raises IngestApiError: when the request fails or its answer is invalid
        """
        config = self.config
        try:
            user, password = self.config['username'], self.config['password']
        except KeyError:
            # fixed: the error used to be constructed but never raised,
            # letting execution continue with undefined credentials
            raise SuperdeskIngestError.notConfiguredError(
                Exception('username and password are needed'))
        url_override = config.get('url', '').strip()
        # the URL field is optional ("only for advanced uses"), so only
        # validate it when it is actually set (fixed: the check used to
        # fire on an empty URL as well)
        if url_override and not url_override.startswith('http'):
            raise SuperdeskIngestError.notConfiguredError(
                Exception('if URL is set, it must be a valid http link'))

        if url_override:
            params = {'user': user, 'password': password, 'maksAntal': 50}
        else:
            # on the default queue endpoint items must be acknowledged
            params = {'user': user, 'password': password, 'maksAntal': 50,
                      'waitAcknowledge': 'true'}

        # an empty url_override makes get_url fall back to HTTP_URL
        r = self.get_url(url_override, params=params)

        try:
            root_elt = etree.fromstring(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while parsing the request answer'))

        # the service reports its own status in an <error> element;
        # "0" means success
        try:
            if root_elt.xpath('(//error/text())[1]')[0] != '0':
                err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
                raise IngestApiError.apiRequestError(
                    Exception('error code returned by API: {msg}'.format(
                        msg=err_msg)))
        except IndexError:
            raise IngestApiError.apiRequestError(
                Exception('Invalid XML, <error> element not found'))

        parser = self.get_feed_parser(provider)
        items = []
        for elt in root_elt.xpath('//RBNews'):
            item = parser.parse(elt, provider)
            items.append(item)
            if not url_override:
                # acknowledge reception so the item leaves the server queue
                try:
                    queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
                except IndexError:
                    raise IngestApiError.apiRequestError(
                        Exception('missing ServiceQueueId element'))
                ack_params = {'user': user, 'password': password,
                              'servicequeueid': queue_id}
                self.get_url(URL_ACK, params=ack_params)
        return [items]
from superdesk.io.ingest_service import IngestService
from superdesk.utc import utcnow
from superdesk.etree import etree, ParseError
from superdesk.io import register_provider
from .newsml_2_0 import NewsMLTwoParser
from .reuters_token import get_token
from superdesk.errors import IngestApiError
from flask import current_app as app

# provider id under which this service is registered
PROVIDER = 'reuters'

# error descriptions this provider may report
errors = [IngestApiError.apiTimeoutError().get_error_description(),
          IngestApiError.apiRedirectError().get_error_description(),
          IngestApiError.apiRequestError().get_error_description(),
          IngestApiError.apiUnicodeError().get_error_description(),
          IngestApiError.apiParseError().get_error_description(),
          IngestApiError.apiGeneralError().get_error_description()]


class ReutersIngestService(IngestService):
    """Reuters ingest service."""

    # date format used when building dateRange request parameters
    DATE_FORMAT = '%Y.%m.%d.%H.%M'
    # base URL of the Reuters REST API
    URL = 'http://rmb.reuters.com/rmd/rest/xml'
    # auth token cache; None until retrieved
    token = None

    def __init__(self):
        # items are parsed with the NewsML 2.0 parser
        self.parser = NewsMLTwoParser()
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by Reuters.
    """

    NAME = "reuters_http"
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]
    # date format used when building dateRange request parameters
    DATE_FORMAT = "%Y.%m.%d.%H.%M"
    label = "Reuters feed API"
    fields = [
        {
            "id": "url",
            "type": "text",
            "label": "Feed URL",
            "placeholder": "Feed URL",
            "required": True,
            "default": "http://rmb.reuters.com/rmd/rest/xml",
        },
        {
            "id": "auth_url",
            "type": "text",
            "label": "URL for Authentication",
            "placeholder": "authentication url",
            "required": True,
            "default": "https://commerce.reuters.com/rmd/rest/xml/login",
        },
        {"id": "username", "type": "text", "label": "Username", "placeholder": "Username", "required": True},
        {"id": "password", "type": "password", "label": "Password", "placeholder": "Password", "required": True},
    ]
    # requests.Session, created lazily in _get_tree and reused across calls
    session = None

    def _update(self, provider, update):
        """Poll every subscribed channel and yield batches of ingested items.

        The lookback window is clamped to INGEST_EXPIRY_MINUTES so that a
        stale provider does not request arbitrarily old content.

        :param provider: ingest provider document
        :param update: provider update dict (not modified here)
        """
        updated = utcnow()
        last_updated = provider.get("last_updated")
        ttl_minutes = app.config["INGEST_EXPIRY_MINUTES"]
        if not last_updated or last_updated < updated - datetime.timedelta(minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

        self.provider = provider
        provider_config = provider.get("config")
        if not provider_config:
            provider_config = {}
            provider["config"] = provider_config

        provider_config.setdefault("url", "http://rmb.reuters.com/rmd/rest/xml")
        provider_config.setdefault("auth_url", "https://commerce.reuters.com/rmd/rest/xml/login")
        self.URL = provider_config.get("url")

        for channel in self._get_channels():
            ids = self._get_article_ids(channel, last_updated, updated)
            for id in ids:
                try:
                    items = self.fetch_ingest(id)
                    if items:
                        yield items
                # if there was an exception processing the one of the bunch log it and continue
                except Exception as ex:
                    logger.warn("Reuters item {} has not been retrieved".format(id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get subscribed channels.

        :return: list of channel alias strings
        """
        channels = []
        tree = self._get_tree("channels")
        for channel in tree.findall("channelInformation"):
            channels.append(channel.find("alias").text)
        return channels

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        Retries up to 3 times on timeout; other request failures are mapped
        to the corresponding IngestApiError immediately.

        :param: endpoint
        :type endpoint: str
        :param: payload
        :type payload: str
        :return: parsed XML root element
        """
        if payload is None:
            payload = {}
        # every request carries a fresh auth token
        payload["token"] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        # session is created once and reused for connection pooling
        if not self.session:
            self.session = requests.Session()
        retries = 0

        while True:
            try:
                response = self.session.get(url, params=payload, timeout=(30, 15))
            except requests.exceptions.Timeout as ex:
                if retries < 3:
                    logger.warn("Reuters API timeout retrying, retries {}".format(retries))
                    retries += 1
                    continue
                raise IngestApiError.apiTimeoutError(ex, self.provider)
            except requests.exceptions.TooManyRedirects as ex:
                # Tell the user their URL was bad and try a different one
                raise IngestApiError.apiRedirectError(ex, self.provider)
            except requests.exceptions.RequestException as ex:
                # catastrophic error. bail.
                raise IngestApiError.apiRequestError(ex, self.provider)
            except Exception as error:
                traceback.print_exc()
                raise IngestApiError.apiGeneralError(error, self.provider)

            if response.status_code == 404:
                raise LookupError(_("Not found {payload}").format(payload=payload))

            break

        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """
        Get absolute URL for given endpoint.

        :param: endpoint
        :type endpoint: str
        """
        return "/".join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """
        Get article ids which should be upserted also save the poll token that is returned.

        A saved pollToken is preferred; when none is available the request
        falls back to a dateRange query.  NOTE(review): status/error codes
        ("10", "30", "2100") follow the Reuters API convention — confirm
        against the Reuters RMD documentation.

        :return: set of article id strings (may be empty)
        """
        ids = set()
        payload = {"channel": channel, "fieldsRef": "id"}

        # check if the channel has a pollToken if not fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info("Reuters requesting channel {} with poll token {}".format(channel, last_poll_token))
            payload["pollToken"] = last_poll_token
        else:
            payload["dateRange"] = "%s-%s" % (self._format_date(last_updated), self._format_date(updated))
            logger.info("Reuters requesting channel {} with dateRange {}".format(channel, payload["dateRange"]))

        tree = self._get_tree("items", payload)
        status_code = tree.find("status").get("code") if tree.tag == "results" else tree.get("code")

        # check the returned status
        if status_code != "10":
            logger.warn("Reuters channel request returned status code {}".format(status_code))
            # status code 30 indicates failure
            if status_code == "30":
                # invalid token
                logger.warn(
                    "Reuters error on channel {} code {} {}".format(
                        channel, tree.find("error").get("code"), tree.find("error").text
                    )
                )
                if tree.find("error").get("code") == "2100":
                    # invalid/expired poll token: reset it so the next poll
                    # falls back to a dateRange query
                    self._save_poll_token(channel, None)
                    logger.warn("Reuters channel invalid token reseting {}".format(status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find("pollToken")
        if poll_token is not None:
            # a new token indicated new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info("Reuters channel {} retrieved no token".format(channel))
            return ids

        for result in tree.findall("result"):
            id = result.find("id").text
            ids.add(id)
            logger.info("Reuters id : {}".format(id))

        return ids

    def _save_poll_token(self, channel, poll_token):
        """Saves the poll token for the passed channel in the config section of the

        :param channel:
        :param poll_token:
        :return:
        """
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service("ingest_providers")
        provider = ingest_provider_service.find_one(req=None, _id=self.provider[superdesk.config.ID_FIELD])
        provider_token = provider.get("tokens")
        if "poll_tokens" not in provider_token:
            provider_token["poll_tokens"] = {channel: poll_token}
        else:
            provider_token["poll_tokens"][channel] = poll_token
        upd_provider = {"tokens": provider_token}
        ingest_provider_service.system_update(self.provider[superdesk.config.ID_FIELD], upd_provider, self.provider)

    def _get_poll_token(self, channel):
        """Get the poll token from provider config if it is available.

        :param channel:
        :return: token (or None when no token is stored for the channel)
        """
        if "tokens" in self.provider and "poll_tokens" in self.provider["tokens"]:
            return self.provider.get("tokens").get("poll_tokens").get(channel, None)

    def _format_date(self, date):
        # format a datetime per the Reuters dateRange convention
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        """Fetch the item with given id plus everything its packages reference.

        :return: list of items, or [] when a referenced item is missing
        """
        items = self._parse_items(id)
        result_items = []
        while items:
            item = items.pop()
            self.localize_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                # a missing package member invalidates the whole batch
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, id):
        """
        Parse item message and return given items.
        """
        payload = {"id": id}
        tree = self._get_tree("item", payload)
        parser = self.get_feed_parser(self.provider, tree)
        items = parser.parse(tree, self.provider)
        return items

    def _fetch_items_in_package(self, item):
        """
        Fetch remote assets for given item.
        """
        items = []
        for group in item.get("groups", []):
            for ref in group.get("refs", []):
                if "residRef" in ref:
                    items.extend(self._parse_items(ref.get("residRef")))
        return items

    def prepare_href(self, href, mimetype=None):
        # strip params/query/fragment from the href and append a fresh
        # auth token as the only query parameter
        (scheme, netloc, path, params, query, fragment) = urlparse(href)
        new_href = urlunparse((scheme, netloc, path, "", "", ""))
        return "%s?auth_token=%s" % (new_href, self._get_auth_token(self.provider, update=True))
class APFeedingService(FeedingService):
    """
    Feeding Service class which can retrieve articles from Associated Press web service
    """

    NAME = 'ap'
    ERRORS = [
        IngestApiError.apiRequestError().get_error_description(),
        SuperdeskIngestError.notConfiguredError().get_error_description()
    ]
    label = 'AP feed API'
    fields = [{
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required': True
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required': True
    }, {
        'id': 'idList',
        'type': 'text',
        'label': 'Id List',
        'placeholder': 'use coma separated ids for multiple values',
        'required': False
    }]

    def config_test(self, provider=None):
        super().config_test(provider)

    def _update(self, provider, update):
        """Query the AP web service and return the parsed items.

        Stores minDateTime/sequenceNumber from the answer into ``update`` so
        the next poll resumes after the last received item.

        :param provider: ingest provider document
        :param update: dict receiving the new 'private' resume state
        :return: a list containing one list of parsed items
        :raises SuperdeskIngestError: when credentials/idList are missing or blank
        :raises IngestApiError: when the request fails or its answer is invalid
        """
        try:
            config = provider['config']
            user = config['username']
            password = config['password']
            id_list = config['idList']
        except KeyError:
            raise SuperdeskIngestError.notConfiguredError(
                Exception('username, password and idList are needed'))
        # blank values are as bad as missing ones
        # (replaces the previous "raise KeyError" control-flow trick)
        if not user.strip() or not password.strip() or not id_list.strip():
            raise SuperdeskIngestError.notConfiguredError(
                Exception('username, password and idList are needed'))

        # we remove spaces and empty values from id_list to do a clean list
        id_list = ','.join(
            [id_.strip() for id_ in id_list.split(',') if id_.strip()])

        params = {
            'idList': id_list,
            'idListType': 'products',
            'format': '5',
            'maxItems': '25',
            'sortOrder': 'chronological'
        }

        # resume from the last poll when state has been saved
        try:
            min_date_time = provider['private']['min_date_time']
            sequence_number = provider['private']['sequence_number']
        except KeyError:
            pass
        else:
            params['minDateTime'] = min_date_time
            params['sequenceNumber'] = sequence_number

        try:
            r = requests.get(URL, auth=(user, password), params=params)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while doing the request'))

        try:
            root_elt = etree.fromstring(r.content)
        except Exception:
            # fixed: this used to reuse the request error message, hiding
            # the fact that the answer could not be parsed
            raise IngestApiError.apiRequestError(
                Exception('error while parsing the request answer'))

        parser = self.get_feed_parser(provider)
        items = parser.parse(root_elt, provider)

        # save resume state (minDateTime + sequenceNumber) for the next poll
        try:
            min_date_time = root_elt.xpath(
                '//iptc:timestamp[@role="minDateTime"]/text()',
                namespaces=NS)[0].strip()
            sequence_number = root_elt.xpath('//iptc:transmitId/text()',
                                             namespaces=NS)[0].strip()
        except IndexError:
            raise IngestApiError.apiRequestError(
                Exception('missing minDateTime or transmitId'))
        else:
            update.setdefault('private', {})
            update['private']['min_date_time'] = min_date_time
            update['private']['sequence_number'] = sequence_number

        return [items]