def _fetch_data(self, config, provider):
    """Fetch every page of results published since the provider's last update.

    The endpoint is paginated: pages of ``offset_jump`` results are requested
    until the offset reaches the ``total`` count reported in the response.

    :param dict config: provider configuration; ``url`` and ``api_key`` are read
    :param provider: ingest provider; ``last_updated`` is read (the epoch start
        is used when it is absent) and it is passed along when raising most
        ingest errors
    :return: raw response bodies, one per page that reported a positive total
    :rtype: list
    :raises IngestApiError: on connection failure, missing API key, 404, any
        other non-OK response, or an OK response that carries no ``total``
    """
    url = config['url']
    api_key = config['api_key']

    last_update = provider.get(
        'last_updated', utcfromtimestamp(0)).strftime('%Y-%m-%dT%H:%M:%S')

    # Results are paginated so we'll read this many at a time
    offset_jump = 10
    params = {'start': last_update, 'limit': offset_jump}
    headers = {'apikey': api_key}

    items = []
    offset = 0
    while True:
        params['offset'] = offset

        try:
            response = requests.get(url, params=params, headers=headers, timeout=30)
        except requests.exceptions.ConnectionError as err:
            raise IngestApiError.apiConnectionError(exception=err)

        if not response.ok:
            if re.match('Error: No API Key provided', response.text):
                raise IngestApiError.apiAuthError(
                    Exception(response.text), provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(
                    Exception(response.reason), provider)
            else:
                raise IngestApiError.apiGeneralError(
                    Exception(response.reason), provider)

        # The total number of results is given to us in the json; read the
        # field via a regex so we don't have to parse the whole document.
        # A missing or non-numeric total is reported as an ingest error
        # instead of crashing on a failed match.
        total_match = re.search(r'"total": *([0-9]+)', response.text)
        if total_match is None:
            raise IngestApiError.apiGeneralError(
                Exception(response.text), provider)

        num_results = int(total_match.group(1))
        if num_results > 0:
            items.append(response.text)

        # Advance before testing for the end, so that no request is issued
        # for a page that starts at or beyond the reported total.
        offset += offset_jump
        if offset >= num_results:
            return items
def _test(self, provider):
    """Probe the provider's configured endpoint with a minimal request.

    :param provider: ingest provider; its ``config`` must hold ``url`` and
        ``api_key``
    :return: None when the endpoint answers with an OK status
    :raises IngestApiError: when the connection fails, the endpoint answers
        404, or it answers with any other non-OK status
    """
    settings = provider.get('config', {})
    endpoint = settings['url']
    request_headers = {'apikey': settings['api_key']}

    # Ask for a single article and only its id field to save bandwidth
    query = {'limit': 1, 'fields': 'id'}

    try:
        reply = requests.get(endpoint, params=query, headers=request_headers, timeout=30)
    except requests.exceptions.ConnectionError as err:
        raise IngestApiError.apiConnectionError(exception=err)

    if reply.ok:
        return

    failure = Exception(reply.reason)
    if reply.status_code == 404:
        raise IngestApiError.apiNotFoundError(failure, provider)
    raise IngestApiError.apiGeneralError(failure, provider)
def _fetch_data(self, config, provider):
    """Fetch every page of results published since the provider's last update.

    The endpoint is paginated: pages of ``offset_jump`` results are requested
    until the offset reaches the ``total`` count reported in the response.

    :param dict config: provider configuration; ``url`` and ``api_key`` are read
    :param provider: ingest provider; ``last_updated`` is read (the epoch start
        is used when it is absent) and it is passed along when raising most
        ingest errors
    :return: raw response bodies, one per page that reported a positive total
    :rtype: list
    :raises IngestApiError: on connection failure, missing API key, 404, any
        other non-OK response, or an OK response that carries no ``total``
    """
    url = config['url']
    api_key = config['api_key']

    last_update = provider.get(
        'last_updated', utcfromtimestamp(0)).strftime('%Y-%m-%dT%H:%M:%S')

    # Results are paginated so we'll read this many at a time
    offset_jump = 10
    params = {'start': last_update, 'limit': offset_jump}
    headers = {'apikey': api_key}

    items = []
    offset = 0
    while True:
        params['offset'] = offset

        try:
            response = requests.get(url, params=params, headers=headers, timeout=30)
        except requests.exceptions.ConnectionError as err:
            raise IngestApiError.apiConnectionError(exception=err)

        if not response.ok:
            if re.match('Error: No API Key provided', response.text):
                raise IngestApiError.apiAuthError(
                    Exception(response.text), provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(
                    Exception(response.reason), provider)
            else:
                raise IngestApiError.apiGeneralError(
                    Exception(response.reason), provider)

        # The total number of results is given to us in the json; read the
        # field via a regex so we don't have to parse the whole document.
        # A missing or non-numeric total is reported as an ingest error
        # instead of crashing on a failed match.
        total_match = re.search(r'"total": *([0-9]+)', response.text)
        if total_match is None:
            raise IngestApiError.apiGeneralError(
                Exception(response.text), provider)

        num_results = int(total_match.group(1))
        if num_results > 0:
            items.append(response.text)

        # Advance before testing for the end, so that no request is issued
        # for a page that starts at or beyond the reported total.
        offset += offset_jump
        if offset >= num_results:
            return items
def _fetch_data(self, config, provider):
    """Download the feed at the configured URL and return its raw body.

    When ``auth_required`` is set in ``config``, the configured username and
    password are sent with the request and are also stored on
    ``self.auth_info``.

    :param dict config: RSS resource configuration
    :param provider: data provider instance, passed along when raising
        ingest errors
    :return: body of the HTTP response (``response.content``)
    :raises IngestApiError: if fetching data fails for any reason
        (e.g. authentication error, resource not found, etc.)
    """
    url = config['url']

    credentials = None
    if config.get('auth_required', False):
        credentials = (config.get('username'), config.get('password'))
        # Remember the credentials on the instance, empty strings when unset
        self.auth_info = {
            'username': config.get('username', ''),
            'password': config.get('password', ''),
        }

    try:
        reply = requests.get(url, auth=credentials, timeout=30)
    except requests.exceptions.ConnectionError as err:
        raise IngestApiError.apiConnectionError(exception=err, provider=provider)
    except requests.exceptions.RequestException as err:
        raise IngestApiError.apiURLError(exception=err, provider=provider)

    if reply.ok:
        return reply.content

    # Map the failing status code onto the matching ingest error
    cause = Exception(reply.reason)
    status = reply.status_code
    if status in (401, 403):
        raise IngestApiError.apiAuthError(cause, provider)
    if status == 404:
        raise IngestApiError.apiNotFoundError(cause, provider)
    raise IngestApiError.apiGeneralError(cause, provider)
def _fetch_data(self, config, provider):
    """Download the feed at the configured URL and return its raw body.

    When ``auth_required`` is set in ``config``, the configured username and
    password are sent with the request and are also stored on
    ``self.auth_info``.

    :param dict config: RSS resource configuration
    :param provider: data provider instance, passed along when raising
        ingest errors
    :return: body of the HTTP response (``response.content``)
    :raises IngestApiError: if fetching data fails for any reason
        (e.g. authentication error, resource not found, etc.)
    """
    url = config['url']

    credentials = None
    if config.get('auth_required', False):
        credentials = (config.get('username'), config.get('password'))
        # Remember the credentials on the instance, empty strings when unset
        self.auth_info = {
            'username': config.get('username', ''),
            'password': config.get('password', ''),
        }

    try:
        reply = requests.get(url, auth=credentials, timeout=30)
    except requests.exceptions.ConnectionError as err:
        raise IngestApiError.apiConnectionError(exception=err, provider=provider)
    except requests.exceptions.RequestException as err:
        raise IngestApiError.apiURLError(exception=err, provider=provider)

    if reply.ok:
        return reply.content

    # Map the failing status code onto the matching ingest error
    cause = Exception(reply.reason)
    status = reply.status_code
    if status in (401, 403):
        raise IngestApiError.apiAuthError(cause, provider)
    if status == 404:
        raise IngestApiError.apiNotFoundError(cause, provider)
    raise IngestApiError.apiGeneralError(cause, provider)
class NTBEventsApiFeedingService(HTTPFeedingServiceBase):
    """
    Feeding Service class which can read events from NTB API using HTTP
    """

    NAME = 'ntb_events_api'
    ERRORS = [
        SuperdeskIngestError.notConfiguredError().get_error_description(),
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiConnectionError().get_error_description(),
    ]
    # Upper bound on the number of HTTP requests issued by one `_update` call
    REQUESTS_PER_UPDATE = 4
    # Sent as `search.showNumResults` with every request
    EVENTS_PER_REQUEST = 25
    HTTP_TIMEOUT = 20

    label = 'NTB Events API'
    fields = [
        {
            'id': 'url',
            'type': 'text',
            'label': 'Feed URL',
            'placeholder': 'Feed URL',
            'required': True,
        }
    ] + HTTPFeedingServiceBase.AUTH_FIELDS
    service = 'events'

    def _update(self, provider, update):
        """
        Fetch events from external API.

        Requests start at the offset stored under ``private.search.offset`` of
        the provider (0 when absent). When events were fetched, the advanced
        offset is written into ``update['private']``; when none were fetched,
        ``update`` is marked as closed instead.

        :param provider: Ingest Provider Details.
        :type provider: dict
        :param update: Any update that is required on provider.
        :type update: dict
        :return: a one-element list holding the list of events which can be
                 saved (an empty list when nothing was fetched).
        """
        all_items = OrderedDict()
        self._provider = provider
        provider_private = self._provider.get('private', {})
        offset = provider_private.get('search', {}).get('offset', 0)

        for _ in range(self.REQUESTS_PER_UPDATE):
            response = self._send_request(offset + len(all_items))
            xml = etree.fromstring(response.content)
            items = self._parse_events(xml=xml)
            if not items:
                # an empty page ends this update run
                break
            all_items.update(items)

        if not all_items:
            update['is_closed'] = True
            update['last_closed'] = {
                'closed_at': utcnow(),
                'message': 'Ingesting was finished.'
            }
            # keep the inner element a list, same as the non-empty case
            return [[]]

        # the offset is advanced by everything fetched, before filtering
        update['private'] = {
            'search': {
                'offset': offset + len(all_items)
            }
        }
        return [self._filter_items(all_items)]

    def _send_request(self, offset):
        """
        Execute http request to external API

        :param offset: offset provided in request payload
        :type offset: int
        :return: http response
        :raises IngestApiError.apiTimeoutError
        :raises IngestApiError.apiConnectionError
        :raises IngestApiError.apiRequestError
        :raises IngestApiError.apiGeneralError
        :raises IngestApiError.apiAuthError
        :raises IngestApiError.apiNotFoundError
        """
        payload = {
            'search.offset': offset,
            'search.showNumResults': self.EVENTS_PER_REQUEST
        }
        url = self._provider['config']['url'].strip()

        return self.get_url(url, params=payload)

    def _parse_events(self, xml):
        """
        Parse xml document and return the events keyed by their ``ntb_id``

        :param xml: xml document
        :type xml: lxml.etree._Element
        :return: ordered dict of events, ``ntb_id`` used as a key
        """
        parser = self.get_feed_parser(self._provider, article=xml)

        return OrderedDict(
            (item['ntb_id'], item) for item in parser.parse(xml)
        )

    def _filter_items(self, items):
        """
        Remove events which already exist in the db in a state other than
        'ingested'; events still in 'ingested' state are kept and take over
        the guid of the stored event.

        ``items`` is modified in place.

        :param items: dict with events, ntbId used as a key
        :type items: dict
        :return: a list of events
        """
        req = ParsedRequest()
        req.projection = json.dumps({'ntb_id': 1, 'guid': 1, ITEM_STATE: 1})
        req.max_results = len(items)

        existing_items = superdesk.get_resource_service('events').get_from_mongo(
            req,
            {
                'ntb_id': {
                    '$in': list(items)
                }
            }
        )
        for existing_item in existing_items:
            ntb_id = existing_item['ntb_id']
            if ntb_id not in items:
                # Already dropped because another stored event with the same
                # ntb_id was past the 'ingested' state; nothing left to do.
                continue
            if existing_item.get(ITEM_STATE) == WORKFLOW_STATE.INGESTED:
                # update event
                items[ntb_id][GUID_FIELD] = existing_item[GUID_FIELD]
            else:
                # remove event when it has a state different from 'ingested'
                del items[ntb_id]

        return list(items.values())
def get_url(self, url=None, **kwargs):
    """Do an HTTP Get on URL

    Credentials are taken from ``self.config`` and sent when authentication
    is required. ``self.HTTP_DEFAULT_PARAMETERS`` are merged into the query
    parameters, and ``self.HTTP_TIMEOUT`` is used unless a ``timeout`` is
    given in ``kwargs``.

    :param string url: url to use (None to use self.HTTP_URL)
    :param **kwargs: extra parameter for requests
    :return requests.Response: response
    :raises SuperdeskIngestError: when authentication is required but user
        or password is not configured
    :raises IngestApiError: on timeout, connection failure, any other request
        failure, or a non-OK response status
    """
    if not url:
        url = self.HTTP_URL
    config = self.config
    user = config.get('username')
    password = config.get('password')
    if user:
        user = user.strip()
    if password:
        password = password.strip()

    auth_required = config.get('auth_required', self.HTTP_AUTH)
    if auth_required is None:
        # auth_required may not be used in the feeding service;
        # in this case we use authentication only if user
        # and password are set.
        auth_required = bool(user and password)

    if auth_required:
        if not user:
            raise SuperdeskIngestError.notConfiguredError(
                "user is not configured")
        if not password:
            raise SuperdeskIngestError.notConfiguredError(
                "password is not configured")
        kwargs.setdefault('auth', (user, password))

    params = kwargs.pop("params", None)
    if self.HTTP_DEFAULT_PARAMETERS:
        # Start from a copy of the defaults and lay the caller's params on
        # top, so conflicting params given in arguments win and neither the
        # class-level defaults nor the caller's dict is mutated.
        merged = dict(self.HTTP_DEFAULT_PARAMETERS)
        if params:
            merged.update(params)
        params = merged
    if params:
        kwargs["params"] = params

    # An explicit timeout passed by the caller takes precedence; passing it
    # as a second keyword argument would raise TypeError.
    kwargs.setdefault('timeout', self.HTTP_TIMEOUT)

    try:
        response = requests.get(url, **kwargs)
    except requests.exceptions.Timeout as exception:
        raise IngestApiError.apiTimeoutError(exception, self.provider)
    except requests.exceptions.ConnectionError as exception:
        raise IngestApiError.apiConnectionError(exception, self.provider)
    except requests.exceptions.RequestException as exception:
        raise IngestApiError.apiRequestError(exception, self.provider)
    except Exception as exception:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(exception, self.provider)

    if not response.ok:
        exception = Exception(response.reason)
        if response.status_code in (401, 403):
            raise IngestApiError.apiAuthError(exception, self.provider)
        elif response.status_code == 404:
            raise IngestApiError.apiNotFoundError(exception, self.provider)
        else:
            raise IngestApiError.apiGeneralError(exception, self.provider)

    return response