Example #1
0
    def _update(self, provider, update):
        """Fetch new items from the feed API and remember where to resume.

        :param provider: ingest provider; its ``config`` must contain
            ``username``, ``password`` and ``idList``. When
            ``provider['private']`` holds ``min_date_time`` and
            ``sequence_number`` they are sent along with the request.
        :type provider: dict
        :param update: receives ``min_date_time`` and ``sequence_number``
            read from the response, stored under its ``private`` key
        :type update: dict
        :return: a one-element list wrapping what the feed parser returned
        :rtype: list
        :raises SuperdeskIngestError: when a required config key is missing
        :raises IngestApiError: when the request fails, the response cannot
            be parsed, or ``minDateTime``/``transmitId`` are absent from it
        """
        try:
            config = provider['config']
            user = config['username']
            password = config['password']
            id_list = config['idList']
        except KeyError:
            # the error has to be raised explicitly: merely building it would
            # let execution continue with ``id_list`` unbound
            raise SuperdeskIngestError.notConfiguredError(
                Exception('username, password and idList are needed'))

        # we remove spaces and empty values from id_list to do a clean list
        id_list = ','.join(
            [id_.strip() for id_ in id_list.split(',') if id_.strip()])

        params = {
            'idList': id_list,
            'idListType': 'products',
            'format': '5',
            'maxItems': '25',
            'sortOrder': 'chronological'
        }
        try:
            min_date_time = provider['private']['min_date_time']
            sequence_number = provider['private']['sequence_number']
        except KeyError:
            # no stored position yet: query without a resume point
            pass
        else:
            params['minDateTime'] = min_date_time
            params['sequenceNumber'] = sequence_number

        try:
            r = requests.get(URL, auth=(user, password), params=params)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while doing the request'))

        try:
            root_elt = etree.fromstring(r.content)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while doing the request'))

        parser = self.get_feed_parser(provider)
        items = parser.parse(root_elt, provider)

        try:
            min_date_time = root_elt.xpath(
                '//iptc:timestamp[@role="minDateTime"]/text()',
                namespaces=NS)[0].strip()
            sequence_number = root_elt.xpath('//iptc:transmitId/text()',
                                             namespaces=NS)[0].strip()
        except IndexError:
            raise IngestApiError.apiRequestError(
                Exception('missing minDateTime or transmitId'))
        else:
            # store the position so the next run can send it back
            update.setdefault('private', {})
            update['private']['min_date_time'] = min_date_time
            update['private']['sequence_number'] = sequence_number

        return [items]
Example #2
0
    def _update(self, provider, update):
        """Fetch items via ``self.get_url`` and acknowledge them if needed.

        Credentials and the optional ``url`` are read from ``self.config``.
        When no URL is configured the request asks for acknowledgement
        (``waitAcknowledge``) and every parsed ``RBNews`` element is
        acknowledged through ``URL_ACK`` using its ``ServiceQueueId``.

        :param provider: ingest provider, handed to the feed parser
        :type provider: dict
        :param update: provider updates (not used by this method)
        :type update: dict
        :return: a one-element list holding the list of parsed items
        :rtype: list
        :raises SuperdeskIngestError: when credentials are missing or the
            configured URL does not start with ``http``
        :raises IngestApiError: when the answer is not parseable, reports an
            error code, or lacks an expected element
        """
        config = self.config
        try:
            user, password = config["username"], config["password"]
        except KeyError:
            # the error must be raised, otherwise ``user`` stays unbound
            raise SuperdeskIngestError.notConfiguredError(
                Exception("username and password are needed"))

        url_override = config.get("url", "").strip()
        # the URL is optional: only a non-empty value is validated
        if url_override and not url_override.startswith("http"):
            raise SuperdeskIngestError.notConfiguredError(
                Exception("if URL is set, it must be a valid http link"))

        params = {"user": user, "password": password, "maksAntal": 50}
        if not url_override:
            params["waitAcknowledge"] = "true"

        r = self.get_url(url_override, params=params)

        try:
            root_elt = etree.fromstring(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception("error while parsing the request answer"))

        try:
            if root_elt.xpath("(//error/text())[1]")[0] != "0":
                err_msg = root_elt.xpath("(//errormsg/text())[1]")[0]
                raise IngestApiError.apiRequestError(
                    Exception("error code returned by API: {msg}".format(
                        msg=err_msg)))
        except IndexError:
            raise IngestApiError.apiRequestError(
                Exception("Invalid XML, <error> element not found"))

        parser = self.get_feed_parser(provider)
        items = []
        for elt in root_elt.xpath("//RBNews"):
            item = parser.parse(elt, provider)
            items.append(item)
            if not url_override:
                try:
                    queue_id = elt.xpath(".//ServiceQueueId/text()")[0]
                except IndexError:
                    raise IngestApiError.apiRequestError(
                        Exception("missing ServiceQueueId element"))
                ack_params = {
                    "user": user,
                    "password": password,
                    "servicequeueid": queue_id
                }
                self.get_url(URL_ACK, params=ack_params)

        return [items]
Example #3
0
    def _update(self, provider, update):
        """Query the feed API and store the position to resume from.

        :param provider: ingest provider; ``config`` must hold non-blank
            ``username``, ``password`` and ``idList``; ``idListType`` is
            optional and falls back to ``'products'``
        :type provider: dict
        :param update: gets ``min_date_time`` and ``sequence_number`` from
            the response written under its ``private`` key
        :type update: dict
        :return: a one-element list wrapping the parser result
        :raises SuperdeskIngestError: on missing or blank configuration
        :raises IngestApiError: on request/parsing failure or when the
            response lacks ``minDateTime`` or ``transmitId``
        """
        try:
            settings = provider['config']
            user = settings['username']
            password = settings['password']
            raw_ids = settings['idList']
            # "products" used to be the hardcoded "idListType" value
            list_type = settings.get('idListType', 'products')
            # blank values are treated exactly like missing keys
            if not all(value.strip() for value in (user, password, raw_ids)):
                raise KeyError
        except KeyError:
            raise SuperdeskIngestError.notConfiguredError(Exception('username, password and idList are needed'))

        # trim every entry of the comma separated list and drop empty ones
        trimmed_ids = (entry.strip() for entry in raw_ids.split(','))
        query = {
            'idList': ','.join(entry for entry in trimmed_ids if entry),
            'idListType': list_type,
            'format': '5',
            'maxItems': '25',
            'sortOrder': 'chronological',
        }
        try:
            stored = provider['private']
            # both values are looked up before the query is touched, so
            # either both are sent or none
            query.update(minDateTime=stored['min_date_time'],
                         sequenceNumber=stored['sequence_number'])
        except KeyError:
            pass

        try:
            response = requests.get(URL, auth=(user, password), params=query)
        except Exception:
            raise IngestApiError.apiRequestError(Exception('error while doing the request'))

        try:
            root = etree.fromstring(response.content)
        except Exception:
            raise IngestApiError.apiRequestError(Exception('error while doing the request'))

        items = self.get_feed_parser(provider).parse(root, provider)

        try:
            next_min_date_time = root.xpath(
                '//iptc:timestamp[@role="minDateTime"]/text()', namespaces=NS)[0].strip()
            next_sequence_number = root.xpath('//iptc:transmitId/text()', namespaces=NS)[0].strip()
        except IndexError:
            raise IngestApiError.apiRequestError(Exception('missing minDateTime or transmitId'))

        private = update.setdefault('private', {})
        private['min_date_time'] = next_min_date_time
        private['sequence_number'] = next_sequence_number

        return [items]
Example #4
0
    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The request is retried on timeout up to three times before the
        timeout is reported as an error.

        :param endpoint: endpoint, resolved via ``_get_absolute_url``
        :type endpoint: str
        :param payload: query parameters; an auth ``token`` entry is added
            to it in place
        :type payload: dict
        :return: the parsed XML root element
        :raises LookupError: when the API answers with status 404
        :raises IngestApiError: on timeout, redirect, request, unicode,
            parse or any other error
        """

        if payload is None:
            payload = {}

        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        if not self.session:
            self.session = requests.Session()

        retries = 0
        while True:
            try:
                response = self.session.get(url,
                                            params=payload,
                                            timeout=(30, 15))
            except requests.exceptions.Timeout as ex:
                if retries < 3:
                    # ``Logger.warn`` is a deprecated alias of ``warning``
                    logger.warning(
                        'Reuters API timeout retrying, retries {}'.format(
                            retries))
                    retries += 1
                    continue
                raise IngestApiError.apiTimeoutError(ex, self.provider)
            except requests.exceptions.TooManyRedirects as ex:
                # Tell the user their URL was bad and try a different one
                raise IngestApiError.apiRedirectError(ex, self.provider)
            except requests.exceptions.RequestException as ex:
                # catastrophic error. bail.
                raise IngestApiError.apiRequestError(ex, self.provider)
            except Exception as error:
                traceback.print_exc()
                raise IngestApiError.apiGeneralError(error, self.provider)

            if response.status_code == 404:
                raise LookupError(
                    _('Not found {payload}').format(payload=payload))

            break

        try:
            return etree.fromstring(
                response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
Example #5
0
class WufooFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) using Wufoo API
    """

    NAME = "wufoo"

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    label = "Wufoo feed API"

    # configuration fields: Wufoo login and API key, both mandatory
    fields = [
        {
            "id": "wufoo_username",
            "type": "text",
            "label": "Login",
            "placeholder": "Wufoo login",
            "required": True
        },
        {
            "id": "wufoo_api_key",
            "type": "password",
            "label": "API key",
            "placeholder": "Wufoo API Key",
            "required": True,
        },
    ]

    def __init__(self):
        super().__init__()
        self.fields_cache = {}

    def _update(self, provider, update):
        """Collect the Wufoo connection data and hand it to the feed parser.

        :param provider: ingest provider; ``config`` must contain
            ``wufoo_username`` and ``wufoo_api_key``
        :type provider: dict
        :param update: provider updates, passed on to the parser inside the
            connection data
        :type update: dict
        :return: a one-element list wrapping the parser result
        :raises IngestApiError: when obtaining the feed parser fails
        """
        user = provider["config"]["wufoo_username"]
        wufoo_data = {
            "url": WUFOO_URL.format(subdomain=user),
            "user": user,
            "api_key": provider["config"]["wufoo_api_key"],
            "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
            "update": update,
        }
        try:
            parser = self.get_feed_parser(provider, None)
        except requests.exceptions.Timeout as ex:
            raise IngestApiError.apiTimeoutError(ex, provider)
        except requests.exceptions.TooManyRedirects as ex:
            raise IngestApiError.apiRedirectError(ex, provider)
        except requests.exceptions.RequestException as ex:
            raise IngestApiError.apiRequestError(ex, provider)
        except Exception as error:
            traceback.print_exc()
            # use the local ``provider`` like the handlers above; this class
            # never assigns ``self.provider`` itself
            raise IngestApiError.apiGeneralError(error, provider)
        items = parser.parse(wufoo_data, provider)
        return [items]
Example #6
0
    def _update(self, provider, update):
        """Download the configured URL and yield the parsed items as a list.

        :param provider: ingest provider; an empty ``config`` is replaced by
            a new dict, and ``config['url']`` is the address fetched
        :type provider: dict
        :param update: provider updates (not used by this method)
        :type update: dict
        :return: generator yielding a single list of items
        :raises IngestApiError: when the HTTP request fails
        :raises LookupError: when the server answers with status 404
        """
        now = utcnow()

        last_updated = provider.get('last_updated')
        expiry = datetime.timedelta(minutes=app.config['INGEST_EXPIRY_MINUTES'])
        oldest_allowed = now - expiry
        # clamp a missing or too old timestamp to the expiry window
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        self.provider = provider
        settings = provider.get('config')
        if not settings:
            settings = provider['config'] = {}

        self.URL = settings.get('url')
        payload = {}

        parser = self.get_feed_parser(provider)

        try:
            response = requests.get(self.URL, params=payload, timeout=15)
            # TODO: check if file has been updated since provider last_updated
            # although some providers do not include 'Last-Modified' in
            # headers, so unsure how to do this
            logger.info('Http Headers: %s', response.headers)
        except requests.exceptions.Timeout as timeout_error:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(timeout_error, self.provider)
        except requests.exceptions.TooManyRedirects as redirect_error:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(redirect_error, self.provider)
        except requests.exceptions.RequestException as request_error:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(request_error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        logger.info('Ingesting: %s', str(response.content))

        # each parser type expects the body in a different form
        if isinstance(parser, NTBEventXMLFeedParser):
            items = parser.parse(ET.fromstring(response.content), provider)
        elif isinstance(parser, IcsTwoFeedParser):
            items = parser.parse(Calendar.from_ical(response.content), provider)
        else:
            items = parser.parse(response.content)

        yield items if isinstance(items, list) else [items]
Example #7
0
class WufooFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) using Wufoo API
    """

    NAME = 'wufoo'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    label = 'Wufoo feed API'

    # configuration fields: Wufoo login and API key, both mandatory
    fields = [{
        'id': 'wufoo_username',
        'type': 'text',
        'label': 'Login',
        'placeholder': 'Wufoo login',
        'required': True
    }, {
        'id': 'wufoo_api_key',
        'type': 'password',
        'label': 'API key',
        'placeholder': 'Wufoo API Key',
        'required': True
    }]

    parser_restricted_values = ['wufoo']

    def __init__(self):
        # run the base class initialisation before adding our own state
        super().__init__()
        self.fields_cache = {}

    def _update(self, provider, update):
        """Collect the Wufoo connection data and hand it to the feed parser.

        :param provider: ingest provider; ``config`` must contain
            ``wufoo_username`` and ``wufoo_api_key``
        :type provider: dict
        :param update: provider updates, passed on to the parser inside the
            connection data
        :type update: dict
        :return: a one-element list wrapping the parser result
        :raises IngestApiError: when obtaining the feed parser fails
        """
        user = provider['config']['wufoo_username']
        wufoo_data = {
            "url": WUFOO_URL.format(subdomain=user),
            "user": user,
            "api_key": provider['config']['wufoo_api_key'],
            "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
            "update": update
        }
        try:
            parser = self.get_feed_parser(provider, None)
        except requests.exceptions.Timeout as ex:
            raise IngestApiError.apiTimeoutError(ex, provider)
        except requests.exceptions.TooManyRedirects as ex:
            raise IngestApiError.apiRedirectError(ex, provider)
        except requests.exceptions.RequestException as ex:
            raise IngestApiError.apiRequestError(ex, provider)
        except Exception as error:
            traceback.print_exc()
            # use the local ``provider`` like the handlers above; this class
            # never assigns ``self.provider`` itself
            raise IngestApiError.apiGeneralError(error, provider)
        items = parser.parse(wufoo_data, provider)
        return [items]
Example #8
0
    def _update(self, provider, update):
        """Fetch items via ``self.get_url`` and acknowledge them if needed.

        Credentials and the optional ``url`` are read from ``self.config``.
        When no URL is configured the request asks for acknowledgement
        (``waitAcknowledge``) and every parsed ``RBNews`` element is
        acknowledged through ``URL_ACK`` using its ``ServiceQueueId``.

        :param provider: ingest provider, handed to the feed parser
        :type provider: dict
        :param update: provider updates (not used by this method)
        :type update: dict
        :return: a one-element list holding the list of parsed items
        :rtype: list
        :raises SuperdeskIngestError: when credentials are missing or the
            configured URL does not start with ``http``
        :raises IngestApiError: when the answer is not parseable, reports an
            error code, or lacks an expected element
        """
        config = self.config
        try:
            user, password = config['username'], config['password']
        except KeyError:
            # the error must be raised, otherwise ``user`` stays unbound
            raise SuperdeskIngestError.notConfiguredError(Exception('username and password are needed'))

        url_override = config.get('url', '').strip()
        # the URL is optional: only a non-empty value is validated
        if url_override and not url_override.startswith('http'):
            raise SuperdeskIngestError.notConfiguredError(Exception('if URL is set, it must be a valid http link'))

        params = {'user': user, 'password': password, 'maksAntal': 50}
        if not url_override:
            params['waitAcknowledge'] = 'true'

        r = self.get_url(url_override, params=params)

        try:
            root_elt = etree.fromstring(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(Exception('error while parsing the request answer'))

        try:
            if root_elt.xpath('(//error/text())[1]')[0] != '0':
                err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
                raise IngestApiError.apiRequestError(Exception('error code returned by API: {msg}'.format(msg=err_msg)))
        except IndexError:
            raise IngestApiError.apiRequestError(Exception('Invalid XML, <error> element not found'))

        parser = self.get_feed_parser(provider)
        items = []
        for elt in root_elt.xpath('//RBNews'):
            item = parser.parse(elt, provider)
            items.append(item)
            if not url_override:
                try:
                    queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
                except IndexError:
                    raise IngestApiError.apiRequestError(Exception('missing ServiceQueueId element'))
                ack_params = {'user': user, 'password': password, 'servicequeueid': queue_id}
                self.get_url(URL_ACK, params=ack_params)

        return [items]
Example #9
0
    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The request is retried on timeout up to three times before the
        timeout is reported as an error.

        :param endpoint: endpoint, resolved via ``_get_absolute_url``
        :type endpoint: str
        :param payload: query parameters; an auth ``token`` entry is added
            to it in place
        :type payload: dict
        :return: the parsed XML root element
        :raises LookupError: when the API answers with status 404
        :raises IngestApiError: on timeout, redirect, request, unicode,
            parse or any other error
        """

        if payload is None:
            payload = {}

        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        if not self.session:
            self.session = requests.Session()

        retries = 0
        while True:
            try:
                response = self.session.get(url, params=payload, timeout=(30, 15))
            except requests.exceptions.Timeout as ex:
                if retries < 3:
                    # ``Logger.warn`` is a deprecated alias of ``warning``
                    logger.warning('Reuters API timeout retrying, retries {}'.format(retries))
                    retries += 1
                    continue
                raise IngestApiError.apiTimeoutError(ex, self.provider)
            except requests.exceptions.TooManyRedirects as ex:
                # Tell the user their URL was bad and try a different one
                raise IngestApiError.apiRedirectError(ex, self.provider)
            except requests.exceptions.RequestException as ex:
                # catastrophic error. bail.
                raise IngestApiError.apiRequestError(ex, self.provider)
            except Exception as error:
                traceback.print_exc()
                raise IngestApiError.apiGeneralError(error, self.provider)

            if response.status_code == 404:
                raise LookupError('Not found %s' % payload)

            break

        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
Example #10
0
 def test_raise_apiRequestError(self):
     """apiRequestError has code 4003, keeps its cause and logs one error."""
     with assert_raises(IngestApiError) as error_context:
         ex = Exception("Testing apiRequestError")
         raise IngestApiError.apiRequestError(ex, self.provider)
     exception = error_context.exception
     # assertEqual reports both values on failure, unlike assertTrue(a == b)
     self.assertEqual(exception.code, 4003)
     self.assertEqual(exception.message, "API ingest has request error")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing apiRequestError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "IngestApiError Error 4003 - API ingest has request error: "
                      "Testing apiRequestError on channel TestProvider")
Example #11
0
class NewsworthyFeedingService(FeedingService):
    """
    Feeding Service class which can retrieve articles from Newsworthy web service
    """

    NAME = 'newsworthy'

    ERRORS = [IngestApiError.apiRequestError().get_error_description(),
              SuperdeskIngestError.notConfiguredError().get_error_description()]

    label = 'Newsworthy'

    fields = [
        {
            'id': 'url', 'type': 'text', 'label': 'Use this URL for webhook',
            'default_value': '',
            'readonly': True,
        },
        {
            'id': 'username', 'type': 'text', 'label': 'Username',
            'required': True
        },
        {
            'id': 'password', 'type': 'password', 'label': 'Password',
            'required': True
        },
        {
            'id': 'secret', 'type': 'password', 'label': 'Shared Secret',
            'placeholder': 'Shared Secret', 'required': False
        },
    ]

    def _update(self, provider, update):
        """Parse the payload stored under ``provider['newsworthy_data']``.

        :param provider: ingest provider; ``newsworthy_data`` is expected to
            hold ``hook`` (with an ``event`` entry) and ``data``
        :type provider: dict
        :param update: provider updates (not used by this method)
        :type update: dict
        :return: a one-element list wrapping the parser result; the inner
            list is empty when there is no payload or the event is an
            unpublish event
        :rtype: list
        """
        try:
            data = provider['newsworthy_data']
        except KeyError:
            # a missing key lookup raises KeyError, not IndexError
            return [[]]
        if data['hook']['event'] == EVENT_UNPUBLISHED:
            logger.info("ignoring unpublish event on following data:\n{data}".format(data=data))
            return [[]]

        # we have to write to a temporary file because feed parser expect a file path
        # FIXME: it would be better to use the data directly
        with NamedTemporaryFile('w') as f:
            json.dump(data['data'], f)
            f.seek(0)
            parser = self.get_feed_parser(provider, f.name)
            items = parser.parse(f.name, provider)

        return [items]
Example #12
0
 def test_raise_apiRequestError(self):
     """apiRequestError has code 4003, keeps its cause and logs one error."""
     with assert_raises(IngestApiError) as error_context:
         ex = Exception("Testing apiRequestError")
         raise IngestApiError.apiRequestError(ex, self.provider)
     exception = error_context.exception
     # assertEqual reports both values on failure, unlike assertTrue(a == b)
     self.assertEqual(exception.code, 4003)
     self.assertEqual(exception.message, "API ingest has request error")
     self.assertIsNotNone(exception.system_exception)
     # ``assertEquals`` is a deprecated alias that Python 3.12 removed
     self.assertEqual(exception.system_exception.args[0],
                      "Testing apiRequestError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(
         self.mock_logger_handler.messages['error'][0],
         "IngestApiError Error 4003 - API ingest has request error: "
         "Testing apiRequestError on channel TestProvider")
Example #13
0
class EventHTTPFeedingService(HTTPFeedingServiceBase):
    """
    Feeding service that downloads events from a configured HTTP URL.
    """

    NAME = 'event_http'
    label = 'Event HTTP feed'
    service = 'events'
    fields = [
        {
            'id': 'url',
            'type': 'text',
            'label': 'Feed URL',
            'placeholder': 'Feed URL',
            'required': True,
        },
    ]
    ERRORS = [
        make_error().get_error_description()
        for make_error in (
            IngestApiError.apiTimeoutError,
            IngestApiError.apiRedirectError,
            IngestApiError.apiRequestError,
            IngestApiError.apiUnicodeError,
            IngestApiError.apiParseError,
            IngestApiError.apiGeneralError,
        )
    ]
    HTTP_AUTH = False

    def _update(self, provider, update):
        """
        Download the feed at ``self.config['url']`` and parse it into events.

        The parser's ``parse_http`` method is used when it has one,
        otherwise its plain ``parse`` method receives the response body.

        :param provider: Ingest Provider Details.
        :type provider: dict
        :param update: Any update that is required on provider.
        :type update: dict
        :return: generator yielding one list of events which can be saved.
        """

        response = self.get_url(self.config['url'])
        parser = self.get_feed_parser(provider)

        parser_name = parser.__class__.__name__
        logger.info('Ingesting events with {} parser'.format(parser_name))
        # only the first 4000 characters of the body are logged
        preview = str(response.content)[:4000]
        logger.info('Ingesting content: {} ...'.format(preview))

        if not hasattr(parser, 'parse_http'):
            items = parser.parse(response.content)
        else:
            items = parser.parse_http(response.content, provider)

        # always hand back a list, wrapping a single result if necessary
        yield items if isinstance(items, list) else [items]
Example #14
0
    def _get_tree(self, endpoint, payload=None):
        """
        Request an API endpoint and return the parsed XML answer.

        :param endpoint: endpoint, resolved via ``_get_absolute_url``
        :type endpoint: str
        :param payload: query parameters; a ``token`` entry is added in place
        :return: root element of the parsed response
        :raises LookupError: when the server answers with status 404
        :raises IngestApiError: on request or parsing failures
        """

        params = {} if payload is None else payload
        params['token'] = self._get_auth_token(self.provider, update=True)
        target = self._get_absolute_url(endpoint)

        try:
            reply = requests.get(target, params=params, timeout=15)
        except requests.exceptions.Timeout as timeout_error:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(timeout_error, self.provider)
        except requests.exceptions.TooManyRedirects as redirect_error:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(redirect_error, self.provider)
        except requests.exceptions.RequestException as request_error:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(request_error, self.provider)
        except Exception as unexpected:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(unexpected, self.provider)

        if reply.status_code == 404:
            raise LookupError('Not found %s' % params)

        try:
            # the raw bytes are parsed: workaround for http mock lib
            tree = etree.fromstring(reply.content)
        except UnicodeEncodeError as unicode_error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(unicode_error, self.provider)
        except ParseError as parse_error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(parse_error, self.provider)
        except Exception as unexpected:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(unexpected, self.provider)
        return tree
Example #15
0
    def _get_tree(self, endpoint, payload=None):
        """
        Call the given API endpoint and return its answer as an XML tree.

        :param endpoint: endpoint, resolved via ``_get_absolute_url``
        :type endpoint: str
        :param payload: query parameters; a ``token`` entry is added in place
        :return: root element of the parsed response
        :raises LookupError: when the server answers with status 404
        :raises IngestApiError: on request or parsing failures
        """

        query = payload if payload is not None else {}
        query['token'] = self._get_auth_token(self.provider, update=True)
        address = self._get_absolute_url(endpoint)

        try:
            answer = requests.get(address, params=query, timeout=15)
        except requests.exceptions.Timeout as timed_out:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(timed_out, self.provider)
        except requests.exceptions.TooManyRedirects as redirected:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(redirected, self.provider)
        except requests.exceptions.RequestException as failed:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(failed, self.provider)
        except Exception as other:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(other, self.provider)

        if answer.status_code == 404:
            raise LookupError('Not found %s' % query)

        try:
            # the raw bytes are parsed: workaround for http mock lib
            root = etree.fromstring(answer.content)
        except UnicodeEncodeError as bad_encoding:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(bad_encoding, self.provider)
        except ParseError as bad_xml:
            traceback.print_exc()
            raise IngestApiError.apiParseError(bad_xml, self.provider)
        except Exception as other:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(other, self.provider)
        return root
Example #16
0
    def _update(self, provider, update):
        """Fetch items from the API and acknowledge them when required.

        The request goes to the ``url`` from the provider config when one is
        set, otherwise to ``URL``. In the latter case the request asks for
        acknowledgement (``waitAcknowledge``) and every parsed ``RBNews``
        element is acknowledged through ``URL_ACK``.

        :param provider: ingest provider; ``config`` must contain
            ``username`` and ``password`` and may contain ``url``
        :type provider: dict
        :param update: provider updates (not used by this method)
        :type update: dict
        :return: a one-element list holding the list of parsed items
        :rtype: list
        :raises SuperdeskIngestError: when credentials are missing or the
            configured URL does not start with ``http``
        :raises IngestApiError: when a request fails or the answer is not
            parseable, reports an error code, or lacks an expected element
        """
        try:
            config = provider['config']
            user = config['username']
            password = config['password']
        except KeyError:
            # the error must be raised, otherwise ``config``/``user`` stay
            # unbound further down
            raise SuperdeskIngestError.notConfiguredError(Exception('username and password are needed'))

        url_override = config.get('url', '').strip()
        # the URL is optional: only a non-empty value is validated
        if url_override and not url_override.startswith('http'):
            raise SuperdeskIngestError.notConfiguredError(Exception('if URL is set, it must be a valid http link'))

        params = {'user': user, 'password': password, 'maksAntal': 50}
        if not url_override:
            params['waitAcknowledge'] = 'true'

        try:
            r = requests.get(url_override or URL, params=params)
        except Exception:
            raise IngestApiError.apiRequestError(Exception('error while doing the request'))

        try:
            root_elt = etree.fromstring(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(Exception('error while parsing the request answer'))

        try:
            if root_elt.xpath('(//error/text())[1]')[0] != '0':
                err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
                raise IngestApiError.apiRequestError(Exception('error code returned by API: {msg}'.format(msg=err_msg)))
        except IndexError:
            raise IngestApiError.apiRequestError(Exception('Invalid XML, <error> element not found'))

        parser = self.get_feed_parser(provider)
        items = []
        for elt in root_elt.xpath('//RBNews'):
            item = parser.parse(elt, provider)
            items.append(item)
            if not url_override:
                try:
                    queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
                except IndexError:
                    raise IngestApiError.apiRequestError(Exception('missing ServiceQueueId element'))
                ack_params = {'user': user, 'password': password, 'servicequeueid': queue_id}
                try:
                    requests.get(URL_ACK, params=ack_params)
                except Exception:
                    raise IngestApiError.apiRequestError(Exception('error while doing the request'))

        return [items]
    def _request(self, url):
        """Download ``url`` and return the raw response body.

        :param url: address to fetch
        :return: ``content`` of the response
        :raises IngestApiError: when the HTTP request fails
        :raises LookupError: when the server answers with status 404
        """
        try:
            reply = requests.get(url, params={}, timeout=120)
        except requests.exceptions.Timeout as timeout_error:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(timeout_error, self.provider)
        except requests.exceptions.TooManyRedirects as redirect_error:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(redirect_error, self.provider)
        except requests.exceptions.RequestException as request_error:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(request_error, self.provider)
        except Exception as unexpected:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(unexpected, self.provider)

        not_found = reply.status_code == 404
        if not_found:
            raise LookupError('Not found')

        return reply.content
Example #18
0
    def _request(self, url):
        """Fetch ``url`` over HTTP and hand back the response body.

        :param url: address to fetch
        :return: ``content`` of the response
        :raises IngestApiError: when the HTTP request fails
        :raises LookupError: when the server answers with status 404
        """
        try:
            answer = requests.get(url, params={}, timeout=120)
        except requests.exceptions.Timeout as timed_out:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(timed_out, self.provider)
        except requests.exceptions.TooManyRedirects as redirected:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(redirected, self.provider)
        except requests.exceptions.RequestException as failed:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(failed, self.provider)
        except Exception as other:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(other, self.provider)

        if answer.status_code != 404:
            return answer.content

        raise LookupError('Not found')
Example #19
0
 def _update(self, provider, update):
     """Build the Wufoo request description and let the feed parser produce items.

     :param provider: ingest provider; ``provider['config']`` must contain
         ``wufoo_username`` and ``wufoo_api_key``
     :param update: update dict, handed to the parser under the ``update`` key
     :return: a one-element list holding whatever ``parser.parse`` returned
     :raises IngestApiError: if obtaining the feed parser fails
     """
     user = provider['config']['wufoo_username']
     wufoo_data = {
         "url": WUFOO_URL.format(subdomain=user),
         "user": user,
         "api_key": provider['config']['wufoo_api_key'],
         "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
         "update": update}
     # NOTE(review): only get_feed_parser() is guarded by these handlers;
     # parser.parse() below runs outside the try block - confirm that is intended.
     try:
         parser = self.get_feed_parser(provider, None)
     except requests.exceptions.Timeout as ex:
         raise IngestApiError.apiTimeoutError(ex, provider)
     except requests.exceptions.TooManyRedirects as ex:
         raise IngestApiError.apiRedirectError(ex, provider)
     except requests.exceptions.RequestException as ex:
         raise IngestApiError.apiRequestError(ex, provider)
     except Exception as error:
         traceback.print_exc()
         # Report against the ``provider`` argument like the handlers above;
         # this method never assigns ``self.provider``.
         raise IngestApiError.apiGeneralError(error, provider)
     items = parser.parse(wufoo_data, provider)
     return [items]
Example #20
0
 def _update(self, provider, update):
     """Describe the Wufoo request and delegate item creation to the feed parser.

     :param provider: ingest provider; ``provider['config']`` must contain
         ``wufoo_username`` and ``wufoo_api_key``
     :param update: update dict, passed on to the parser under the ``update`` key
     :return: a one-element list containing the result of ``parser.parse``
     :raises IngestApiError: if obtaining the feed parser fails
     """
     user = provider['config']['wufoo_username']
     wufoo_data = {
         "url": WUFOO_URL.format(subdomain=user),
         "user": user,
         "api_key": provider['config']['wufoo_api_key'],
         "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
         "update": update}
     # NOTE(review): the handlers below only cover get_feed_parser();
     # parser.parse() is called outside the try block - confirm that is intended.
     try:
         parser = self.get_feed_parser(provider, None)
     except requests.exceptions.Timeout as ex:
         raise IngestApiError.apiTimeoutError(ex, provider)
     except requests.exceptions.TooManyRedirects as ex:
         raise IngestApiError.apiRedirectError(ex, provider)
     except requests.exceptions.RequestException as ex:
         raise IngestApiError.apiRequestError(ex, provider)
     except Exception as error:
         traceback.print_exc()
         # Use the ``provider`` argument, consistent with the other handlers;
         # ``self.provider`` is not assigned in this method.
         raise IngestApiError.apiGeneralError(error, provider)
     items = parser.parse(wufoo_data, provider)
     return [items]
Example #21
0
    def get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The authentication token is added to a copy of ``payload``, so the
        caller's dict is left untouched and the token does not end up in the
        text of the "not found" error.

        :param endpoint: endpoint name, resolved with ``self.get_url``
        :param payload: optional dict of query parameters
        :return: root element parsed from the response content
        :raises IngestApiError: on request failures, a 404 answer or content
            that cannot be parsed
        """
        query = {} if payload is None else dict(payload)
        params = dict(query)
        params['token'] = self.get_token()
        url = self.get_url(endpoint)

        try:
            response = requests.get(url, params=params, timeout=21.0)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError(error, self.provider)

        if response.status_code == 404:
            # Only the caller supplied parameters are reported, never the token.
            raise IngestApiError.apiNotFoundError(
                LookupError('Not found %s' % (query,)), self.provider)

        try:
            # workaround for httmock lib
            # return etree.fromstring(response.text.encode('utf-8'))
            return etree.fromstring(response.content)
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError(error, self.provider)
Example #22
0
    def get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        The authentication token is merged into a copy of ``payload``: the
        dict passed by the caller is not modified and the token is kept out of
        the "not found" error text.

        :param endpoint: endpoint name, resolved with ``self.get_url``
        :param payload: optional dict of query parameters
        :return: root element parsed from the response content
        :raises IngestApiError: on request failures, a 404 answer or content
            that cannot be parsed
        """
        query = {} if payload is None else dict(payload)
        params = dict(query)
        params['token'] = self.get_token()
        url = self.get_url(endpoint)

        try:
            response = requests.get(url, params=params, timeout=21.0)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError(error, self.provider)

        if response.status_code == 404:
            # Report the caller supplied parameters only, not the token.
            raise IngestApiError.apiNotFoundError(LookupError('Not found %s' % (query,)), self.provider)

        try:
            # workaround for httmock lib
            # return etree.fromstring(response.text.encode('utf-8'))
            return etree.fromstring(response.content)
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError(error, self.provider)
Example #23
0
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service reading article(s) over HTTP from the Reuters API.
    """

    NAME = 'reuters_http'

    ERRORS = [
        error_factory().get_error_description()
        for error_factory in (
            IngestApiError.apiTimeoutError,
            IngestApiError.apiRedirectError,
            IngestApiError.apiRequestError,
            IngestApiError.apiUnicodeError,
            IngestApiError.apiParseError,
            IngestApiError.apiGeneralError,
        )
    ]

    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def _update(self, provider):
        """
        Yield the non-empty item lists fetched for every article of every channel.

        The requested window ends now and starts at the provider's
        ``last_updated``, but never earlier than ``INGEST_EXPIRY_MINUTES`` ago.
        Missing ``url`` / ``auth_url`` entries of the provider config are
        filled with the default Reuters addresses.

        :param provider: ingest provider dict
        """
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']

        oldest_allowed = updated - datetime.timedelta(minutes=ttl_minutes)
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = provider['config'] = {}

        provider_config.setdefault('url', 'http://rmb.reuters.com/rmd/rest/xml')
        provider_config.setdefault('auth_url', 'https://commerce.reuters.com/rmd/rest/xml/login')

        self.URL = provider_config.get('url')

        for channel in self._get_channels():
            for guid in self._get_article_ids(channel, last_updated, updated):
                fetched = self.fetch_ingest(guid)
                if not fetched:
                    continue
                yield fetched

    def _get_channels(self):
        """Return the alias of every ``channelInformation`` element of the 'channels' endpoint."""
        tree = self._get_tree('channels')
        return [info.find('alias').text for info in tree.findall('channelInformation')]

    def _get_tree(self, endpoint, payload=None):
        """
        Request the given endpoint and return the parsed xml answer.

        The authentication token is stored in ``payload`` under ``token``
        before the request is sent.

        :param endpoint: endpoint name appended to the base URL
        :type endpoint: str
        :param payload: query parameters of the request
        :type payload: dict
        :raises IngestApiError: when the request fails or the answer cannot be parsed
        :raises LookupError: when the answer has status code 404
        """
        payload = {} if payload is None else payload
        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        try:
            response = requests.get(url, params=payload, timeout=15)
        except requests.exceptions.Timeout as timeout_error:
            # no answer within the timeout
            raise IngestApiError.apiTimeoutError(timeout_error, self.provider)
        except requests.exceptions.TooManyRedirects as redirect_error:
            # the configured URL is probably wrong
            raise IngestApiError.apiRedirectError(redirect_error, self.provider)
        except requests.exceptions.RequestException as request_error:
            # every other failure reported by requests
            raise IngestApiError.apiRequestError(request_error, self.provider)
        except Exception as unexpected:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(unexpected, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        try:
            # parse the raw bytes: workaround for http mock lib
            return etree.fromstring(response.content)
        except UnicodeEncodeError as unicode_error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(unicode_error, self.provider)
        except ParseError as parse_error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(parse_error, self.provider)
        except Exception as unexpected:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(unexpected, self.provider)

    def _get_absolute_url(self, endpoint):
        """
        Join the base URL and the endpoint with a slash.

        :param endpoint: endpoint name
        :type endpoint: str
        """
        return '/'.join((self.URL, endpoint))

    def _get_article_ids(self, channel, last_updated, updated):
        """
        Return the set of guids listed for ``channel`` between ``last_updated`` and ``updated``.
        """
        date_range = "%s-%s" % (self._format_date(last_updated), self._format_date(updated))
        payload = {'channel': channel, 'fieldsRef': 'id', 'dateRange': date_range}

        logger.info('Reuters requesting Date Range |{}| for channel {}'.format(payload['dateRange'], channel))
        tree = self._get_tree('items', payload)
        return {result.find('guid').text for result in tree.findall('result')}

    def _format_date(self, date):
        """Format ``date`` with ``DATE_FORMAT``."""
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, guid):
        """
        Fetch the item for ``guid`` together with every item referenced by its packages.

        :return: list of fetched items, or an empty list as soon as one
                 referenced item raises ``LookupError``
        """
        pending = self._parse_items(guid)
        fetched = []
        while pending:
            current = pending.pop()
            self.add_timestamps(current)
            try:
                pending.extend(self._fetch_items_in_package(current))
            except LookupError as err:
                self.log_item_error(err, current, self.provider)
                return []
            fetched.append(current)

        return fetched

    def _parse_items(self, guid):
        """
        Request the item message for ``guid`` and return what the feed parser makes of it.
        """
        tree = self._get_tree('item', {'id': guid})
        parser = self.get_feed_parser(self.provider, tree)
        return parser.parse(tree, self.provider)

    def _fetch_items_in_package(self, item):
        """
        Parse every item referenced through ``residRef`` in the groups of ``item``.
        """
        referenced = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' not in ref:
                    continue
                referenced.extend(self._parse_items(ref.get('residRef')))

        return referenced
class HTTPFeedingService(FeedingService, metaclass=ABCMeta):
    """
    Feeding Service class which can read article(s) using HTTP.
    """

    ERRORS = [IngestApiError.apiTimeoutError().get_error_description(),
              IngestApiError.apiRedirectError().get_error_description(),
              IngestApiError.apiRequestError().get_error_description(),
              IngestApiError.apiUnicodeError().get_error_description(),
              IngestApiError.apiParseError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description()]

    label = 'HTTP'

    def __init__(self):
        super().__init__()
        self.token = None

    def _generate_token_and_update_provider(self, provider):
        """
        Generates Authentication Token and updates the given provider with the authentication token.

        The token is stored together with its creation time, both in the
        ``ingest_providers`` resource and in ``provider['tokens']``.

        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: Authentication Token
        :rtype: str
        """
        token = {'auth_token': self._generate_auth_token(provider), 'created': utcnow()}
        get_resource_service('ingest_providers').system_update(provider[config.ID_FIELD], updates={'tokens': token},
                                                               original=provider)
        provider['tokens'] = token
        return token['auth_token']

    def _generate_auth_token(self, provider):
        """
        Generates Authentication Token as per the configuration in Ingest Provider.

        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: token details if successfully authenticated
        :rtype: str
        :raises: IngestApiError.apiGeneralError() if auth_url is missing in the Ingest Provider configuration
        :raises: IngestApiError.apiAuthError() if the response status is outside 2xx and
                 ``raise_for_status`` raises; the provider is closed first
        """
        auth_url = provider.get('config', {}).get('auth_url', None)
        if not auth_url:
            raise IngestApiError.apiGeneralError(provider=provider,
                                                 exception=KeyError(
                                                     '''
                                                     Ingest Provider {} is missing Authentication URL.
                                                     Please check the configuration.
                                                     '''.format(provider['name']))
                                                 )

        payload = {
            'username': provider.get('config', {}).get('username', ''),
            'password': provider.get('config', {}).get('password', ''),
        }

        # The session is used as a context manager so it is closed again
        # once the single authentication request is done.
        with requests.Session() as session:
            session.mount('https://', SSLAdapter())
            # NOTE(review): certificate verification is switched off for this
            # request (verify=False) - confirm this is still required.
            response = session.get(auth_url, params=payload, verify=False, timeout=30)

        if response.status_code < 200 or response.status_code >= 300:
            try:
                response.raise_for_status()
            except Exception:
                err = IngestApiError.apiAuthError(provider=provider)
                self.close_provider(provider, err, force=True)
                raise err

        tree = etree.fromstring(response.content)  # workaround for http mock lib
        return tree.text

    def _is_valid_token(self, token):
        """Check if the given token is still valid.

        A token counts as valid for 12 hours after its ``created`` time and
        only if it carries an ``auth_token`` value.

        :param token: Token information
        :type token: dict
        :return: a truthy value (the ``auth_token`` itself) if valid, a falsy value otherwise
        """
        ttl = timedelta(hours=12)
        created = arrow.get(token.get('created')).datetime

        return created + ttl >= utcnow() and token.get('auth_token')

    def _get_auth_token(self, provider, update=False):
        """
        Gets authentication token for given provider instance and save it in db based on the given update flag.

        A still valid token stored in ``provider['tokens']`` is reused.

        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        :param update: a flag which dictates whether to save the authentication token in Ingest Provider record or not.
                       Saves if the value is True, defaults to False.
        :type update: bool
        :return: Authentication Token, or an empty string if there is no valid token and ``update`` is False
        :rtype: str
        """
        token = provider.get('tokens')

        if token and self._is_valid_token(token):
            return token.get('auth_token')

        return self._generate_token_and_update_provider(provider) if update else ''
Example #25
0
class AAPSportsHTTPFeedingService(HTTPFeedingService):
    """Feeding service which logs in to the sports results feed and yields parsed fixtures."""

    label = 'AAP Sports Results Feed'
    NAME = 'aap_sports_http'
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    fields = [
        {
            'id': 'login_url',
            'type': 'text',
            'label': 'Login Url',
            'placeholder': 'Login Url',
            'required': True,
            'errors': {
                4006: 'Server not found.',
                4000: 'Unexpected server response'
            }
        },
        {
            'id': 'fixtures_url',
            'type': 'text',
            'label': 'Fixtures Url',
            'placeholder': 'Fixtures Url',
            'required': True
        },
        {
            'id': 'username',
            'type': 'text',
            'label': 'Username',
            'placeholder': 'Username',
            'required': True
        },
        {
            'id': 'password',
            'type': 'password',
            'label': 'Password',
            'placeholder': 'Password',
            'required': True,
            'errors': {
                4007: 'Authentication error.'
            }
        },
        {
            'id': 'sports',
            'type': 'text',
            'label': 'Sports',
            'placeholder': 'Comma separate list of sports ids',
            'required': True,
            'default': '1,2,3,4,10'
        },
    ]

    def _update(self, provider, update):
        """Yield the parsed fixtures of every configured sport, competition and season.

        After logging in, the feed is walked sport by sport, competition by
        competition and season by season. Only seasons whose id contains the
        two-digit current or next year are requested and parsed.
        Nothing is yielded when the login answer's ``Status_Code`` is not ``OK``.

        :param provider: ingest provider; its ``config`` supplies ``login_url``,
            ``fixtures_url``, ``username``, ``password`` and ``sports``
        :param update: not used by this method
        """
        self.provider = provider
        parser = self.get_feed_parser(provider)

        # get the current year, it is used to filter fixtures for this year and next
        year = int(utcnow().year) % 100
        season_tags = (str(year), str(year + 1))

        config = provider.get('config', {})
        content = self._request(
            config.get('login_url').format(config.get('username'), config.get('password')))

        # get the configured sports; surrounding blanks and empty entries are
        # dropped so that a value such as '1, 2' still matches sport id '2'
        configured_sports = {
            sport.strip() for sport in config.get('sports').split(',') if sport.strip()
        }

        xml = ET.fromstring(content)
        if xml.attrib['Status_Code'] != 'OK':
            return

        session = xml.attrib['Status_Session']
        fixtures_url = config.get('fixtures_url')

        xml = self._request_xml(fixtures_url.format(session, '', '', ''))
        for s in xml.findall('.//Sports/Sport'):
            sport_id = s.attrib['SportID']
            if sport_id not in configured_sports:
                continue
            sport_name = s.attrib['SportName']

            sport_xml = self._request_xml(fixtures_url.format(session, sport_id, '', ''))
            for c in sport_xml.findall('.//Competition'):
                comp_id = c.attrib.get('Comp_ID')
                comp_name = c.attrib.get('Comp_Name')

                comp_xml = self._request_xml(fixtures_url.format(session, sport_id, comp_id, ''))
                for season in comp_xml.findall('.//Season'):
                    season_id = season.attrib.get('SeasonID')
                    # a season without an id cannot be matched against a year
                    if not season_id:
                        continue
                    if not any(tag in season_id for tag in season_tags):
                        continue

                    fixture_xml = self._request_xml(
                        fixtures_url.format(session, sport_id, comp_id, season_id))
                    logger.info('Parsing {}/{} {}/{}'.format(sport_id, sport_name, comp_id, comp_name))
                    items = parser.parse(
                        {
                            'fixture_xml': fixture_xml,
                            'sport_id': sport_id,
                            'sport_name': sport_name,
                            'comp_name': comp_name,
                            'comp_id': comp_id
                        }, provider)
                    if len(items) > 0:
                        yield items

    def _request_xml(self, url):
        """Fetch ``url`` with ``_request`` and return the parsed XML root element."""
        return ET.fromstring(self._request(url))

    def _request(self, url):
        """Fetch ``url`` with an HTTP GET and return the raw response content.

        :param url: address to fetch
        :return: ``response.content`` of the answer
        :raises IngestApiError: when the request fails
        :raises LookupError: when the answer has status code 404
        """
        try:
            response = requests.get(url, params={}, timeout=120)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found')

        return response.content
Example #26
0
class RitzauFeedingService(FeedingService):
    """
    Feeding Service class which can retrieve articles from Ritzau web service
    """

    NAME = 'ritzau'
    ERRORS = [
        IngestApiError.apiRequestError().get_error_description(),
        SuperdeskIngestError.notConfiguredError().get_error_description()
    ]

    def _update(self, provider, update):
        """Fetch the pending news from the Ritzau web service and parse them.

        When no URL override is configured the request asks the service to
        wait for acknowledgement and every parsed ``RBNews`` element is
        acknowledged with its ``ServiceQueueId``.

        :param provider: ingest provider; ``provider['config']`` must contain
            ``username`` and ``password`` and may contain ``url``
        :param update: not used by this method
        :return: a one-element list holding the list of parsed items
        :raises SuperdeskIngestError: if credentials are missing or the
            configured URL does not start with ``http``
        :raises IngestApiError: if a request fails, the answer is not valid
            XML, the service reports an error code or an element is missing
        """
        try:
            config = provider['config']
            user = config['username']
            password = config['password']
        except KeyError:
            # the error has to be raised, the values below are unbound otherwise
            raise SuperdeskIngestError.notConfiguredError(
                Exception('username and password are needed'))

        url_override = config.get('url', '').strip()
        # an empty value means "no override" and is valid
        if url_override and not url_override.startswith('http'):
            raise SuperdeskIngestError.notConfiguredError(
                Exception('if URL is set, it must be a valid http link'))

        params = {'user': user, 'password': password, 'maksAntal': 50}
        if not url_override:
            # acknowledgement is only requested when talking to the default URL
            params['waitAcknowledge'] = 'true'

        try:
            r = requests.get(url_override or URL, params=params)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while doing the request'))

        try:
            root_elt = etree.fromstring(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error while parsing the request answer'))

        try:
            if root_elt.xpath('(//error/text())[1]')[0] != '0':
                err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
                raise IngestApiError.apiRequestError(
                    Exception('error code returned by API: {msg}'.format(
                        msg=err_msg)))
        except IndexError:
            raise IngestApiError.apiRequestError(
                Exception('Invalid XML, <error> element not found'))

        parser = self.get_feed_parser(provider)
        items = []
        for elt in root_elt.xpath('//RBNews'):
            item = parser.parse(elt, provider)
            items.append(item)
            if not url_override:
                try:
                    queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
                except IndexError:
                    raise IngestApiError.apiRequestError(
                        Exception('missing ServiceQueueId element'))
                ack_params = {
                    'user': user,
                    'password': password,
                    'servicequeueid': queue_id
                }
                try:
                    requests.get(URL_ACK, params=ack_params)
                except Exception:
                    raise IngestApiError.apiRequestError(
                        Exception('error while doing the request'))

        return [items]
Example #27
0
    def _update(self, provider, update):
        """Query the API, push the pictures of the answer to FTP and store the paging state.

        The request uses the ``next_link`` saved in the provider config when
        there is one; otherwise a query is built from ``productList`` and
        ``recoverytime``. Every item of type ``picture`` in the answer is
        downloaded and stored on the configured FTP server. Finally the next
        link, a cleared recovery time and the file sequence number are written
        to ``update['config']``.

        :param provider: ingest provider dict
        :param update: dict receiving the updated provider config
        :return: an empty list when the next link did not change, ``None`` otherwise
        :raises IngestApiError: if the answer is not valid JSON
        """
        config = provider.get('config', {})
        self.HTTP_URL = config.get('api_url', '')
        self.provider = provider

        # Set the apikey parameter we're going to use it on all calls
        params = {'apikey': config.get('apikey')}

        # NOTE(review): both requests below are sent with verify=False, i.e.
        # without certificate verification - confirm this is still required.
        # Use the next link if one is available in the config
        if config.get('next_link'):
            r = self.get_url(url=config.get('next_link'),
                             params=params,
                             verify=False,
                             allow_redirects=True)
            r.raise_for_status()
        else:
            id_list = config.get('productList', '').strip()
            recovery_time = config.get('recoverytime', '1').strip()
            if recovery_time == '':
                recovery_time = '1'
            start = (utcnow() - timedelta(hours=int(recovery_time))).isoformat()[:19] + 'Z'
            # If there has been a list of products defined then we format them for the request, if not all
            # allowed products will be returned.
            if id_list:
                # we remove spaces and empty values from id_list to do a clean list
                id_list = ' OR '.join(
                    [id_.strip() for id_ in id_list.split(',') if id_.strip()])
                params['q'] = 'productid:(' + id_list + ') AND mindate:>{}'.format(start)
            else:
                params['q'] = 'mindate:>{}'.format(start)
            params['page_size'] = '100'

            r = self.get_url(params=params, verify=False, allow_redirects=True)
            r.raise_for_status()

        try:
            response = json.loads(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception('error parsing response'))

        nextLink = response.get('data', {}).get('next_page')
        # Got the same next link as last time so nothing new
        if nextLink == config.get('next_link'):
            logger.info('Nothing new from AP Media')
            return []

        # Bind the sequence number before the items branch: it is saved below
        # even when the answer carries no items.
        try:
            sequence_number = int(config.get('sequence', 0))
        except (TypeError, ValueError) as ex:
            logger.exception(ex)
            sequence_number = 0

        items = response.get('data', {}).get('items', [])
        if len(items) > 0:
            try:
                with ftp_connect({
                        'username': config.get('ftp_user', ''),
                        'password': config.get('ftp_password', ''),
                        'host': config.get('ftp_server', ''),
                        'path': config.get('ftp_path', '')
                }) as ftp:
                    for item in items:
                        try:
                            if item['item']['type'] != 'picture':
                                continue

                            rendition = item['item']['renditions']['main']
                            image_ref = rendition['href']
                            if config.get('filenametemplate', '') == '':
                                filename = to_ascii(rendition['originalfilename'])
                            else:
                                # The filename is generated by applying the date format string in the template
                                filename = datetime.now().strftime(config.get('filenametemplate', ''))
                                # and appending the sequence number
                                filename += '-' + str(sequence_number).zfill(4) + '.jpg'
                                sequence_number = (sequence_number + 1) % 10000

                            logger.info('file: {} versioncreated: {}'.format(
                                filename, item['item']['versioncreated']))
                            image = requests.get(url=image_ref,
                                                 params={'apikey': config.get('apikey')})
                            image.raise_for_status()
                            try:
                                ftp.storbinary('STOR {}'.format(filename), BytesIO(image.content))
                            except ftplib.all_errors as e:
                                logger.error(e)

                        # Any exception processing an individual item is swallowed
                        except Exception as ex:
                            logger.exception(ex)
            except Exception as ex:
                logger.exception(ex)

        # Save the link for next time
        config['next_link'] = nextLink
        config['recoverytime'] = ''
        config['sequence'] = str(sequence_number)
        update['config'] = config

        return None
Example #28
0
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by Reuters.

    For every subscribed channel the service requests the ids of the available
    items (using the poll token stored for the channel when there is one, a
    date range otherwise) and then fetches and parses each of those items.
    """

    NAME = 'reuters_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    # strftime pattern used to build the ``dateRange`` request parameter
    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def _update(self, provider, update=None):
        """
        Yield the items ingested from every subscribed channel.

        :param provider: ingest provider document
        :type provider: dict
        :param update: not used by this service; optional so the method can be
            called both as ``_update(provider)`` and ``_update(provider, update)``
        :return: generator yielding one non-empty list of items per article id
        """
        updated = utcnow()

        # never look further back than the ingest expiry window
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        if not last_updated or last_updated < updated - datetime.timedelta(
                minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config

        # fill in the default endpoints when they are not configured
        if 'url' not in provider_config:
            provider_config['url'] = 'http://rmb.reuters.com/rmd/rest/xml'

        if 'auth_url' not in provider_config:
            provider_config[
                'auth_url'] = 'https://commerce.reuters.com/rmd/rest/xml/login'

        self.URL = provider_config.get('url')

        for channel in self._get_channels():
            ids = self._get_article_ids(channel, last_updated, updated)
            for item_id in ids:
                try:
                    items = self.fetch_ingest(item_id)
                    if items:
                        yield items
                # if there was an exception processing one of the bunch, log it and continue
                except Exception as ex:
                    logger.warning(
                        'Reuters item {} has not been retrieved'.format(
                            item_id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get the aliases of the subscribed channels."""
        tree = self._get_tree('channels')
        return [
            channel.find('alias').text
            for channel in tree.findall('channelInformation')
        ]

    def _get_tree(self, endpoint, payload=None):
        """
        Get xml response for given API endpoint and payload.

        The auth token is added to the request parameters; the dict given by
        the caller is left untouched.

        :param endpoint: API endpoint, appended to the service URL
        :type endpoint: str
        :param payload: request parameters
        :type payload: dict
        :return: root element of the parsed XML response
        :raises IngestApiError: if the request fails or the response cannot be parsed
        :raises LookupError: if the API answers with a 404
        """
        # work on a copy so the caller's dict is not modified
        payload = dict(payload) if payload else {}

        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        try:
            response = requests.get(url, params=payload, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        try:
            return etree.fromstring(
                response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """
        Get absolute URL for given endpoint.

        :param endpoint: API endpoint
        :type endpoint: str
        :return: ``self.URL`` and the endpoint joined with a slash
        """
        return '/'.join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """
        Get article ids which should be upserted, also save the poll token that is returned.

        :param channel: channel alias
        :param last_updated: start of the date range used when no poll token is stored
        :param updated: end of the date range used when no poll token is stored
        :return: set of item ids; empty when the request failed, when no poll
            token was returned or when the poll token did not change
        """
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id'}

        # check if the channel has a pollToken if not fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info(
                "Reuters requesting channel {} with poll token {}".format(
                    channel, last_poll_token))
            payload['pollToken'] = last_poll_token
        else:
            payload['dateRange'] = "%s-%s" % (self._format_date(last_updated),
                                              self._format_date(updated))
            logger.info(
                "Reuters requesting channel {} with dateRange {}".format(
                    channel, payload['dateRange']))

        tree = self._get_tree('items', payload)
        status_code = tree.find('status').get(
            'code') if tree.tag == 'results' else tree.get('code')
        # check the returned status
        if status_code != '10':
            logger.warning(
                "Reuters channel request returned status code {}".format(
                    status_code))
            # status code 30 indicates failure
            if status_code == '30':
                error = tree.find('error')
                logger.warning("Reuters error on channel {} code {} {}".format(
                    channel, error.get('code'), error.text))
                # error 2100: the poll token is invalid, drop it
                if error.get('code') == '2100':
                    self._save_poll_token(channel, None)
                    logger.warning(
                        "Reuters channel invalid token reseting {}".format(
                            status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find('pollToken')
        if poll_token is not None:
            # a new token indicated new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(
                    channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info(
                "Reuters channel {} retrieved no token".format(channel))
            return ids

        for result in tree.findall('result'):
            item_id = result.find('id').text
            ids.add(item_id)
            logger.info("Reuters id : {}".format(item_id))

        return ids

    def _save_poll_token(self, channel, poll_token):
        """
        Save the poll token for the passed channel in the ``tokens`` section
        of the stored ingest provider.

        :param channel: channel alias
        :param poll_token: token to store, None to reset it
        :return: None
        """
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service(
            'ingest_providers')
        provider = ingest_provider_service.find_one(
            req=None, _id=self.provider[superdesk.config.ID_FIELD])
        # "tokens" may be missing (or None) on the stored provider
        provider_token = provider.get('tokens') or {}
        provider_token.setdefault('poll_tokens', {})[channel] = poll_token
        upd_provider = {'tokens': provider_token}
        ingest_provider_service.system_update(
            self.provider[superdesk.config.ID_FIELD], upd_provider,
            self.provider)

    def _get_poll_token(self, channel):
        """
        Get the poll token from the provider ``tokens`` if it is available.

        :param channel: channel alias
        :return: token, or None when no token is stored for the channel
        """
        tokens = self.provider.get('tokens') or {}
        poll_tokens = tokens.get('poll_tokens') or {}
        return poll_tokens.get(channel)

    def _format_date(self, date):
        """Format ``date`` with ``DATE_FORMAT``."""
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        """
        Fetch the item with the given id and the items of its packages.

        :param id: id of the item to fetch
        :return: list of fetched items, empty if one of them could not be found
        """
        items = self._parse_items(id)
        result_items = []
        while items:
            item = items.pop()
            self.add_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                # one missing item discards the whole result
                self.log_item_error(err, item, self.provider)
                return []

        return result_items

    def _parse_items(self, id):
        """
        Parse item message and return given items.

        :param id: id of the item to request
        """
        tree = self._get_tree('item', {'id': id})

        parser = self.get_feed_parser(self.provider, tree)
        return parser.parse(tree, self.provider)

    def _fetch_items_in_package(self, item):
        """
        Fetch remote assets for given item.

        :param item: parsed item, possibly a package with ``groups``
        :return: list of the items referenced by a ``residRef``
        """
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self._parse_items(ref.get('residRef')))

        return items

    def prepare_href(self, href, mimetype=None):
        """
        Return ``href`` with params, query and fragment replaced by the auth token.

        :param href: original URL
        :param mimetype: not used
        """
        (scheme, netloc, path, params, query, fragment) = urlparse(href)
        new_href = urlunparse((scheme, netloc, path, '', '', ''))
        return '%s?auth_token=%s' % (
            new_href, self._get_auth_token(self.provider, update=True))
Example #29
0
    def _update(self, provider, update):
        """Request the items of the configured id list and parse them.

        The ``min_date_time`` and ``sequence_number`` returned by the service
        are stored in ``update["private"]`` so the next call can resume from
        there in chronological order. Without them (new or re-opened provider)
        the items are requested without sort order and the parsed list is
        reversed.

        :param dict provider: ingest provider, its config must contain ``idList``
        :param dict update: provider update, receives the ``private`` data
        :return list: a list containing the list of parsed items
        :raises SuperdeskIngestError: if ``idList`` is missing or blank
        :raises IngestApiError: if the response can't be parsed or lacks
            minDateTime/transmitId
        """
        try:
            config = provider["config"]
            raw_ids = config["idList"]
            # "products" used to be the hardcoded value of "idListType"
            list_type = config.get("idListType", "products")
            if not raw_ids.strip():
                # a blank list is handled like a missing one
                raise KeyError
        except KeyError:
            raise SuperdeskIngestError.notConfiguredError(Exception("idList is needed"))

        # has the provider been closed since the last update?
        try:
            closed_at = provider["last_closed"]["closed_at"]
            updated_at = provider["last_updated"]
        except KeyError:
            pass
        else:
            if closed_at > updated_at and "private" in provider:
                # private data are dropped so only the last page of items is retrieved (cf. SDESK-4372)
                logger.info("reseting private data for provider {source}".format(source=provider.get("source")))
                del provider["private"]

        # clean list: no surrounding spaces, no empty values
        clean_ids = ",".join(part for part in map(str.strip, raw_ids.split(",")) if part)

        params = dict(idList=clean_ids, idListType=list_type, format="5", maxItems="25")

        try:
            private = provider["private"]
            params.update(
                minDateTime=private["min_date_time"],
                sequenceNumber=private["sequence_number"],
                sortOrder="chronological",
            )
        except KeyError:
            # new or re-opened provider: we want the last items,
            # which means reverse-chronological order
            chronological = False
        else:
            chronological = True

        response = self.get_url(params=params)

        try:
            root_elt = etree.fromstring(response.content)
        except Exception:
            raise IngestApiError.apiRequestError(Exception("error while doing the request"))

        items = self.get_feed_parser(provider).parse(root_elt, provider)
        if not chronological:
            items.reverse()

        try:
            new_min_date_time = root_elt.xpath(
                '//iptc:timestamp[@role="minDateTime"]/text()', namespaces=NS
            )[0].strip()
            new_sequence_number = root_elt.xpath("//iptc:transmitId/text()", namespaces=NS)[0].strip()
        except IndexError:
            raise IngestApiError.apiRequestError(Exception("missing minDateTime or transmitId"))

        # remember where to resume on the next update
        saved = update.setdefault("private", {})
        saved["min_date_time"] = new_min_date_time
        saved["sequence_number"] = new_sequence_number

        return [items]
class HTTPFeedingServiceBase(FeedingService):
    """
    Base class for feeding services using HTTP.

    This class contains helpers to make the creation of HTTP based feeding services
    easier.

    There are a couple of class attributes you can use:

    =======================  ===========
    Attribute                Explanation
    =======================  ===========
    HTTP_URL                 Main URL of your service, will be used by default in get_url
    HTTP_TIMEOUT             Timeout of requests in seconds
    HTTP_DEFAULT_PARAMETERS  Parameters used in every ``get`` requests.
                             Will be updated with params set in arguments
    HTTP_AUTH                Indicate if HTTP authentication is needed for your service.
                             If None, the authentication will be determined by the existence
                             of user and password. Will be overridden by auth_required config
                             if it exists.
    =======================  ===========

    In addition, you have some pre-filled fields:

    ===============  ===========
    Field            Explanation
    ===============  ===========
    AUTH_FIELDS      username and password fields
    AUTH_REQ_FIELDS  username and password fields + auth_required field to indicate
                     if they are needed
    ===============  ===========

    When ingest is updated, the provider is automatically saved to ``self.provider``.
    ``config`` property allows to access easily the user configuration.
    ``auth_info`` property returns a dictionary with ``username`` and ``password``

    ``get_url`` method does a HTTP Get request. url can be omitted in which case HTTP_URL will be used.
    Authentication parameters are set automatically, and errors are caught appropriately.
    Extra arguments are used directly in *requests* call.

    """

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
        SuperdeskIngestError.notConfiguredError().get_error_description()
    ]

    # override this parameter with the main URL to use
    HTTP_URL = None
    # timeout in seconds
    HTTP_TIMEOUT = 30
    # if some parameters are used in every request, put them here
    HTTP_DEFAULT_PARAMETERS = None
    # Set to True if authentication is mandatory, False if there is no authentication
    # and None to add authentication if user and password are defined.
    # If auth_required is defined in config fields, it will override this value.
    HTTP_AUTH = True

    # use this when auth is always required
    AUTH_FIELDS = [{
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required': True
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required': True
    }]

    # use this when auth depends of a "auth_required" flag (set by user)
    AUTH_REQ_FIELDS = [{
        'id': 'auth_required',
        'type': 'boolean',
        'label': 'Requires Authentication',
        'placeholder': 'Requires Authentication',
        'required': False
    }, {
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required_expression': '{auth_required}',
        'show_expression': '{auth_required}'
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required_expression': '{auth_required}',
        'show_expression': '{auth_required}'
    }]

    def __init__(self):
        self.token = None

    @property
    def auth_info(self):
        """Helper method to retrieve a dict with username and password when set

        :return: dict with ``username`` and ``password``, None if one of them is empty
        """
        username = self.config.get('username', '')
        password = self.config.get('password', '')
        if not username or not password:
            return None
        return {'username': username, 'password': password}

    @property
    def config(self):
        """Provider configuration; an empty dict is stored on the provider if there is none"""
        return self.provider.setdefault('config', {})

    def validate_config(self):
        """
        Validate provider config according to `cls.fields`

        Every field flagged as required must be present in the config. The
        ``url`` value may be absent or empty unless the ``url`` field is
        required; when set it must start with ``http``.

        :raises SuperdeskIngestError: if the configuration is not valid
        """
        # validate required config fields
        required_keys = [
            field['id'] for field in self.fields
            if field.get('required', False)
        ]
        if not set(self.config.keys()).issuperset(required_keys):
            raise SuperdeskIngestError.notConfiguredError(
                Exception('{} are required.'.format(', '.join(required_keys))))

        # "url" can be missing from the config (None), e.g. when HTTP_URL is used
        url = (self.config.get('url') or '').strip()
        if not url:
            # next() needs an iterator: use a generator and a default value
            url_field = next(
                (f for f in self.fields if f['id'] == 'url'), None)
            url_required = (url_field.get('required', False)
                            if url_field is not None else False)
            if url_required:
                raise SuperdeskIngestError.notConfiguredError(
                    Exception('URL is a required field.'))
        elif not url.startswith('http'):
            # validate url
            raise SuperdeskIngestError.notConfiguredError(
                Exception('URL must be a valid HTTP link.'))

    def get_url(self, url=None, **kwargs):
        """Do an HTTP Get on URL

        ``HTTP_DEFAULT_PARAMETERS`` are merged with the ``params`` argument,
        the argument taking precedence. ``HTTP_TIMEOUT`` is used unless a
        ``timeout`` argument is given.

        :param string url: url to use (None to use self.HTTP_URL)
        :param **kwargs: extra parameter for requests
        :return requests.Response: response
        :raises SuperdeskIngestError: if authentication is required but user
            or password is not configured
        :raises IngestApiError: if the request fails or the response is not ok
        """
        if not url:
            url = self.HTTP_URL
        config = self.config
        user = config.get('username')
        password = config.get('password')
        if user:
            user = user.strip()
        if password:
            password = password.strip()

        auth_required = config.get('auth_required', self.HTTP_AUTH)
        if auth_required is None:
            # auth_required may not be used in the feeding service
            # in this case we use authentication only if user
            # and password are set.
            auth_required = bool(user and password)

        if auth_required:
            if not user:
                raise SuperdeskIngestError.notConfiguredError(
                    "user is not configured")
            if not password:
                raise SuperdeskIngestError.notConfiguredError(
                    "password is not configured")
            kwargs.setdefault('auth', (user, password))

        params = kwargs.pop("params", None)
        if self.HTTP_DEFAULT_PARAMETERS:
            # default parameters are overridden by conflicting params given
            # in arguments; a new dict is built so neither the defaults nor
            # the caller's params are modified
            merged = dict(self.HTTP_DEFAULT_PARAMETERS)
            if params:
                merged.update(params)
            params = merged
        if params:
            kwargs["params"] = params

        # a timeout given by the caller takes precedence over HTTP_TIMEOUT
        kwargs.setdefault('timeout', self.HTTP_TIMEOUT)

        try:
            response = requests.get(url, **kwargs)
        except requests.exceptions.Timeout as exception:
            raise IngestApiError.apiTimeoutError(
                exception, self.provider) from exception
        except requests.exceptions.ConnectionError as exception:
            raise IngestApiError.apiConnectionError(
                exception, self.provider) from exception
        except requests.exceptions.RequestException as exception:
            raise IngestApiError.apiRequestError(
                exception, self.provider) from exception
        except Exception as exception:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(
                exception, self.provider) from exception

        if not response.ok:
            exception = Exception(response.reason)
            if response.status_code in (401, 403):
                raise IngestApiError.apiAuthError(exception, self.provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(exception, self.provider)
            else:
                raise IngestApiError.apiGeneralError(exception, self.provider)

        return response

    def update(self, provider, update):
        """Store the provider, validate its config then run the parent update

        :param provider: ingest provider, saved to ``self.provider``
        :param update: passed unchanged to the parent ``update``
        :return: result of the parent ``update``
        """
        self.provider = provider
        self.validate_config()
        return super().update(provider, update)
    def get_url(self, url=None, **kwargs):
        """Do an HTTP Get on URL

        ``HTTP_DEFAULT_PARAMETERS`` are merged with the ``params`` argument,
        the argument taking precedence. ``HTTP_TIMEOUT`` is used unless a
        ``timeout`` argument is given.

        :param string url: url to use (None to use self.HTTP_URL)
        :param **kwargs: extra parameter for requests
        :return requests.Response: response
        :raises SuperdeskIngestError: if authentication is required but user
            or password is not configured
        :raises IngestApiError: if the request fails or the response is not ok
        """
        if not url:
            url = self.HTTP_URL
        config = self.config
        user = config.get('username')
        password = config.get('password')
        if user:
            user = user.strip()
        if password:
            password = password.strip()

        auth_required = config.get('auth_required', self.HTTP_AUTH)
        if auth_required is None:
            # auth_required may not be used in the feeding service
            # in this case we use authentication only if user
            # and password are set.
            auth_required = bool(user and password)

        if auth_required:
            if not user:
                raise SuperdeskIngestError.notConfiguredError(
                    "user is not configured")
            if not password:
                raise SuperdeskIngestError.notConfiguredError(
                    "password is not configured")
            kwargs.setdefault('auth', (user, password))

        params = kwargs.pop("params", None)
        if self.HTTP_DEFAULT_PARAMETERS:
            # default parameters are overridden by conflicting params given
            # in arguments; a new dict is built so neither the defaults nor
            # the caller's params are modified
            merged = dict(self.HTTP_DEFAULT_PARAMETERS)
            if params:
                merged.update(params)
            params = merged
        if params:
            kwargs["params"] = params

        # a timeout given by the caller takes precedence over HTTP_TIMEOUT
        kwargs.setdefault('timeout', self.HTTP_TIMEOUT)

        try:
            response = requests.get(url, **kwargs)
        except requests.exceptions.Timeout as exception:
            raise IngestApiError.apiTimeoutError(
                exception, self.provider) from exception
        except requests.exceptions.ConnectionError as exception:
            raise IngestApiError.apiConnectionError(
                exception, self.provider) from exception
        except requests.exceptions.RequestException as exception:
            raise IngestApiError.apiRequestError(
                exception, self.provider) from exception
        except Exception as exception:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(
                exception, self.provider) from exception

        if not response.ok:
            exception = Exception(response.reason)
            if response.status_code in (401, 403):
                raise IngestApiError.apiAuthError(exception, self.provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(exception, self.provider)
            else:
                raise IngestApiError.apiGeneralError(exception, self.provider)

        return response
Example #32
0
class EventHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read events using HTTP.

    The feed found at the configured URL is downloaded and handed to the feed
    parser of the provider.
    """

    NAME = 'event_http'
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    label = 'Event HTTP feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    fields = [
        {
            'id': 'url',
            'type': 'text',
            'label': 'Feed URL',
            'placeholder': 'Feed URL',
            'required': True,
        },
    ]

    def _update(self, provider, update):
        """
        Download the feed and yield the parsed items as a single list.

        :param provider: ingest provider document, its config holds the feed ``url``
        :param update: not used by this method
        :return: generator yielding one list of items
        :raises IngestApiError: if the HTTP request fails
        :raises LookupError: if the feed URL answers with a 404
        """
        now = utcnow()

        last_updated = provider.get('last_updated')
        oldest_allowed = now - datetime.timedelta(
            minutes=app.config['INGEST_EXPIRY_MINUTES'])
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        self.provider = provider
        config = provider.get('config')
        if not config:
            # make sure the provider always carries a config dict
            config = provider['config'] = {}

        self.URL = config.get('url')
        query = {}

        parser = self.get_feed_parser(provider)

        try:
            response = requests.get(self.URL, params=query, timeout=15)
            # TODO: check if file has been updated since provider last_updated
            # although some providers do not include 'Last-Modified' in headers
            # so unsure how to do this
            logger.info('Http Headers: %s', response.headers)
        except requests.exceptions.Timeout as err:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(err, self.provider)
        except requests.exceptions.TooManyRedirects as err:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(err, self.provider)
        except requests.exceptions.RequestException as err:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(err, self.provider)
        except Exception as err:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(err, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % query)

        body = response.content
        logger.info('Ingesting: %s', str(body))

        # each parser expects the feed in its own representation
        if isinstance(parser, NTBEventXMLFeedParser):
            items = parser.parse(ET.fromstring(body), provider)
        elif isinstance(parser, IcsTwoFeedParser):
            items = parser.parse(Calendar.from_ical(body), provider)
        else:
            items = parser.parse(body)

        # a list of items is always yielded, even for a single parsed item
        yield items if isinstance(items, list) else [items]
Example #33
0
    def _update(self, provider, update):
        """Fetch the AP Media feed and return the parsed items.

        The request uses the ``next_link`` stored in the provider config when
        there is one, otherwise a search starting ``recoverytime`` hours ago
        (1 by default), restricted to ``productList`` when it is set. For each
        returned item the complete metadata, the NITF rendition and the
        associations are retrieved before parsing. The new next link is stored
        in ``update["config"]``.

        :param dict provider: ingest provider
        :param dict update: provider update, receives the modified config
        :return list: ``[parsed_items]``, or ``[]`` when the next link did not change
        :raises IngestApiError: if the feed response is not valid JSON
        """
        config = provider.get("config", {})
        self.HTTP_URL = config.get("api_url", "")
        self.provider = provider

        # The apikey parameter is sent with every call
        params = {"apikey": config.get("apikey")}

        # Use the next link if one is available in the config
        stored_link = config.get("next_link")
        if stored_link:
            r = self.get_url(url=stored_link,
                             params=params,
                             verify=False,
                             allow_redirects=True)
        else:
            product_list = config.get("productList", "").strip()
            recovery_time = config.get("recoverytime", "1")
            recovery_time = recovery_time.strip() if recovery_time else ""
            if recovery_time == "":
                recovery_time = "1"
            since = utcnow() - timedelta(hours=int(recovery_time))
            start = datetime.strftime(since, "%Y-%m-%dT%H:%M:%SZ")
            # When a list of products is defined it is formatted for the request,
            # otherwise all allowed products will be returned.
            if product_list:
                # spaces and empty values are dropped to get a clean list
                products = " OR ".join(
                    part.strip() for part in product_list.split(",")
                    if part.strip())
                params["q"] = "productid:({}) AND mindate:>{}".format(
                    products, start)
            else:
                params["q"] = "mindate:>{}".format(start)
            params["page_size"] = "100"
            params["versions"] = "all"

            logger.info("AP Media Start/Recovery time: {} params {}".format(
                recovery_time, params))
            r = self.get_url(params=params, verify=False, allow_redirects=True)
        r.raise_for_status()

        try:
            response = json.loads(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(
                Exception("error parsing response"))

        feed = response.get("data", {})
        nextLink = feed.get("next_page")
        # Got the same next link as last time so nothing new
        if nextLink == provider.get("config", {}).get("next_link"):
            logger.info("Nothing new from AP Media")
            return []

        parser = self.get_feed_parser(provider)
        parsed_items = []
        for item in feed.get("items", []):
            try:
                summary = item.get("item", {})
                uri = summary.get("uri")

                # Get the item meta data
                logger.info('Get AP meta data for "{}" uri: {}'.format(
                    summary.get("headline"), uri))
                complete_item = json.loads(self.api_get(uri, provider).text)

                # Get the nitf rendition of the item
                renditions = complete_item.get("data", {}).get(
                    "item", {}).get("renditions", {})
                nitf_ref = renditions.get("nitf", {}).get("href")
                if nitf_ref:
                    logger.info("Get AP nitf : {}".format(nitf_ref))
                    nitf_response = self.api_get(nitf_ref, provider)
                    root_elt = etree.fromstring(nitf_response.content)
                    complete_item["nitf"] = nitf.NITFFeedParser().parse(
                        root_elt)
                elif summary.get("type") == "text":
                    logger.warning("No NITF for story {}".format(uri))

                associations = complete_item["data"]["item"].get(
                    "associations")
                if associations:
                    fetched = complete_item["associations"] = {}
                    for key, assoc in associations.items():
                        logger.info('Get AP association "%s"',
                                    assoc.get("headline"))
                        try:
                            fetched[key] = self.api_get(
                                assoc["uri"], provider).json()
                        except IngestApiError:
                            # an association that can't be fetched is skipped
                            logger.warning("Could not fetch AP association",
                                           extra=assoc)

                parsed_items.append(parser.parse(complete_item, provider))

            # Any exception processing an individual item is swallowed
            except Exception as ex:
                logger.exception(ex)

        # Save the link for next time
        upd_provider = provider.get("config")
        upd_provider["next_link"] = nextLink
        upd_provider["recoverytime"] = None
        update["config"] = upd_provider

        return [parsed_items]
Example #34
0
    def _update(self, provider, update):
        """Fetch new items from the AP Media API and return them parsed.

        When the provider config holds a ``next_link`` the feed is resumed from it;
        otherwise a search is issued for everything newer than ``recoverytime`` hours
        ago (one hour when unset), optionally restricted to the configured
        ``productList``. For each item of the feed response the item metadata is
        fetched and, if it references a NITF rendition, that rendition is fetched and
        parsed too before the item is handed to the feed parser.

        :param provider: ingest provider document; ``api_url``, ``apikey``, ``next_link``,
            ``productList`` and ``recoverytime`` are read from its ``config``
        :param update: dict of provider updates; receives the ``config`` holding the new
            next page link and a cleared ``recoverytime``
        :return: a list containing the list of parsed items, or an empty list when the
            feed reports the same next page link as last time
        :raises IngestApiError: if the feed response cannot be decoded as JSON
        """
        config = provider.get('config')
        if config is None:
            config = {}
        self.HTTP_URL = config.get('api_url', '')
        self.provider = provider

        # The apikey parameter is sent on all calls
        apikey = config.get('apikey')
        params = {'apikey': apikey}

        # NOTE(review): every request below passes verify=False, which presumably turns off
        # TLS certificate verification in get_url - confirm this is intended.
        previous_link = config.get('next_link')
        if previous_link:
            # Use the next link if one is available in the config
            r = self.get_url(url=previous_link, params=params, verify=False, allow_redirects=True)
            r.raise_for_status()
        else:
            id_list = (config.get('productList') or '').strip()
            # str() keeps a numeric config value from breaking strip(); an empty or
            # missing value falls back to one hour
            recovery_time = str(config.get('recoverytime') or '').strip() or '1'
            start = datetime.strftime(utcnow() - timedelta(hours=int(recovery_time)), '%Y-%m-%dT%H:%M:%SZ')
            # If there has been a list of products defined then we format them for the request, if not all
            # allowed products will be returned.
            if id_list:
                # we remove spaces and empty values from id_list to do a clean list
                id_list = ' OR '.join([id_.strip() for id_ in id_list.split(',') if id_.strip()])
                params['q'] = 'productid:(' + id_list + ') AND mindate:>{}'.format(start)
            else:
                params['q'] = 'mindate:>{}'.format(start)
            params['page_size'] = '100'
            params['versions'] = 'all'

            # The API key is a secret, keep it out of the log
            loggable_params = {key: value for key, value in params.items() if key != 'apikey'}
            logger.info('AP Media Start/Recovery time: {} params {}'.format(recovery_time, loggable_params))
            r = self.get_url(params=params, verify=False, allow_redirects=True)
            r.raise_for_status()

        try:
            response = json.loads(r.text)
        except Exception as ex:
            raise IngestApiError.apiRequestError(Exception('error parsing response')) from ex

        next_link = response.get('data', {}).get('next_page')
        # Got the same next link as last time so nothing new
        if next_link == previous_link:
            logger.info('Nothing new from AP Media')
            return []

        parser = self.get_feed_parser(provider)
        parsed_items = []
        for item in response.get('data', {}).get('items', []):
            try:
                item_info = item.get('item', {})
                uri = item_info.get('uri')

                # Get the item meta data
                r = self.get_url(url=uri, params={'apikey': apikey}, verify=False, allow_redirects=True)
                logger.info('Get AP meta data for "{}" uri: {}'.format(item_info.get('headline'), uri))
                r.raise_for_status()
                complete_item = json.loads(r.text)

                # Get the nitf rendition of the item
                nitf_ref = complete_item.get('data', {}).get('item', {}).get('renditions', {}).get('nitf', {}).get(
                    'href')
                if nitf_ref:
                    logger.info('Get AP nitf : {}'.format(nitf_ref))
                    r = self.get_url(url=nitf_ref, params={'apikey': apikey}, verify=False, allow_redirects=True)
                    r.raise_for_status()
                    root_elt = etree.fromstring(r.content)
                    complete_item['nitf'] = nitf.NITFFeedParser().parse(root_elt)
                elif item_info.get('type') == 'text':
                    logger.warning('No NITF for story {}'.format(uri))

                parsed_items.append(parser.parse(complete_item, provider))

            # Any exception processing an individual item is logged and swallowed so that
            # one bad item does not stop the rest of the page from being ingested
            except Exception as ex:
                logger.exception(ex)

        # Save the link for next time; an existing provider config dict is updated in place
        config['next_link'] = next_link
        config['recoverytime'] = None
        update['config'] = config

        return [parsed_items]
Example #35
0
class RitzauFeedingService(HTTPFeedingServiceBase):
    """
    Feeding Service class which can retrieve articles from Ritzau web service
    """

    NAME = 'ritzau'

    ERRORS = [IngestApiError.apiRequestError().get_error_description(),
              SuperdeskIngestError.notConfiguredError().get_error_description()]

    label = 'Ritzau feed API'

    fields = HTTPFeedingServiceBase.AUTH_FIELDS + [
        {
            'id': 'url', 'type': 'text', 'label': 'URL',
            'placeholder': 'fill this field only for advanced uses', 'required': False
        }
    ]

    HTTP_URL = 'https://services.ritzau.dk/ritzaurest/Services.svc/xml/news/NewsQueue'
    # auth is done with params
    HTTP_AUTH = False

    def _update(self, provider, update):
        """Fetch the queued news from the Ritzau service and parse every ``RBNews`` element.

        When no URL override is configured the queue is requested with
        ``waitAcknowledge`` and each item is acknowledged after it has been parsed.

        :param provider: ingest provider document, handed to the feed parser
        :param update: dict of provider updates (not modified here)
        :return: a list containing the list of parsed items
        :raises SuperdeskIngestError: if username/password are missing from the config or
            the configured URL does not start with ``http``
        :raises IngestApiError: if the answer is not valid XML, reports an error code,
            lacks the ``<error>`` element, or an item has no ``ServiceQueueId``
        """
        config = self.config
        try:
            user, password = config['username'], config['password']
        except KeyError:
            raise SuperdeskIngestError.notConfiguredError(Exception('username and password are needed'))

        # The URL is optional: only a non-empty value has to look like an http(s) link
        url_override = (config.get('url') or '').strip()
        if url_override and not url_override.startswith('http'):
            raise SuperdeskIngestError.notConfiguredError(Exception('if URL is set, it must be a valid http link'))

        params = {'user': user, 'password': password, 'maksAntal': 50}
        if not url_override:
            # Items are acknowledged one by one below when the default queue is used
            params['waitAcknowledge'] = 'true'

        # NOTE(review): an empty url presumably makes get_url fall back to HTTP_URL - confirm
        r = self.get_url(url_override, params=params)

        try:
            root_elt = etree.fromstring(r.text)
        except Exception as ex:
            raise IngestApiError.apiRequestError(Exception('error while parsing the request answer')) from ex

        try:
            if root_elt.xpath('(//error/text())[1]')[0] != '0':
                err_msg = root_elt.xpath('(//errormsg/text())[1]')[0]
                raise IngestApiError.apiRequestError(Exception('error code returned by API: {msg}'.format(msg=err_msg)))
        except IndexError:
            raise IngestApiError.apiRequestError(Exception('Invalid XML, <error> element not found'))

        parser = self.get_feed_parser(provider)
        items = []
        for elt in root_elt.xpath('//RBNews'):
            item = parser.parse(elt, provider)
            items.append(item)
            if not url_override:
                try:
                    queue_id = elt.xpath('.//ServiceQueueId/text()')[0]
                except IndexError:
                    raise IngestApiError.apiRequestError(Exception('missing ServiceQueueId element'))
                ack_params = {'user': user, 'password': password, 'servicequeueid': queue_id}
                self.get_url(URL_ACK, params=ack_params)

        return [items]
Example #36
0
from superdesk.io.ingest_service import IngestService

from superdesk.utc import utcnow
from superdesk.etree import etree, ParseError
from superdesk.io import register_provider
from .newsml_2_0 import NewsMLTwoParser
from .reuters_token import get_token
from superdesk.errors import IngestApiError
from flask import current_app as app


# Name of this ingest provider
PROVIDER = 'reuters'

# Error descriptions for the API failures listed below, in this order
errors = [
    make_error().get_error_description()
    for make_error in (
        IngestApiError.apiTimeoutError,
        IngestApiError.apiRedirectError,
        IngestApiError.apiRequestError,
        IngestApiError.apiUnicodeError,
        IngestApiError.apiParseError,
        IngestApiError.apiGeneralError,
    )
]


class ReutersIngestService(IngestService):
    """Ingest service for Reuters content."""

    # No token is held until one is assigned
    token = None
    # Address of the Reuters XML REST API
    URL = 'http://rmb.reuters.com/rmd/rest/xml'
    # strftime pattern: year.month.day.hour.minute
    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def __init__(self):
        """Give the service its own NewsMLTwoParser instance."""
        self.parser = NewsMLTwoParser()
Example #37
0
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by Reuters.
    """

    NAME = "reuters_http"

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    # strftime pattern used for the dateRange request parameter
    DATE_FORMAT = "%Y.%m.%d.%H.%M"

    label = "Reuters feed API"

    fields = [
        {
            "id": "url",
            "type": "text",
            "label": "Feed URL",
            "placeholder": "Feed URL",
            "required": True,
            "default": "http://rmb.reuters.com/rmd/rest/xml",
        },
        {
            "id": "auth_url",
            "type": "text",
            "label": "URL for Authentication",
            "placeholder": "authentication url",
            "required": True,
            "default": "https://commerce.reuters.com/rmd/rest/xml/login",
        },
        {"id": "username", "type": "text", "label": "Username", "placeholder": "Username", "required": True},
        {"id": "password", "type": "password", "label": "Password", "placeholder": "Password", "required": True},
    ]

    # requests session, created on the first call of _get_tree
    session = None

    def _update(self, provider, update):
        """Yield the items of every new article on every subscribed channel.

        The date range starts at the provider's ``last_updated`` but never further back
        than ``INGEST_EXPIRY_MINUTES`` before now. A failure on a single article is
        logged and does not stop the remaining ones.

        :param provider: ingest provider document; missing ``url``/``auth_url`` config
            entries are filled with their defaults
        :param update: dict of provider updates (not modified here)
        :return: generator of item lists, one per successfully fetched article
        """
        updated = utcnow()

        last_updated = provider.get("last_updated")
        ttl_minutes = app.config["INGEST_EXPIRY_MINUTES"]
        oldest_allowed = updated - datetime.timedelta(minutes=ttl_minutes)
        if not last_updated or last_updated < oldest_allowed:
            last_updated = oldest_allowed

        self.provider = provider
        provider_config = provider.get("config")
        if not provider_config:
            provider_config = {}
            provider["config"] = provider_config

        provider_config.setdefault("url", "http://rmb.reuters.com/rmd/rest/xml")
        provider_config.setdefault("auth_url", "https://commerce.reuters.com/rmd/rest/xml/login")
        self.URL = provider_config.get("url")

        for channel in self._get_channels():
            ids = self._get_article_ids(channel, last_updated, updated)
            for item_id in ids:
                try:
                    items = self.fetch_ingest(item_id)
                    if items:
                        yield items
                # if there was an exception processing the one of the bunch log it and continue
                except Exception as ex:
                    logger.warning("Reuters item {} has not been retrieved".format(item_id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get subscribed channels.

        :return: list with the ``alias`` text of every ``channelInformation`` element
        """
        tree = self._get_tree("channels")
        return [channel.find("alias").text for channel in tree.findall("channelInformation")]

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        A timed out request is retried up to three times before giving up.

        :param endpoint: endpoint name, appended to the feed URL
        :type endpoint: str
        :param payload: request parameters; the auth token is added to this dict in place
        :type payload: dict
        :return: root element of the parsed XML response
        :raises LookupError: if the API answers with HTTP 404
        :raises IngestApiError: on timeout, redirect, request, unicode, parse or any
            other failure
        """

        if payload is None:
            payload = {}

        payload["token"] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        if not self.session:
            self.session = requests.Session()

        retries = 0
        while True:
            try:
                response = self.session.get(url, params=payload, timeout=(30, 15))
            except requests.exceptions.Timeout as ex:
                if retries < 3:
                    logger.warning("Reuters API timeout retrying, retries {}".format(retries))
                    retries += 1
                    continue
                raise IngestApiError.apiTimeoutError(ex, self.provider)
            except requests.exceptions.TooManyRedirects as ex:
                # Tell the user their URL was bad and try a different one
                raise IngestApiError.apiRedirectError(ex, self.provider)
            except requests.exceptions.RequestException as ex:
                # catastrophic error. bail.
                raise IngestApiError.apiRequestError(ex, self.provider)
            except Exception as error:
                traceback.print_exc()
                raise IngestApiError.apiGeneralError(error, self.provider)

            if response.status_code == 404:
                raise LookupError(_("Not found {payload}").format(payload=payload))

            break

        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """
        Get absolute URL for given endpoint.

        :param endpoint: endpoint name, joined to the feed URL with a slash
        :type endpoint: str
        """
        return "/".join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """
        Get article ids which should be upserted also save the poll token that is returned.

        :param channel: channel alias to request
        :param last_updated: start of the date range, used when no poll token is stored
        :param updated: end of the date range, used when no poll token is stored
        :return: set of article ids; empty on a failure status, when no poll token is
            returned or when the poll token did not change
        """
        ids = set()
        payload = {"channel": channel, "fieldsRef": "id"}

        # check if the channel has a pollToken if not fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info("Reuters requesting channel {} with poll token {}".format(channel, last_poll_token))
            payload["pollToken"] = last_poll_token
        else:
            payload["dateRange"] = "%s-%s" % (self._format_date(last_updated), self._format_date(updated))
            logger.info("Reuters requesting channel {} with dateRange {}".format(channel, payload["dateRange"]))

        tree = self._get_tree("items", payload)
        status_code = tree.find("status").get("code") if tree.tag == "results" else tree.get("code")
        # check the returned status
        if status_code != "10":
            logger.warning("Reuters channel request returned status code {}".format(status_code))
            # status code 30 indicates failure
            if status_code == "30":
                # the error element is looked up once and may be absent from the answer
                error = tree.find("error")
                error_code = error.get("code") if error is not None else None
                error_text = error.text if error is not None else None
                logger.warning("Reuters error on channel {} code {} {}".format(channel, error_code, error_text))
                if error_code == "2100":
                    # invalid token
                    self._save_poll_token(channel, None)
                    logger.warning("Reuters channel invalid token reseting {}".format(status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find("pollToken")
        if poll_token is not None:
            # a new token indicated new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info("Reuters channel {} retrieved no token".format(channel))
            return ids

        for result in tree.findall("result"):
            article_id = result.find("id").text
            ids.add(article_id)
            logger.info("Reuters id : {}".format(article_id))

        return ids

    def _save_poll_token(self, channel, poll_token):
        """Save the poll token of the passed channel under ``tokens.poll_tokens`` of the provider.

        :param channel: channel alias the token belongs to
        :param poll_token: token to store, ``None`` to reset it
        :return: None
        """
        provider_id = self.provider[superdesk.config.ID_FIELD]
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service("ingest_providers")
        provider = ingest_provider_service.find_one(req=None, _id=provider_id)
        # the provider may not hold any tokens yet
        provider_token = provider.get("tokens") or {}
        if not provider_token.get("poll_tokens"):
            provider_token["poll_tokens"] = {channel: poll_token}
        else:
            provider_token["poll_tokens"][channel] = poll_token
        upd_provider = {"tokens": provider_token}
        ingest_provider_service.system_update(provider_id, upd_provider, self.provider)

    def _get_poll_token(self, channel):
        """Get the poll token from provider config if it is available.

        :param channel: channel alias
        :return: token, or ``None`` when none is stored for the channel
        """
        tokens = self.provider.get("tokens") or {}
        poll_tokens = tokens.get("poll_tokens") or {}
        return poll_tokens.get(channel)

    def _format_date(self, date):
        """Format ``date`` with ``DATE_FORMAT``."""
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        """Fetch the article ``id`` together with every item its packages reference.

        :param id: article id
        :return: list of items, or an empty list as soon as one lookup fails
        """
        items = self._parse_items(id)
        result_items = []
        while items:
            item = items.pop()
            self.localize_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                self.log_item_error(err, item, self.provider)
                return []

        return result_items

    def _parse_items(self, id):
        """
        Parse item message and return given items.

        :param id: id sent to the ``item`` endpoint
        """

        payload = {"id": id}
        tree = self._get_tree("item", payload)

        parser = self.get_feed_parser(self.provider, tree)
        items = parser.parse(tree, self.provider)

        return items

    def _fetch_items_in_package(self, item):
        """
        Fetch remote assets for given item.

        :param item: item whose group refs carrying a ``residRef`` are fetched
        :return: list of the items parsed for those refs
        """
        items = []
        for group in item.get("groups", []):
            for ref in group.get("refs", []):
                if "residRef" in ref:
                    items.extend(self._parse_items(ref.get("residRef")))

        return items

    def prepare_href(self, href, mimetype=None):
        """Return ``href`` without params, query and fragment, with the auth token as its query.

        :param href: URL to prepare
        :param mimetype: not used here
        """
        scheme, netloc, path = urlparse(href)[:3]
        new_href = urlunparse((scheme, netloc, path, "", "", ""))
        return "%s?auth_token=%s" % (new_href, self._get_auth_token(self.provider, update=True))
Example #38
0
class APFeedingService(FeedingService):
    """
    Feeding Service class which can retrieve articles from Associated Press web service
    """

    NAME = 'ap'

    ERRORS = [
        IngestApiError.apiRequestError().get_error_description(),
        SuperdeskIngestError.notConfiguredError().get_error_description()
    ]

    label = 'AP feed API'

    fields = [{
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required': True
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required': True
    }, {
        'id': 'idList',
        'type': 'text',
        'label': 'Id List',
        'placeholder': 'use coma separated ids for multiple values',
        'required': False
    }]

    def config_test(self, provider=None):
        """Run the configuration test of the parent class."""
        super().config_test(provider)

    def _update(self, provider, update):
        """Request the AP feed for the configured product ids and parse the answer.

        The ``min_date_time`` and ``sequence_number`` stored under the provider's
        ``private`` data are sent along when both are present, and the values found in
        the answer are written to ``update['private']`` for the next call.

        :param provider: ingest provider document; ``username``, ``password`` and
            ``idList`` are read from its ``config``
        :param update: dict of provider updates; receives the new ``private`` values
        :return: a list containing the parsed items
        :raises SuperdeskIngestError: if username, password or idList is missing or blank
        :raises IngestApiError: if the request fails, the answer is not valid XML, or it
            lacks minDateTime or transmitId
        """
        try:
            config = provider['config']
            user = config['username']
            password = config['password']
            id_list = config['idList']
            if not user.strip() or not password.strip() or not id_list.strip():
                raise KeyError
        # AttributeError/TypeError: a value (or the whole config) is None instead of a string
        except (KeyError, AttributeError, TypeError):
            raise SuperdeskIngestError.notConfiguredError(
                Exception('username, password and idList are needed'))

        # we remove spaces and empty values from id_list to do a clean list
        id_list = ','.join(
            [id_.strip() for id_ in id_list.split(',') if id_.strip()])

        params = {
            'idList': id_list,
            'idListType': 'products',
            'format': '5',
            'maxItems': '25',
            'sortOrder': 'chronological'
        }
        try:
            min_date_time = provider['private']['min_date_time']
            sequence_number = provider['private']['sequence_number']
        except KeyError:
            pass
        else:
            params['minDateTime'] = min_date_time
            params['sequenceNumber'] = sequence_number

        try:
            # (connect, read) timeout so that an unresponsive server cannot block the
            # ingest forever
            r = requests.get(URL, auth=(user, password), params=params, timeout=(30, 15))
            # an HTTP error status is a failed request, not content to parse
            r.raise_for_status()
        except Exception as ex:
            raise IngestApiError.apiRequestError(
                Exception('error while doing the request')) from ex

        try:
            root_elt = etree.fromstring(r.content)
        except Exception as ex:
            raise IngestApiError.apiRequestError(
                Exception('error while parsing the response')) from ex

        parser = self.get_feed_parser(provider)
        items = parser.parse(root_elt, provider)

        try:
            min_date_time = root_elt.xpath(
                '//iptc:timestamp[@role="minDateTime"]/text()',
                namespaces=NS)[0].strip()
            sequence_number = root_elt.xpath('//iptc:transmitId/text()',
                                             namespaces=NS)[0].strip()
        except IndexError:
            raise IngestApiError.apiRequestError(
                Exception('missing minDateTime or transmitId'))
        else:
            update.setdefault('private', {})
            update['private']['min_date_time'] = min_date_time
            update['private']['sequence_number'] = sequence_number

        return [items]