Example #1
    def _test(self, provider):
        """Test connection."""
        config = provider.get('config', {})
        xml = self._fetch_data(config, provider)
        data = feedparser.parse(xml)
        if data.bozo:
            raise ParserError.parseMessageError(data.bozo_exception, provider)
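A note on the bozo flag used in these connection tests: feedparser does not raise on malformed feeds; it sets bozo and stores the underlying error in bozo_exception, which is why the code above has to convert it into a ParserError itself. A quick standalone illustration:

    import feedparser

    data = feedparser.parse("definitely not a feed")
    print(data.bozo)            # truthy: the input could not be parsed cleanly
    print(data.bozo_exception)  # the underlying parser error
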
Example #2
    def parse_email(self, content, content_type, provider):
        if content_type != 'text/calendar':
            raise ParserError.parseMessageError('Not supported content type.')

        content.seek(0)
        cal = Calendar.from_ical(content.read())
        return self.parse(cal, provider)
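Here content is a file-like attachment, hence the content.seek(0) before reading. A minimal sketch of the icalendar call the method relies on (the event data is made up):

    from io import BytesIO
    from icalendar import Calendar

    ICS = (b"BEGIN:VCALENDAR\r\n"
           b"VERSION:2.0\r\n"
           b"BEGIN:VEVENT\r\n"
           b"SUMMARY:Town hall meeting\r\n"
           b"DTSTART:20240101T100000Z\r\n"
           b"END:VEVENT\r\n"
           b"END:VCALENDAR\r\n")

    attachment = BytesIO(ICS)
    attachment.seek(0)                 # rewind, as parse_email() does
    cal = Calendar.from_ical(attachment.read())
    for event in cal.walk("VEVENT"):
        print(event.get("summary"))    # Town hall meeting
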
Example #3
    def parse(self, xml, provider=None):
        self.provider = provider
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,  # set the default type.
            'versioncreated': utcnow(),
            'anpa_category': [{
                "name": "Formidlingstjenester",
                "qcode": "r"
            }],
            'genre': [{
                "name": "Fulltekstmeldinger",
                "qcode": "Fulltekstmeldinger",
                "scheme": "genre_custom"
            }],
            'subject': [{
                'qcode': 'PRM-NTB',
                'name': 'PRM-NTB',
                'scheme': 'category'
            }],
            'urgency': 6,
            'ednote': '*** Dette er en pressemelding formidlet av NTB pva. andre ***'
        }

        try:
            self.do_mapping(item, xml)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
        return [item]
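do_mapping() is inherited from Superdesk's XML feed parser base class: it fills the item from the XML according to the parser's declared mapping, leaving the pre-seeded defaults above in place for anything the mapping does not set. A greatly simplified standalone equivalent, with a hypothetical mapping and field names:

    import xml.etree.ElementTree as ET

    MAPPING = {'headline': 'title', 'body_html': 'body'}  # hypothetical

    def do_mapping(item, xml):
        # copy each mapped XML child's text into the item; untouched
        # keys keep whatever defaults were seeded before the call
        for dest, src in MAPPING.items():
            node = xml.find(src)
            if node is not None and node.text:
                item[dest] = node.text

    item = {'urgency': 6}  # a default that survives the mapping
    do_mapping(item, ET.fromstring('<doc><title>Hi</title></doc>'))
    print(item)  # {'urgency': 6, 'headline': 'Hi'}
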
Example #4
    def _test(self, provider):
        """Test connection."""
        self.provider = provider
        xml = self._fetch_data()
        data = feedparser.parse(xml)
        if data.bozo:
            raise ParserError.parseMessageError(data.bozo_exception, provider)
Example #5
    def parse_email(self, content, content_type, provider):
        if content_type != 'text/xml':
            raise ParserError.parseMessageError('Not supported content type.')

        content.seek(0)
        xml = ET.parse(content)
        return self.parse(xml.getroot(), provider)
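The same rewind-then-parse shape as the iCalendar variant above, this time with ElementTree. A toy illustration of why the seek(0) matters when the stream may already have been read:

    import xml.etree.ElementTree as ET
    from io import BytesIO

    content = BytesIO(b"<doc><headline>Hello</headline></doc>")
    content.read()    # simulate an earlier consumer draining the stream
    content.seek(0)   # without this, ET.parse() would see an empty file
    root = ET.parse(content).getroot()
    print(root.find("headline").text)  # Hello
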
Example #6
    def _update(self, provider):
        """
        Check the data provider for updates and return new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list

        :raises IngestApiError: if data retrieval error occurs
        :raises ParserError: if retrieved RSS data cannot be parsed
        """
        config = provider.get('config', {})

        if config.get('auth_required'):
            self.auth_info = {
                'username': config.get('username', ''),
                'password': config.get('password', '')
            }

        try:
            xml_data = self._fetch_data(config, provider)
            data = feedparser.parse(xml_data)
        except IngestApiError:
            raise
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)

        # If provider last updated time is not available, set it to 1.1.1970
        # so that it will be recognized as "not up to date".
        # Also convert it to a naive datetime object (removing tzinfo is fine,
        # because it is in UTC anyway)
        t_provider_updated = provider.get('last_updated', utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = config.get('field_aliases')

        for entry in data.entries:
            t_entry_updated = utcfromtimestamp(timegm(entry.updated_parsed))

            if t_entry_updated <= t_provider_updated:
                continue

            item = self._create_item(entry, field_aliases, provider.get('source', None))
            self.add_timestamps(item)

            # If the RSS entry references any images, create picture items from
            # them and create a package referencing them and the entry itself.
            # If there are no image references, treat entry as a simple text
            # item, even if it might reference other media types, e.g. videos.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                new_items.extend(image_items)
                new_items.append(item)
                item = self._create_package(item, image_items)

            new_items.append(item)

        return [new_items]
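The utcfromtimestamp(timegm(entry.updated_parsed)) round-trip deserves a gloss: feedparser exposes parsed dates as UTC time.struct_time values, and calendar.timegm() converts such a value to a POSIX timestamp without applying the local timezone, so the comparison against the provider's naive-UTC last_updated stays consistent. A standalone sketch, using published_parsed (which feedparser fills from pubDate):

    from calendar import timegm
    from datetime import datetime, timezone
    import feedparser

    feed = feedparser.parse(
        "<rss><channel><item>"
        "<pubDate>Mon, 01 Jan 2024 10:00:00 GMT</pubDate>"
        "</item></channel></rss>")

    parsed = feed.entries[0].published_parsed  # time.struct_time, in UTC
    updated = datetime.fromtimestamp(timegm(parsed), tz=timezone.utc)
    print(updated.isoformat())  # 2024-01-01T10:00:00+00:00
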
Example #7
    def _test(self, provider):
        """Test connection."""
        config = provider.get('config', {})
        xml = self._fetch_data(config, provider)
        data = feedparser.parse(xml)
        if data.bozo:
            raise ParserError.parseMessageError(data.bozo_exception, provider)
Example #8
    def _update(self, provider):
        """
        Check the data provider for updates and return new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list

        :raises IngestApiError: if data retrieval error occurs
        :raises ParserError: if retrieved RSS data cannot be parsed
        """
        config = provider.get('config', {})

        if config.get('auth_required'):
            self.auth_info = {
                'username': config.get('username', ''),
                'password': config.get('password', '')
            }

        try:
            xml_data = self._fetch_data(config, provider)
            data = feedparser.parse(xml_data)
        except IngestApiError:
            raise
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)

        # If provider last updated time is not available, set it to 1.1.1970
        # so that it will be recognized as "not up to date".
        # Also convert it to a naive datetime object (removing tzinfo is fine,
        # because it is in UTC anyway)
        t_provider_updated = provider.get('last_updated', utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = config.get('field_aliases')

        for entry in data.entries:
            t_entry_updated = utcfromtimestamp(timegm(entry.updated_parsed))

            if t_entry_updated <= t_provider_updated:
                continue

            item = self._create_item(entry, field_aliases)
            self.add_timestamps(item)

            # If the RSS entry references any images, create picture items from
            # them and create a package referencing them and the entry itself.
            # If there are no image references, treat entry as a simple text
            # item, even if it might reference other media types, e.g. videos.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                new_items.extend(image_items)
                new_items.append(item)
                item = self._create_package(item, image_items)

            new_items.append(item)

        return [new_items]
Example #9
    def parse(self, xml, provider=None):
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT}  # set the default type.
        try:
            self.do_mapping(item, xml, namespaces=NS)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
        return item
Example #11
    def parse(self, xml, provider=None):
        item = super().parse(xml, provider)
        try:
            category = utils.ingest_category_from_subject(item.get('subject'))
            item.setdefault('subject', []).append(category)
            utils.set_default_service(item)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
        return item
Example #12
    def _update(self, provider, update):
        """
        Check the data provider for updates and return new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list

        :raises IngestApiError: if data retrieval error occurs
        :raises ParserError: if retrieved RSS data cannot be parsed
        """
        xml_data = self._fetch_data()

        try:
            data = feedparser.parse(xml_data)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider, data=xml_data)

        # If provider last updated time is not available, set it to 1.1.1970
        # so that it will be recognized as "not up to date".
        # Also convert it to a naive datetime object (removing tzinfo is fine,
        # because it is in UTC anyway)
        t_provider_updated = provider.get(LAST_ITEM_UPDATE,
                                          utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = self.config.get("field_aliases")

        for entry in data.entries:
            try:
                t_entry_updated = utcfromtimestamp(timegm(
                    entry.updated_parsed))
                if t_entry_updated <= t_provider_updated:
                    continue
            except (AttributeError, TypeError):
                # missing updated info, so better ingest it
                pass

            item = self._create_item(entry, field_aliases,
                                     provider.get("source", None))
            self.localize_timestamps(item)

            # If the RSS entry references any images, create picture items from
            # them and create a package referencing them and the entry itself.
            # If there are no image references, treat entry as a simple text
            # item, even if it might reference other media types, e.g. videos.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                new_items.extend(image_items)
                new_items.append(item)
                item = self._create_package(item, image_items)

            new_items.append(item)

        return [new_items]
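Unlike the earlier variants, this one tolerates entries without usable date metadata: a missing updated_parsed raises AttributeError, a malformed one makes timegm() raise TypeError, and in either case the entry is ingested rather than dropped. The guard, extracted as a small helper (a sketch, not part of the original code):

    from calendar import timegm
    from datetime import datetime, timezone

    def entry_is_new(entry, provider_updated):
        # mirror the try/except above: no (or broken) date info
        # means we cannot prove the entry is old, so ingest it
        try:
            ts = timegm(entry.updated_parsed)
        except (AttributeError, TypeError):
            return True
        entry_updated = datetime.fromtimestamp(
            ts, tz=timezone.utc).replace(tzinfo=None)
        return entry_updated > provider_updated
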
Example #13
    def _update(self, provider, update):
        json_items = self._fetch_data()
        parsed_items = []

        for item in json_items:
            try:
                parser = self.get_feed_parser(provider, item)
                parsed_items.append(parser.parse(item))
            except Exception as ex:
                raise ParserError.parseMessageError(ex, provider, data=item)

        return parsed_items
Example #14
    def test_parse_message_error_save_data(self):
        data = 'some data'
        with assert_raises(ParserError):
            try:
                raise Exception("Err message")
            except Exception as ex:
                raise ParserError.parseMessageError(ex, self.provider, data=data)
        self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
        message = self.mock_logger_handler.messages['error'][0]
        self.assertIn('file=', message)
        filename = message.split('file=')[1]
        with open(filename, 'r') as file:
            self.assertEqual(data, file.read())
Example #15
    def test_raise_parseMessageError(self):
        with assert_raises(ParserError) as error_context:
            ex = Exception("Testing parseMessageError")
            raise ParserError.parseMessageError(ex, self.provider)
        exception = error_context.exception
        self.assertTrue(exception.code == 1001)
        self.assertTrue(exception.message == "Message could not be parsed")
        self.assertIsNotNone(exception.system_exception)
        self.assertEqual(exception.system_exception.args[0], "Testing parseMessageError")
        self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
        self.assertEqual(self.mock_logger_handler.messages['error'][0],
                         "ParserError Error 1001 - Message could not be parsed: "
                         "Testing parseMessageError on channel TestProvider")
Example #16
    def _update(self, provider, update):
        config = provider.get('config', {})
        json_items = self._fetch_data(config, provider)
        parsed_items = []

        for item in json_items:
            try:
                parser = self.get_feed_parser(provider, item)
                parsed_items.append(parser.parse(item))
            except Exception as ex:
                raise ParserError.parseMessageError(ex, provider, data=item)

        return parsed_items
Example #17
    def test_raise_parseMessageError(self):
        with assert_raises(ParserError) as error_context:
            ex = Exception("Testing parseMessageError")
            raise ParserError.parseMessageError(ex, self.provider)
        exception = error_context.exception
        self.assertTrue(exception.code == 1001)
        self.assertTrue(exception.message == "Message could not be parsed")
        self.assertIsNotNone(exception.system_exception)
        self.assertEqual(exception.system_exception.args[0],
                         "Testing parseMessageError")
        self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
        self.assertEqual(
            self.mock_logger_handler.messages['error'][0],
            "ParserError Error 1001 - Message could not be parsed: "
            "Testing parseMessageError on channel TestProvider")
Example #18
    def _get_description(self, lines, provider):
        """Look up the BOM product to determine the descriptive string; not finding it is a fatal error.

        :param lines:
        :param provider:
        :return:
        """
        warning_str = 'Unknown'
        bom_products_map = get_resource_service('vocabularies').find_one(req=None, _id='bom_products')
        product = [x for x in bom_products_map.get('items', []) if x['qcode'] == lines[0].strip() and x['is_active']]
        if len(product) > 0:
            warning_str = product[0].get('name', '')
        else:
            logger.error('No BOM product mapping found for {}'.format(lines[0].strip()))
            raise ParserError.parseMessageError(Exception('No BOM product'), provider, data=lines[0])
        return warning_str
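get_resource_service('vocabularies') is Superdesk's registry lookup; the product match itself is plain list filtering and can be exercised standalone. The vocabulary shape below is inferred from the code above, and the qcodes are made up:

    bom_products_map = {'items': [
        {'qcode': 'IDW21000', 'name': 'Flood Warning', 'is_active': True},
        {'qcode': 'IDW21001', 'name': 'Retired Warning', 'is_active': False},
    ]}

    line = 'IDW21000\n'
    product = [x for x in bom_products_map['items']
               if x['qcode'] == line.strip() and x['is_active']]
    print(product[0]['name'] if product else 'Unknown')  # Flood Warning
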
Example #19
    def parse(self, xml, provider=None, content=None):
        items = []
        try:
            # parse xml file, only expecting one event per file
            if not ET.iselement(xml.find('guid')):
                guid = generate_guid(type=GUID_NEWSML)
            else:
                guid = xml.find('guid').text

            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: guid,
                FORMAT: FORMATS.PRESERVED
            }
            item['name'] = xml.find('title').text
            item['definition_short'] = xml.find('title').text
            item['definition_long'] = xml.find('content').text
            item['dates'] = {
                'start': xml.find('timeStart').text,
                'end': xml.find('timeEnd').text,
                'tz': '',
                'recurring_rule': {}
            }
            # add location
            item['location'] = [{
                'name': xml.find('location').text,
                'qcode': '',
                'geo': ''
            }]
            if ET.iselement(xml.find('geo')):
                geo = xml.find('geo')
                item['location'][0]['geo'] = '%s, %s' % (
                    geo.find('latitude').text, geo.find('longitude').text)
            # IMPORTANT: firstcreated must be less than 2 days past
            # we must preserve the original event created and updated in some other fields
            item['firstcreated'] = utcnow()
            item['versioncreated'] = utcnow()
            items.append(item)

            return items
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
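The guid guard works because ET.iselement() simply checks for an element and returns False for None, so it is safe to pass it the result of a failed find():

    import xml.etree.ElementTree as ET

    root = ET.fromstring("<event><title>Fair</title></event>")
    print(ET.iselement(root.find("guid")))   # False: find() returned None
    print(ET.iselement(root.find("title")))  # True
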
Example #20
    def _update(self, provider):
        """Check data provider for data updates and returns new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list

        :raises IngestApiError: if data retrieval error occurs
        :raises ParserError: if retrieved RSS data cannot be parsed
        """
        config = provider.get('config', {})

        try:
            xml_data = self._fetch_data(config, provider)
            data = feedparser.parse(xml_data)
        except IngestApiError:
            raise
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)

        # If provider last updated time is not available, set it to 1.1.1970
        # so that it will be recognized as "not up to date".
        # Also convert it to a naive datetime object (removing tzinfo is fine,
        # because it is in UTC anyway)
        t_provider_updated = provider.get('last_updated', utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = config.get('field_aliases')

        for entry in data.entries:
            t_entry_updated = utcfromtimestamp(timegm(entry.updated_parsed))

            if t_entry_updated > t_provider_updated:
                item = self._create_item(entry, field_aliases)
                self.add_timestamps(item)
                new_items.append(item)

        return [new_items]
Example #21
    def _get_description(self, lines, provider):
        """Look up the BOM product to determine the descriptive string; not finding it is a fatal error.

        :param lines:
        :param provider:
        :return:
        """
        warning_str = 'Unknown'
        bom_products_map = get_resource_service('vocabularies').find_one(
            req=None, _id='bom_products')
        product = [
            x for x in bom_products_map.get('items', [])
            if x['qcode'] == lines[0].strip() and x['is_active']
        ]
        if len(product) > 0:
            warning_str = product[0].get('name', '')
        else:
            logger.error('No BOM product mapping found for {}'.format(
                lines[0].strip()))
            raise ParserError.parseMessageError(Exception('No BOM product'),
                                                provider,
                                                data=lines[0])
        return warning_str
Example #22
    def parse(self, xml, provider=None):
        try:
            return self._parse(xml)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
Example #23
class RSSFeedingService(HTTPFeedingServiceBase):
    """
    Feeding service for providing feeds received in RSS 2.0 format.

    (NOTE: it should also work with other syndicated feeds formats, too, since
    the underlying parser supports them, but for our needs RSS 2.0 is assumed)
    """

    NAME = "rss"

    ERRORS = [
        IngestApiError.apiAuthError().get_error_description(),
        IngestApiError.apiNotFoundError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
        ParserError.parseMessageError().get_error_description(),
    ]

    label = "RSS/Atom"

    fields = ([{
        "id": "url",
        "type": "text",
        "label": "Host",
        "placeholder": "RSS Feed URL",
        "required": True,
        "errors": {
            4001: "Connection timed out.",
            4006: "URL not found.",
            4009: "Can't connect to host.",
            1001: "Can't parse the RSS.",
        },
    }] + HTTPFeedingServiceBase.AUTH_REQ_FIELDS + [{
        "id": "field_aliases",
        "type": "mapping",
        "label": "Content Field Aliases",
        "add_mapping_label": "Add alias",
        "remove_mapping_label": "Remove",
        "empty_label": "No field aliases defined.",
        "first_field_options": {
            "label":
            "Content Field Name",
            "values": [
                "body_text", "guid", "published_parsed", "summary", "title",
                "updated_parsed"
            ],
        },
        "second_field_options": {
            "label": "Field Alias",
            "placeholder": "Enter field alias"
        },
    }])

    HTTP_AUTH = None

    field_groups = {
        "auth_data": {
            "label": "Authentication Info",
            "fields": ["username", "password"]
        }
    }

    ItemField = namedtuple("ItemField", ["name", "name_in_data", "type"])

    item_fields = [
        ItemField("guid", "guid", str),
        ItemField("uri", "guid", str),
        ItemField("firstcreated", "published_parsed", datetime),
        ItemField("versioncreated", "updated_parsed", datetime),
        ItemField("headline", "title", str),
        ItemField("abstract", "summary", str),
        ItemField("body_html", "body_text", str),
        ItemField("byline", "author", str),
    ]
    """A list of fields that items created from the ingest data should contain.

    Each list item is a named tuple with the following three attributes:

    * name - the name of the field (attribute) in the resulting ingest item
    * name_in_data - the expected name of the data field in the retrieved
        ingest data (this can be overridden by providing a field name alias)
    * type - field's data type
    """

    IMG_MIME_TYPES = (
        "image/gif",
        "image/jpeg",
        "image/png",
        "image/tiff",
    )
    """
    Supported MIME types for ingesting external images referenced by the
    RSS entries.
    """

    IMG_FILE_SUFFIXES = (".gif", ".jpeg", ".jpg", ".png", ".tif", ".tiff")
    """
    Supported image filename extensions for ingesting (used for the
    <media:thumbnail> tags - they lack the "type" attribute).
    """
    def prepare_href(self, url, mimetype=None):
        """Prepare a link to an external resource (e.g. an image file).

        It can be directly used by the ingest machinery for fetching it.

        If provider requires authentication, basic HTTP authentication info is
        added to the given url, otherwise it is returned unmodified.

        :param str url: the original URL as extracted from an RSS entry

        :return: prepared URL
        :rtype: str
        """
        if self.auth_info:
            userinfo_part = "{}:{}@".format(
                urlquote(self.auth_info["username"]),
                urlquote(self.auth_info["password"]))
            scheme, netloc, path, query, fragment = urlsplit(url)
            netloc = userinfo_part + netloc
            url = urlunsplit((scheme, netloc, path, query, fragment))

        return url

    def _test(self, provider):
        """Test connection."""
        self.provider = provider
        xml = self._fetch_data()
        data = feedparser.parse(xml)
        if data.bozo:
            raise ParserError.parseMessageError(data.bozo_exception, provider)

    def _update(self, provider, update):
        """
        Check the data provider for updates and return new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list

        :raises IngestApiError: if data retrieval error occurs
        :raises ParserError: if retrieved RSS data cannot be parsed
        """
        xml_data = self._fetch_data()

        try:
            data = feedparser.parse(xml_data)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider, data=xml_data)

        # If provider last updated time is not available, set it to 1.1.1970
        # so that it will be recognized as "not up to date".
        # Also convert it to a naive datetime object (removing tzinfo is fine,
        # because it is in UTC anyway)
        t_provider_updated = provider.get(LAST_ITEM_UPDATE,
                                          utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = self.config.get("field_aliases")

        for entry in data.entries:
            try:
                t_entry_updated = utcfromtimestamp(timegm(
                    entry.updated_parsed))
                if t_entry_updated <= t_provider_updated:
                    continue
            except (AttributeError, TypeError):
                # missing updated info, so better ingest it
                pass

            item = self._create_item(entry, field_aliases,
                                     provider.get("source", None))
            self.localize_timestamps(item)

            # If the RSS entry references any images, create picture items from
            # them and create a package referencing them and the entry itself.
            # If there are no image references, treat entry as a simple text
            # item, even if it might reference other media types, e.g. videos.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                new_items.extend(image_items)
                new_items.append(item)
                item = self._create_package(item, image_items)

            new_items.append(item)

        return [new_items]

    def _fetch_data(self):
        """Fetch the latest feed data.

        :return: fetched RSS data
        :rtype: str

        :raises IngestApiError: if fetching data fails for any reason
            (e.g. authentication error, resource not found, etc.)
        """
        url = self.config["url"]

        response = self.get_url(url)

        return response.content

    def _extract_image_links(self, rss_entry):
        """Extract URLs of all images referenced by the given RSS entry.

        Images can be referenced via `<enclosure>`, `<media:thumbnail>` or
        `<media:content>` RSS tag and must be listed among the allowed image
        types. All other links to external media are ignored.

        Duplicate URLs are omitted from the result.

        :param rss_entry: parsed RSS item (entry)
        :type rss_entry: :py:class:`feedparser.FeedParserDict`

        :return: a list of all unique image URLs found (as strings)
        """
        img_links = set()

        for link in getattr(rss_entry, "links", []):
            if link.get("type") in self.IMG_MIME_TYPES:
                img_links.add(link["href"])

        for item in getattr(rss_entry, "media_thumbnail", []):
            url = item.get("url", "")
            if url.endswith(self.IMG_FILE_SUFFIXES):
                img_links.add(url)

        for item in getattr(rss_entry, "media_content", []):
            if item.get("type") in self.IMG_MIME_TYPES:
                img_links.add(item["url"])

        return list(img_links)

    def _create_item(self, data, field_aliases=None, source="source"):
        """Create a new content item from RSS feed data.

        :param dict data: parsed data of a single feed entry
        :param field_aliases: (optional) field name aliases. Used for content
             fields that are named differently in retrieved data.
        :type field_aliases: list of {field_name: alias} dictionaries or None
        :param str source: the source of provider

        :return: created content item
        :rtype: dict
        """
        if field_aliases is None:
            field_aliases = {}
        else:
            field_aliases = merge_dicts(field_aliases)
        aliased_fields = set(field_aliases.values())

        item = dict(type=CONTENT_TYPE.TEXT)

        # Only consider fields that are not used as an alias (i.e. used to
        # populate another field) - unless those fields have their own
        # aliases, too.
        # The idea is that if e.g. the main text field is aliased to use the
        # parsed data's summary field, that summary should not be used to
        # populate the field it was originally meant for.
        fields_to_consider = (f for f in self.item_fields
                              if (f.name_in_data not in aliased_fields) or (
                                  f.name_in_data in aliased_fields
                                  and f.name_in_data in field_aliases))

        utc_now = datetime.utcnow()
        for field in fields_to_consider:
            data_field_name = field_aliases.get(field.name_in_data,
                                                field.name_in_data)
            field_value = data.get(data_field_name)

            if (field.type is datetime) and field_value:
                field_value = utcfromtimestamp(timegm(field_value))
                field_value = utc_now if field_value > utc_now else field_value

            item[field.name] = field_value

            # Some feeds use <content:encoded> tag for storing the main content,
            # and that tag is parsed differently. If the body_html has not been
            # found in its default data field and is not aliased, try to
            # populate it using the aforementioned content field as a fallback.
            if field.name == "body_html" and not field_value and field.name_in_data not in field_aliases:
                try:
                    item["body_html"] = data.content[0].value
                except Exception:
                    pass  # content either non-existent or parsed differently

        if not data.get("guidislink") and data.get("link"):
            item["uri"] = data["link"]
            scheme, netloc, path, query, fragment = urlsplit(item["uri"])
            if data.get("guid"):
                item["guid"] = generate_tag(domain=netloc, id=data.get("guid"))
            else:
                item["guid"] = generate_tag_from_url(data["link"])

        if item.get("uri", None):
            if not item.get("body_html", None):
                item["body_html"] = ""
            item["body_html"] = '<p><a href="%s" target="_blank">%s</a></p>' % (
                item["uri"], source) + item["body_html"]

        item["dateline"] = {
            "source": source,
            "date": item.get("firstcreated", item.get("versioncreated"))
        }

        if not item.get("versioncreated") and item.get("firstcreated"):
            item["versioncreated"] = item["firstcreated"]

        return item

    def _create_image_items(self, image_links, text_item):
        """Create a list of picture items that represent the external images located on given URLs.

        Each created item's `firstcreated` and `versioncreated` fields are set
        to the same value as the values of these fields in `text_item`.

        :param iterable image_links: list of image URLs
        :param dict text_item: the "main" text item the images are related to

        :return: list of created image items (as dicts)
        """
        image_items = []

        for image_url in image_links:
            img_item = {
                "guid": generate_tag_from_url(image_url),
                ITEM_TYPE: CONTENT_TYPE.PICTURE,
                "firstcreated": text_item.get("firstcreated"),
                "versioncreated": text_item.get("versioncreated"),
                "renditions": {
                    "baseImage": {
                        "href": image_url
                    }
                },
            }
            image_items.append(img_item)

        return image_items

    def _create_package(self, text_item, image_items):
        """Create a new content package from given content items.

        The package's `main` group contains only the references to given items,
        not the items themselves. In the list of references, the reference to
        the text item precedes the references to image items.

        Package's `firstcreated` and `versioncreated` fields are set to values
        of these fields in `text_item`, and the `headline` is copied as well.

        :param dict text_item: item representing the text content
        :param list image_items: list of items (dicts) representing the images
            related to the text content
        :return: the created content package
        :rtype: dict
        """
        package = {
            ITEM_TYPE: CONTENT_TYPE.COMPOSITE,
            "guid": "{}:pkg".format(text_item["guid"]),
            "firstcreated": text_item["firstcreated"],
            "versioncreated": text_item["versioncreated"],
            "headline": text_item.get("headline", ""),
            "groups": [
                {
                    "id": "root",
                    "role": "grpRole:NEP",
                    "refs": [{
                        "idRef": "main"
                    }],
                },
                {
                    "id": "main",
                    "role": "main",
                    "refs": [],
                },
            ],
        }

        item_references = package["groups"][1]["refs"]
        item_references.append({"residRef": text_item["guid"]})

        for image in image_items:
            item_references.append({"residRef": image["guid"]})

        return package
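The credential handling in prepare_href() can be exercised on its own; it simply rewrites the URL's netloc with percent-encoded userinfo:

    from urllib.parse import quote, urlsplit, urlunsplit

    def with_basic_auth(url, username, password):
        # same steps as prepare_href(): inject user:password@ into the netloc
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = '%s:%s@%s' % (quote(username), quote(password), netloc)
        return urlunsplit((scheme, netloc, path, query, fragment))

    print(with_basic_auth('https://example.com/feed.rss', 'user', 'p@ss'))
    # https://user:p%40ss@example.com/feed.rss
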
Example #24
class RSSFeedingService(FeedingService):
    """
    Feeding service for providing feeds received in RSS 2.0 format.

    (NOTE: it should also work with other syndicated feeds formats, too, since
    the underlying parser supports them, but for our needs RSS 2.0 is assumed)
    """

    NAME = 'rss'
    ERRORS = [IngestApiError.apiAuthError().get_error_description(),
              IngestApiError.apiNotFoundError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description(),
              ParserError.parseMessageError().get_error_description()]

    ItemField = namedtuple('ItemField', ['name', 'name_in_data', 'type'])

    item_fields = [
        ItemField('guid', 'guid', str),
        ItemField('uri', 'guid', str),
        ItemField('firstcreated', 'published_parsed', datetime),
        ItemField('versioncreated', 'updated_parsed', datetime),
        ItemField('headline', 'title', str),
        ItemField('abstract', 'summary', str),
        ItemField('body_html', 'body_text', str),
        ItemField('timescalled', 'timescalled', int),
        ItemField('test', 'test', str),
        ItemField('testing', 'testing', str),
        ItemField('mobilecircle', 'mobilecircle', str),
        ItemField('audiofile', 'audiofile', str),
        ItemField('timesrecorded', 'timesrecorded', int),
        ItemField('timespublished', 'timespublished', int),
    ]
    """A list of fields that items created from the ingest data should contain.

    Each list item is a named tuple with the following three attributes:

    * name - the name of the field (attribute) in the resulting ingest item
    * name_in_data - the expected name of the data field in the retrieved
        ingest data (this can be overridden by providing a field name alias)
    * type - field's data type
    """

    IMG_MIME_TYPES = (
        'image/gif',
        'image/jpeg',
        'image/png',
        'image/tiff',
    )
    """
    Supported MIME types for ingesting external images referenced by the
    RSS entries.
    """

    IMG_FILE_SUFFIXES = ('.gif', '.jpeg', '.jpg', '.png', '.tif', '.tiff')
    """
    Supported image filename extensions for ingesting (used for the
    <media:thumbnail> tags - they lack the "type" attribute).
    """

    def __init__(self):
        super().__init__()
        self.auth_info = None

    def prepare_href(self, url, mimetype=None):
        """
        Prepare a link to an external resource (e.g. an image file) so
        that it can be directly used by the ingest machinery for fetching it.

        If provider requires authentication, basic HTTP authentication info is
        added to the given url, otherwise it is returned unmodified.

        :param str url: the original URL as extracted from an RSS entry

        :return: prepared URL
        :rtype: str
        """
        if self.auth_info:
            userinfo_part = '{}:{}@'.format(
                urlquote(self.auth_info['username']),
                urlquote(self.auth_info['password'])
            )
            scheme, netloc, path, query, fragment = urlsplit(url)
            netloc = userinfo_part + netloc
            url = urlunsplit((scheme, netloc, path, query, fragment))

        return url

    def _update(self, provider):
        """
        Check the data provider for updates and return new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list

        :raises IngestApiError: if data retrieval error occurs
        :raises ParserError: if retrieved RSS data cannot be parsed
        """
        config = provider.get('config', {})

        if config.get('auth_required'):
            self.auth_info = {
                'username': config.get('username', ''),
                'password': config.get('password', '')
            }

        try:
            xml_data = self._fetch_data(config, provider)
            data = feedparser.parse(xml_data)
        except IngestApiError:
            raise
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)

        # If provider last updated time is not available, set it to 1.1.1970
        # so that it will be recognized as "not up to date".
        # Also convert it to a naive datetime object (removing tzinfo is fine,
        # because it is in UTC anyway)
        t_provider_updated = provider.get('last_updated', utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = config.get('field_aliases')

        for entry in data.entries:
            t_entry_updated = utcfromtimestamp(timegm(entry.updated_parsed))

            if t_entry_updated <= t_provider_updated:
                continue

            item = self._create_item(entry, field_aliases, provider.get('source', None))
            self.add_timestamps(item)

            # If the RSS entry references any images, create picture items from
            # them and create a package referencing them and the entry itself.
            # If there are no image references, treat entry as a simple text
            # item, even if it might reference other media types, e.g. videos.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                new_items.extend(image_items)
                new_items.append(item)
                item = self._create_package(item, image_items)

            new_items.append(item)

        return [new_items]

    def _fetch_data(self, config, provider):
        """Fetch the latest feed data.

        :param dict config: RSS resource configuration
        :param provider: data provider instance, needed as an argument when
            raising ingest errors
        :return: fetched RSS data
        :rtype: str

        :raises IngestApiError: if fetching data fails for any reason
            (e.g. authentication error, resource not found, etc.)
        """
        url = config['url']

        if config.get('auth_required', False):
            auth = (config.get('username'), config.get('password'))
        else:
            auth = None

        response = requests.get(url, auth=auth)

        if response.ok:
            return response.content
        else:
            if response.status_code in (401, 403):
                raise IngestApiError.apiAuthError(
                    Exception(response.reason), provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(
                    Exception(response.reason), provider)
            else:
                raise IngestApiError.apiGeneralError(
                    Exception(response.reason), provider)

    def _extract_image_links(self, rss_entry):
        """Extract URLs of all images referenced by the given RSS entry.

        Images can be referenced via `<enclosure>`, `<media:thumbnail>` or
        `<media:content>` RSS tag and must be listed among the allowed image
        types. All other links to external media are ignored.

        Duplicate URLs are omitted from the result.

        :param rss_entry: parsed RSS item (entry)
        :type rss_entry: :py:class:`feedparser.FeedParserDict`

        :return: a list of all unique image URLs found (as strings)
        """
        img_links = set()

        for link in getattr(rss_entry, 'links', []):
            if link.get('type') in self.IMG_MIME_TYPES:
                img_links.add(link['href'])

        for item in getattr(rss_entry, 'media_thumbnail', []):
            url = item.get('url', '')
            if url.endswith(self.IMG_FILE_SUFFIXES):
                img_links.add(url)

        for item in getattr(rss_entry, 'media_content', []):
            if item.get('type') in self.IMG_MIME_TYPES:
                img_links.add(item['url'])

        return list(img_links)

    def _create_item(self, data, field_aliases=None, source=None):
        """Create a new content item from RSS feed data.

        :param dict data: parsed data of a single feed entry
        :param field_aliases: (optional) field name aliases. Used for content
             fields that are named differently in retrieved data.
        :type field_aliases: list of {field_name: alias} dictionaries or None
        :param str source: the source of provider

        :return: created content item
        :rtype: dict
        """
        if field_aliases is None:
            field_aliases = {}
        else:
            field_aliases = merge_dicts(field_aliases)
        aliased_fields = set(field_aliases.values())

        item = dict(type=CONTENT_TYPE.TEXT)

        # Only consider fields that are not used as an alias (i.e. used to
        # populate another field) - unless those fields have their own
        # aliases, too.
        # The idea is that if e.g. the main text field is aliased to use the
        # parsed data's summary field, that summary should not be used to
        # populate the field it was originally meant for.
        fields_to_consider = (
            f for f in self.item_fields
            if (f.name_in_data not in aliased_fields) or
               (f.name_in_data in aliased_fields and
                f.name_in_data in field_aliases)
        )

        for field in fields_to_consider:
            data_field_name = field_aliases.get(
                field.name_in_data, field.name_in_data
            )
            field_value = data.get(data_field_name)

            if (field.type is datetime) and field_value:
                field_value = utcfromtimestamp(timegm(field_value))

            item[field.name] = field_value

            # Some feeds use <content:encoded> tag for storing the main content,
            # and that tag is parsed differently. If the body_html has not been
            # found in its default data field and is not aliased, try to
            # populate it using the aforementioned content field as a fallback.
            if (
                field.name == 'body_html' and
                not field_value and
                field.name_in_data not in field_aliases
            ):
                try:
                    item['body_html'] = data.content[0].value
                except Exception:
                    pass  # content either non-existent or parsed differently

        if item.get('uri', None):
            if not item.get('body_html', None):
                item['body_html'] = ''
            source = source or 'source'
            item['body_html'] = '<p><a href="%s" target="_blank">%s</a></p>' % (item['uri'], source) + item['body_html']
        return item

    def _create_image_items(self, image_links, text_item):
        """Create a list of picture items that represent the external images
        located on given URLs.

        Each created item's `firstcreated` and `versioncreated` fields are set
        to the same value as the values of these fields in `text_item`.

        :param iterable image_links: list of image URLs
        :param dict text_item: the "main" text item the images are related to

        :return: list of created image items (as dicts)
        """
        image_items = []

        for image_url in image_links:
            img_item = {
                'guid': generate_guid(type=GUID_TAG),
                ITEM_TYPE: CONTENT_TYPE.PICTURE,
                'firstcreated': text_item.get('firstcreated'),
                'versioncreated': text_item.get('versioncreated'),
                'renditions': {
                    'baseImage': {
                        'href': image_url
                    }
                }
            }
            image_items.append(img_item)

        return image_items

    def _create_package(self, text_item, image_items):
        """Create a new content package from given content items.

        The package's `main` group contains only the references to given items,
        not the items themselves. In the list of references, the reference to
        the text item precedes the references to image items.

        Package's `firstcreated` and `versioncreated` fields are set to values
        of these fields in `text_item`, and the `headline` is copied as well.

        :param dict text_item: item representing the text content
        :param list image_items: list of items (dicts) representing the images
            related to the text content
        :return: the created content package
        :rtype: dict
        """
        package = {
            ITEM_TYPE: CONTENT_TYPE.COMPOSITE,
            'guid': generate_guid(type=GUID_TAG),
            'firstcreated': text_item['firstcreated'],
            'versioncreated': text_item['versioncreated'],
            'headline': text_item.get('headline', ''),
            'groups': [
                {
                    'id': 'root',
                    'role': 'grpRole:NEP',
                    'refs': [{'idRef': 'main'}],
                }, {
                    'id': 'main',
                    'role': 'main',
                    'refs': [],
                }
            ]
        }

        item_references = package['groups'][1]['refs']
        item_references.append({'residRef': text_item['guid']})

        for image in image_items:
            item_references.append({'residRef': image['guid']})

        return package
Example #25
    def parse(self, cal, provider=None):

        try:
            items = []

            for component in cal.walk():
                if component.name == "VEVENT":
                    item = {
                        ITEM_TYPE: CONTENT_TYPE.TEXT,
                        GUID_FIELD: generate_guid(type=GUID_NEWSML),
                        FORMAT: FORMATS.PRESERVED
                    }
                    item['name'] = component.get('summary')
                    item['definition_short'] = component.get('summary')
                    item['definition_long'] = component.get('description')
                    item['original_source'] = component.get('uid')

                    # add dates
                    # check if component.dt returns a date instead of a datetime; if so, convert it to datetime
                    dtstart = component.get('dtstart').dt
                    dates_start = dtstart if isinstance(dtstart, datetime.datetime) \
                        else datetime.datetime.combine(dtstart, datetime.datetime.min.time())
                    if not dates_start.tzinfo:
                        dates_start = utc.localize(dates_start)
                    try:
                        dtend = component.get('dtend').dt
                        dates_end = dtend if isinstance(dtend, datetime.datetime) \
                            else datetime.datetime.combine(dtend, datetime.datetime.min.time())
                        if not dates_end.tzinfo:
                            dates_end = utc.localize(dates_end)
                    except AttributeError as e:
                        dates_end = None
                    item['dates'] = {
                        'start': dates_start,
                        'end': dates_end,
                        'tz': '',
                        'recurring_rule': {}
                    }
                    # parse ics RRULE to fit eventsML recurring_rule
                    r_rule = component.get('rrule')
                    if isinstance(r_rule, vRecur):
                        r_rule_dict = vRecur.from_ical(r_rule)
                        if 'FREQ' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['frequency'] = ''.join(r_rule_dict.get('FREQ'))
                        if 'INTERVAL' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['interval'] = r_rule_dict.get('INTERVAL')[0]
                        if 'UNTIL' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['until'] = r_rule_dict.get('UNTIL')[0]
                        if 'COUNT' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['count'] = r_rule_dict.get('COUNT')
                        if 'BYMONTH' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['bymonth'] = ' '.join(r_rule_dict.get('BYMONTH'))
                        if 'BYDAY' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['byday'] = ' '.join(r_rule_dict.get('BYDAY'))
                        if 'BYHOUR' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['byhour'] = ' '.join(r_rule_dict.get('BYHOUR'))
                        if 'BYMIN' in r_rule_dict.keys():
                            item['dates']['recurring_rule']['bymin'] = ' '.join(r_rule_dict.get('BYMIN'))

                    # set timezone info if date is a datetime
                    if isinstance(component.get('dtstart').dt, datetime.datetime):
                        item['dates']['tz'] = tzid_from_dt(component.get('dtstart').dt)

                    # add participants
                    item['participant'] = []
                    if component.get('attendee'):
                        for attendee in component.get('attendee'):
                            if isinstance(attendee, vCalAddress):
                                item['participant'].append({
                                    'name': vCalAddress.from_ical(attendee),
                                    'qcode': ''
                                })

                    # add organizers
                    item['organizer'] = [{
                        'name': component.get('organizer', ''),
                        'qcode': ''
                    }]

                    # add location
                    item['location'] = [{
                        'name': component.get('location', ''),
                        'qcode': '',
                        'geo': ''
                    }]
                    if component.get('geo'):
                        item['location'][0]['geo'] = vGeo.from_ical(
                            component.get('geo').to_ical())

                    # IMPORTANT: firstcreated must be less than 2 days past
                    # we must preserve the original event created and updated in some other fields
                    if component.get('created'):
                        item['event_created'] = component.get('created').dt
                    if component.get('last-modified'):
                        item['event_lastmodified'] = component.get(
                            'last-modified').dt
                    item['firstcreated'] = utcnow()
                    item['versioncreated'] = utcnow()
                    items.append(item)
            original_source_ids = [
                _['original_source'] for _ in items
                if _.get('original_source', None)
            ]
            existing_items = list(
                get_resource_service('events').get_from_mongo(
                    req=None,
                    lookup={'original_source': {
                        '$in': original_source_ids
                    }}))

            def original_source_exists(item):
                """Return true if the item exists in `existing_items`"""
                for c in existing_items:
                    if c['original_source'] == item['original_source']:
                        if c['dates']['start'] == item['dates']['start']:
                            return True
                return False

            def is_future(item):
                """Return true if the item is reccuring or in the future"""
                if not item['dates'].get('recurring_rule'):
                    if item['dates']['start'] < utcnow() - datetime.timedelta(
                            days=1):
                        return False
                return True

            items = [_ for _ in items if is_future(_)]
            items = [_ for _ in items if not original_source_exists(_)]
            return items
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
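The RRULE handling above flattens the dict-of-lists that icalendar produces into the recurring_rule structure. A small standalone look at that shape (the event data is made up):

    from icalendar import Calendar

    cal = Calendar.from_ical(
        b"BEGIN:VCALENDAR\r\n"
        b"VERSION:2.0\r\n"
        b"BEGIN:VEVENT\r\n"
        b"DTSTART:20240101T100000Z\r\n"
        b"RRULE:FREQ=WEEKLY;INTERVAL=2;BYDAY=MO,WE\r\n"
        b"END:VEVENT\r\n"
        b"END:VCALENDAR\r\n")

    rrule = cal.walk("VEVENT")[0].get("rrule")  # vRecur: dict-like, list values
    print(rrule.get("FREQ"))                    # e.g. ['WEEKLY']
    print(rrule.get("INTERVAL"))                # e.g. [2]
    print(" ".join(rrule.get("BYDAY")))         # MO WE
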
Example #26
class BBCLDRSFeedingService(FeedingService):
    """
    Feeding Service class for reading BBC's Local Democracy Reporting Service
    """

    # Following the api spec at https://docs.ldrs.org.uk/

    NAME = 'bbc_ldrs'
    ERRORS = [
        IngestApiError.apiAuthError().get_error_description(),
        IngestApiError.apiNotFoundError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
        ParserError.parseMessageError().get_error_description()
    ]

    label = 'BBC Local Democracy Reporter Service'

    fields = [{
        'id': 'url',
        'type': 'text',
        'label': 'LDRS URL',
        'placeholder': 'LDRS URL',
        'required': True,
        'default': 'https://api.ldrs.org.uk/v1/item'
    }, {
        'id': 'api_key',
        'type': 'text',
        'label': 'API Key',
        'placeholder': 'API Key',
        'required': True,
        'default': ''
    }]

    def __init__(self):
        super().__init__()

    def _test(self, provider):
        config = provider.get('config', {})
        url = config['url']
        api_key = config['api_key']

        # limit the data to a single article and filter out all article fields
        # to save bandwidth
        params = {'limit': 1, 'fields': 'id'}
        headers = {'apikey': api_key}

        try:
            response = requests.get(url,
                                    params=params,
                                    headers=headers,
                                    timeout=30)
        except requests.exceptions.ConnectionError as err:
            raise IngestApiError.apiConnectionError(exception=err)

        if not response.ok:
            if response.status_code == 404:
                raise IngestApiError.apiNotFoundError(
                    Exception(response.reason), provider)
            else:
                raise IngestApiError.apiGeneralError(
                    Exception(response.reason), provider)

    def _update(self, provider, update):
        config = provider.get('config', {})
        json_items = self._fetch_data(config, provider)
        parsed_items = []

        for item in json_items:
            try:
                parser = self.get_feed_parser(provider, item)
                parsed_items.append(parser.parse(item))
            except Exception as ex:
                raise ParserError.parseMessageError(ex, provider, data=item)

        return parsed_items

    def _fetch_data(self, config, provider):
        url = config['url']
        api_key = config['api_key']

        last_update = provider.get(
            'last_updated', utcfromtimestamp(0)).strftime('%Y-%m-%dT%H:%M:%S')

        # Results are pagified so we'll read this many at a time
        offset_jump = 10

        params = {'start': last_update, 'limit': offset_jump}
        headers = {'apikey': api_key}

        items = []

        offset = 0
        while True:
            params['offset'] = offset

            try:
                response = requests.get(url,
                                        params=params,
                                        headers=headers,
                                        timeout=30)
            except requests.exceptions.ConnectionError as err:
                raise IngestApiError.apiConnectionError(exception=err)

            if response.ok:
                # The total number of results is given in the JSON body; read
                # it with a regex so we don't have to parse the whole payload
                # just for this one field
                total_match = re.search(r'"total": *([0-9]+)', response.text)

                if total_match is None:
                    raise IngestApiError.apiGeneralError(
                        Exception(response.text), provider)

                num_results = int(total_match.group(1))

                if num_results > 0:
                    items.append(response.text)

                if offset >= num_results:
                    return items

                offset += offset_jump
            else:
                if re.match('Error: No API Key provided', response.text):
                    raise IngestApiError.apiAuthError(Exception(response.text),
                                                      provider)
                elif response.status_code == 404:
                    raise IngestApiError.apiNotFoundError(
                        Exception(response.reason), provider)
                else:
                    raise IngestApiError.apiGeneralError(
                        Exception(response.reason), provider)

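
The loop above is a standard offset-pagination walk; below is a stripped-down, hypothetical sketch of the same pattern, parsing with json() instead of the regex and omitting the 'start' filter:

import requests

def fetch_all_pages(url, headers, page_size=10):
    """Walk an offset-paginated API that reports a 'total' count."""
    pages = []
    offset = 0
    while True:
        resp = requests.get(url,
                            params={'limit': page_size, 'offset': offset},
                            headers=headers,
                            timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        total = payload.get('total', 0)
        if total > 0:
            pages.append(payload)
        if offset >= total:   # walked past the last page (or no results)
            return pages
        offset += page_size
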
Exemple #27
0
    def parse(self, data, provider=None):
        if self.subjects_map is None:
            self._set_metadata()
        try:
            stage_map = config.NIFS_STAGE_MAP
            qcode_map = config.NIFS_QCODE_MAP
        except KeyError:
            raise SuperdeskIngestError.notConfiguredError(
                Exception('NIFS maps are not found in settings'))
        events = json.loads(data.decode('utf-8', 'ignore'))
        items = []
        try:
            for event in events:
                stage = stage_map.get(event['stageId'], '')

                # look up the qcode for this sportId in config, then derive
                # the sport name from the qcode
                try:
                    qcode = qcode_map[event['sportId']]
                except KeyError:
                    logger.warning(
                        'no qcode registered for sportId {sport_id}'.format(
                            sport_id=event['sportId']))
                    qcode = ''
                    sport = ''
                else:
                    sport = self.get_sport(qcode)

                # name as requested by NTB
                if stage or sport:
                    tpl_name = '{sport} {stage}, {rnd}. runde, {home} - {away}'
                else:
                    tpl_name = '{rnd}. runde, {home} - {away}'

                name = tpl_name.format(stage=stage,
                                       sport=sport,
                                       rnd=event['round'],
                                       home=event['homeTeam']['name'],
                                       away=event['awayTeam']['name']).strip()

                event_start = dateutil.parser.parse(event['timestamp'])
                # there is no end time specified in event
                event_end = event_start + timedelta(hours=2)

                # we have a common category and subject + a subject per sport
                # cf. SDNTB-496
                subject = [{'qcode': CAT, 'name': CAT, 'scheme': 'category'}]
                subject.append({
                    'qcode': MAIN_SUBJ_QCODE,
                    'name': self.subjects_map.get(MAIN_SUBJ_QCODE, ''),
                    'scheme': 'subject_custom'
                })
                subject.append({
                    'qcode': qcode,
                    'name': sport,
                    'scheme': 'subject_custom'
                })

                service = {'qcode': SERVICE_QCODE, 'name': self.service_name}

                item = {
                    'guid': event['uid'],
                    ITEM_TYPE: CONTENT_TYPE.EVENT,
                    'dates': {
                        'start': event_start,
                        'end': event_end,
                        'tz': ''
                    },
                    'name': name,
                    'slugline': sport,
                    'subject': subject,
                    'anpa_category': [service],
                    'calendars': [self.calendar_item],
                    'firstcreated': utcnow(),
                    'versioncreated': utcnow()
                }
                items.append(item)
            return items
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
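
The two name templates above render like this (team, stage and sport names are invented sample values):

tpl_name = '{sport} {stage}, {rnd}. runde, {home} - {away}'
name = tpl_name.format(sport='Fotball', stage='Eliteserien', rnd=3,
                       home='Brann', away='Molde').strip()
assert name == 'Fotball Eliteserien, 3. runde, Brann - Molde'

# with no stage and no sport, the shorter template avoids a dangling prefix
tpl_name = '{rnd}. runde, {home} - {away}'
assert tpl_name.format(rnd=3, home='Brann', away='Molde') == '3. runde, Brann - Molde'
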
Exemple #28
0
# import paths below assume superdesk-core and may vary between versions
import re
from datetime import datetime

from superdesk.errors import IngestApiError, ParserError
from superdesk.io.feeding_services.http_base_service import HTTPFeedingServiceBase

utcfromtimestamp = datetime.utcfromtimestamp


class BBCLDRSFeedingService(HTTPFeedingServiceBase):
    """
    Feeding Service class for reading BBC's Local Democracy Reporting Service
    """

    # Following the api spec at https://docs.ldrs.org.uk/

    NAME = "bbc_ldrs"
    ERRORS = [ParserError.parseMessageError().get_error_description()]

    label = "BBC Local Democracy Reporter Service"

    fields = [
        {
            "id": "url",
            "type": "text",
            "label": "LDRS URL",
            "placeholder": "LDRS URL",
            "required": True,
            "default": "https://api.ldrs.org.uk/v1/item",
        },
        {
            "id": "api_key",
            "type": "text",
            "label": "API Key",
            "placeholder": "API Key",
            "required": True,
            "default": "",
        },
    ]
    HTTP_AUTH = False

    def __init__(self):
        super().__init__()

    def _test(self, provider):
        config = self.config
        url = config["url"]
        api_key = config["api_key"]

        # limit the response to a single article, returning only its 'id'
        # field, to save bandwidth
        params = {"limit": 1, "fields": "id"}
        headers = {"apikey": api_key}

        self.get_url(url, params=params, headers=headers)

    def _update(self, provider, update):
        json_items = self._fetch_data()
        parsed_items = []

        for item in json_items:
            try:
                parser = self.get_feed_parser(provider, item)
                parsed_items.append(parser.parse(item))
            except Exception as ex:
                raise ParserError.parseMessageError(ex, provider, data=item)

        return parsed_items

    def _fetch_data(self):
        url = self.config["url"]
        api_key = self.config["api_key"]

        last_update = self.provider.get("last_updated", utcfromtimestamp(0)).strftime("%Y-%m-%dT%H:%M:%S")

        # Results are paginated, so we read this many at a time
        offset_jump = 10

        params = {"start": last_update, "limit": offset_jump}
        headers = {"apikey": api_key}

        items = []

        offset = 0
        while True:
            params["offset"] = offset

            response = self.get_url(url, params=params, headers=headers)
            # The total number of results is given in the JSON body; read it
            # with a regex so we don't have to parse the whole payload just
            # for this one field
            total_match = re.search(r'"total": *([0-9]+)', response.text)

            if total_match is None:
                raise IngestApiError.apiGeneralError(Exception(response.text), self.provider)

            num_results = int(total_match.group(1))

            if num_results > 0:
                items.append(response.text)

            if offset >= num_results:
                return items

            offset += offset_jump

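
The "total" extraction can be sanity-checked in isolation against a made-up response body:

import re

body = '{"total": 42, "items": [{"id": "abc"}]}'
match = re.search(r'"total": *([0-9]+)', body)
total = int(match.group(1)) if match else None
assert total == 42
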
Exemple #29
0
    def parse(self, cal, provider=None):
        try:
            items = []

            for component in cal.walk():
                if component.name == "VEVENT":
                    item = {
                        ITEM_TYPE: CONTENT_TYPE.EVENT,
                        GUID_FIELD: generate_guid(type=GUID_NEWSML),
                        FORMAT: FORMATS.PRESERVED
                    }
                    item['name'] = component.get('summary')
                    item['definition_short'] = component.get('summary')
                    item['definition_long'] = component.get('description')
                    item['original_source'] = component.get('uid')
                    item['state'] = CONTENT_STATE.INGESTED
                    item['pubstatus'] = None
                    eocstat_map = get_resource_service(
                        'vocabularies').find_one(req=None,
                                                 _id='eventoccurstatus')
                    if eocstat_map:
                        item['occur_status'] = [
                            x for x in eocstat_map.get('items', [])
                            if x['qcode'] == 'eocstat:eos5'
                            and x.get('is_active', True)
                        ][0]
                        item['occur_status'].pop('is_active', None)

                    self.parse_dates(item, component)
                    self.parse_recurring_rules(item, component)

                    # add participants
                    item['participant'] = []
                    if component.get('attendee'):
                        for attendee in component.get('attendee'):
                            if isinstance(attendee, vCalAddress):
                                item['participant'].append({
                                    'name': vCalAddress.from_ical(attendee),
                                    'qcode': ''
                                })

                    # add organizers
                    item['organizer'] = [{
                        'name': component.get('organizer', ''),
                        'qcode': ''
                    }]

                    # add location
                    item['location'] = [{
                        'name': component.get('location', ''),
                        'qcode': '',
                        'geo': ''
                    }]
                    if component.get('geo'):
                        item['location'][0]['geo'] = vGeo.from_ical(
                            component.get('geo').to_ical())

                    # IMPORTANT: firstcreated must be no more than 2 days in
                    # the past, so preserve the event's original created and
                    # modified times in separate fields
                    if component.get('created'):
                        item['event_created'] = component.get('created').dt
                    if component.get('last-modified'):
                        item['event_lastmodified'] = component.get(
                            'last-modified').dt
                    item['firstcreated'] = utcnow()
                    item['versioncreated'] = utcnow()
                    items.append(item)
            original_source_ids = [
                _['original_source'] for _ in items
                if _.get('original_source', None)
            ]
            existing_items = list(
                get_resource_service('events').get_from_mongo(
                    req=None,
                    lookup={'original_source': {
                        '$in': original_source_ids
                    }}))

            def original_source_exists(item):
                """Return true if the item exists in `existing_items`"""
                for c in existing_items:
                    if c['original_source'] == item['original_source']:
                        if c['dates']['start'] == item['dates']['start']:
                            return True
                return False

            def is_future(item):
                """Return true if the item is reccuring or in the future"""
                if not item['dates'].get('recurring_rule'):
                    if item['dates']['start'] < utcnow() - datetime.timedelta(
                            days=1):
                        return False
                return True

            items = [_ for _ in items if is_future(_)]
            items = [_ for _ in items if not original_source_exists(_)]
            return items
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
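
A self-contained check of the original_source_exists() dedup rule above; start dates are plain strings here only to keep the example short (the real code compares datetimes):

existing_items = [
    {'original_source': 'uid-1', 'dates': {'start': '2021-06-01T10:00:00'}},
]

def original_source_exists(item):
    """Return True when both original_source and start date match."""
    for c in existing_items:
        if c['original_source'] == item['original_source']:
            if c['dates']['start'] == item['dates']['start']:
                return True
    return False

duplicate = {'original_source': 'uid-1',
             'dates': {'start': '2021-06-01T10:00:00'}}
rescheduled = {'original_source': 'uid-1',
               'dates': {'start': '2021-06-02T10:00:00'}}

assert original_source_exists(duplicate)        # already ingested, dropped
assert not original_source_exists(rescheduled)  # new start date, kept
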
Exemple #30
0
from collections import namedtuple
from datetime import datetime

from superdesk.errors import IngestApiError, ParserError
from superdesk.io import register_provider
from superdesk.io.ingest_service import IngestService
from superdesk.utils import merge_dicts

from urllib.parse import quote as urlquote, urlsplit, urlunsplit


PROVIDER = "rss"

utcfromtimestamp = datetime.utcfromtimestamp

errors = [
    IngestApiError.apiAuthError().get_error_description(),
    IngestApiError.apiNotFoundError().get_error_description(),
    IngestApiError.apiGeneralError().get_error_description(),
    ParserError.parseMessageError().get_error_description(),
]


class RssIngestService(IngestService):
    """Ingest service for providing feeds received in RSS 2.0 format.

    (NOTE: it should also work with other syndicated feeds formats, too, since
    the underlying parser supports them, but for our needs RSS 2.0 is assumed)
    """

    ItemField = namedtuple("ItemField", ["name", "name_in_data", "type"])

    item_fields = [
        ItemField("guid", "guid", str),
        ItemField("uri", "guid", str),
Exemple #31
0
from collections import namedtuple
from datetime import datetime

from superdesk.errors import IngestApiError, ParserError
from superdesk.io import register_provider
from superdesk.io.ingest_service import IngestService
from superdesk.utils import merge_dicts


PROVIDER = 'rss'

utcfromtimestamp = datetime.utcfromtimestamp

errors = [IngestApiError.apiAuthError().get_error_description(),
          IngestApiError.apiNotFoundError().get_error_description(),
          IngestApiError.apiGeneralError().get_error_description(),
          ParserError.parseMessageError().get_error_description()]


class RssIngestService(IngestService):
    """Ingest service for providing feeds received in RSS 2.0 format.

    (NOTE: it should also work with other syndicated feeds formats, too, since
    the underlying parser supports them, but for our needs RSS 2.0 is assumed)
    """

    ItemField = namedtuple('ItemField', ['name', 'name_in_data', 'type'])

    item_fields = [
        ItemField('guid', 'guid', str),
        ItemField('uri', 'guid', str),
        ItemField('firstcreated', 'published_parsed', datetime),