Esempio n. 1
0
    def _update(self, provider):
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)
        if not self.path:
            return

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                if os.path.isfile(os.path.join(self.path, filename)):
                    filepath = os.path.join(self.path, filename)
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        with open(os.path.join(self.path, filename), 'r') as f:
                            item = self.parser.parse_message(etree.fromstring(f.read()), provider)

                            self.add_timestamps(item)
                            self.move_file(self.path, filename, provider=provider, success=True)
                            yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except etreeParserError as ex:
                logger.exception("Ingest Type: AFP - File: {0} could not be processed".format(filename), ex)
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.newsmlOneParserError(ex, provider)
            except ParserError as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ProviderError.ingestError(ex, provider)

        push_notification('ingest:update')
Esempio n. 2
0
    def _update(self, provider):
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)
        if not self.path:
            return

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                if os.path.isfile(os.path.join(self.path, filename)):
                    filepath = os.path.join(self.path, filename)
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        with open(os.path.join(self.path, filename), 'r') as f:
                            item = self.parser.parse_message(etree.fromstring(f.read()), provider)

                            self.add_timestamps(item)
                            self.move_file(self.path, filename, provider=provider, success=True)
                            yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except etreeParserError as ex:
                logger.exception("Ingest Type: AFP - File: {0} could not be processed".format(filename), ex)
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.newsmlOneParserError(ex, provider)
            except ParserError as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ProviderError.ingestError(ex, provider)

        push_notification('ingest:update')
    def parse(self, xml, provider=None):
        """Build an ingest item dict from a NewsML 1.x document.

        Reads the envelope (transmission id, priority), delegates identifier,
        newslines and management parsing to the corresponding ``parse_*`` helpers,
        then extracts language, keywords, subjects, body, word count, usage terms
        and genres from the first ``NewsItem/NewsComponent``.

        :param xml: root element of the NewsML document
        :param provider: ingest provider dict, only forwarded to the error factory
        :return: the result of ``self.populate_fields(item)``
        :raises ParserError: ``newsmlOneParserError`` wrapping any failure, including
            a document without a ``body.content`` element
        """
        item = {}
        try:
            self.root = xml

            # Query the Party element directly: a <Source> without a <Party> child is
            # then simply skipped instead of failing the whole document.
            party_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
            if party_el is not None:
                item['original_source'] = party_el.get('FormalName', '')

            sequence_el = xml.find('NewsEnvelope/TransmissionId')
            if sequence_el is not None:
                item['ingest_provider_sequence'] = sequence_el.text

            priority_el = xml.find('NewsEnvelope/Priority')
            item['priority'] = self.map_priority(priority_el.text if priority_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall() always returns a list (possibly empty), never None.
            languages = self.parse_attributes_as_dictionary(
                xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language'))
            item['language'] = languages[0]['FormalName'] if languages else ''

            keywords = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
            item['keywords'] = self.parse_attribute_values(keywords, 'Keyword')

            subjects = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
            subjects += xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter')
            subjects += xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject')
            item['subject'] = self.format_subjects(subjects)

            # Serialise the body and strip the wrapping <body.content> tags.
            body_el = xml.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content')
            body = etree.tostring(body_el, encoding='unicode')
            item['body_html'] = body.replace('<body.content>', '').replace('</body.content>', '')

            properties = xml.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
            characteristics = self.parse_attribute_values(properties, 'Words')
            item['word_count'] = characteristics[0] if characteristics else None

            usage_el = xml.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
            if usage_el is not None:
                # setdefault: keep a value that an earlier helper may already have set
                item.setdefault('usageterms', usage_el.text)

            item['genre'] = [
                {'name': genre_el.get('FormalName')}
                for genre_el in xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
            ]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Esempio n. 4
0
    def parse_message(self, tree, provider):
        """Parse a NewsML 1.x NewsMessage into an ingest item dict.

        Reads the envelope transmission id, delegates identifier, newslines and
        management parsing to the corresponding ``parse_*`` helpers, then extracts
        language, keywords, subjects, body, word count, usage terms and genres from
        the first ``NewsItem/NewsComponent``.

        :param tree: root element of the NewsML document
        :param provider: ingest provider dict, only forwarded to the error factory
        :return: the result of ``self.populate_fields(item)``
        :raises ParserError: ``newsmlOneParserError`` wrapping any failure, including
            a document without a ``body.content`` element
        """
        item = {}
        try:
            self.root = tree

            # Query the Party element directly: a <Source> without a <Party> child is
            # then simply skipped instead of failing the whole document.
            party_el = tree.find('NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
            if party_el is not None:
                item['original_source'] = party_el.get('FormalName', '')

            sequence_el = tree.find('NewsEnvelope/TransmissionId')
            if sequence_el is not None:
                item['ingest_provider_sequence'] = sequence_el.text

            self.parse_news_identifier(item, tree)
            self.parse_newslines(item, tree)
            self.parse_news_management(item, tree)

            # findall() always returns a list (possibly empty), never None.
            languages = self.parse_attributes_as_dictionary(
                tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language'))
            item['language'] = languages[0]['FormalName'] if languages else ''

            keywords = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
            item['keywords'] = self.parse_attribute_values(keywords, 'Keyword')

            subjects = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
            subjects += tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter')
            subjects += tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject')
            item['subject'] = self.format_subjects(subjects)

            # Serialise the body and strip the wrapping <body.content> tags.
            body_el = tree.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content')
            body = etree.tostring(body_el, encoding='unicode')
            item['body_html'] = body.replace('<body.content>', '').replace('</body.content>', '')

            properties = tree.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
            characteristics = self.parse_attribute_values(properties, 'Words')
            item['word_count'] = characteristics[0] if characteristics else None

            usage_el = tree.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
            if usage_el is not None:
                # setdefault: keep a value that an earlier helper may already have set
                item.setdefault('usageterms', usage_el.text)

            item['genre'] = [
                {'name': genre_el.get('FormalName')}
                for genre_el in tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
            ]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Esempio n. 5
0
    def parse(self, xml, provider=None):
        """Build an ingest item dict from a NewsML 1.x document.

        Reads the envelope (transmission id, priority), delegates identifier,
        newslines, management and content parsing to the corresponding ``parse_*``
        helpers, then extracts language, keywords, subjects, word count, usage
        terms and genres from the first ``NewsItem/NewsComponent``.

        :param xml: root element of the NewsML document
        :param provider: ingest provider dict, only forwarded to the error factory
        :return: the result of ``self.populate_fields(item)``
        :raises ParserError: ``newsmlOneParserError`` wrapping any failure
        """
        item = {}
        try:
            self.root = xml

            # Query the Party element directly: a <Source> without a <Party> child is
            # then simply skipped instead of failing the whole document.
            party_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
            if party_el is not None:
                item["original_source"] = party_el.get("FormalName", "")

            sequence_el = xml.find("NewsEnvelope/TransmissionId")
            if sequence_el is not None:
                item["ingest_provider_sequence"] = sequence_el.text

            priority_el = xml.find("NewsEnvelope/Priority")
            item["priority"] = self.map_priority(priority_el.text if priority_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall() always returns a list (possibly empty), never None.
            languages = self.parse_attributes_as_dictionary(
                xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language"))
            item["language"] = languages[0]["FormalName"] if languages else ""

            keywords = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
            item["keywords"] = self.parse_attribute_values(keywords, "Keyword")

            subjects = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
            subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
            subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")
            item["subject"] = self.format_subjects(subjects)

            self.parse_content(item, xml)

            properties = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
            characteristics = self.parse_attribute_values(properties, "Words")
            item["word_count"] = characteristics[0] if characteristics else None

            usage_el = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
            if usage_el is not None:
                # setdefault: keep a value that an earlier helper may already have set
                item.setdefault("usageterms", usage_el.text)

            item["genre"] = [
                {"name": genre_el.get("FormalName")}
                for genre_el in xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
            ]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Esempio n. 6
0
    def parse(self, xml, provider=None):
        """
        Convert the content of a NewsML XML document into a list of items.

        Shape of the expected NewsML document:

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <!--AFP NewsML text-photo profile evolution2-->
          <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification>
                .......
            </Identification>
            <NewsManagement>
                ......
            </NewsManagement>
            <NewsComponent>
                ......
            </NewsComponent>
          </NewsItem>
        </NewsML>

        :param xml: root element of the NewsML document
        :param provider: provider dict; an empty dict is stored when it is None
        :return: ``self._items`` once every NewsItem element has been visited
        """

        self._provider = {} if provider is None else provider

        try:
            self.root = xml
            self._items = []
            self._item_seed = {}

            # values read from the envelope seed the per-item state
            envelope_el = xml.find('NewsEnvelope')
            self._item_seed.update(self.parse_newsenvelop(envelope_el))

            # each NewsItem is handled on its own; a skipped one does not stop the loop
            for element in xml.findall('NewsItem'):
                try:
                    self.parse_newsitem(element)
                except SkipItemException:
                    pass

            return self._items
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, self._provider)
Esempio n. 7
0
    def parse(self, xml, provider=None):
        """
        Convert the content of a NewsML XML document into a list of item dicts.

        Shape of the expected NewsML document:

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <!--AFP NewsML text-photo profile evolution2-->
          <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification>
                .......
            </Identification>
            <NewsManagement>
                ......
            </NewsManagement>
            <NewsComponent>
                ......
            </NewsComponent>
          </NewsItem>
        </NewsML>

        :param xml: root element of the NewsML document
        :param provider: provider dict, only forwarded to the error factory
        :return: list with one populated item per NewsItem that was not skipped
        """
        try:
            parsed = []
            self.root = xml

            # values shared by every item come from the envelope
            envelope_fields = self.parser_newsenvelop(xml.find('NewsEnvelope'))

            for newsitem_el in xml.findall('NewsItem'):
                try:
                    # every item starts from its own copy of the envelope values
                    candidate = envelope_fields.copy()
                    self.parser_newsitem(candidate, newsitem_el)
                    candidate = self.populate_fields(candidate)
                except SkipItemException:
                    pass
                else:
                    parsed.append(candidate)
            return parsed

        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Esempio n. 8
0
 def test_raise_newsmlOneParserError(self):
     # Raise a plain exception and wrap it with the newsmlOneParserError factory,
     # then check the resulting error's fields and the single logged message.
     with assert_raises(ParserError) as error_context:
         try:
             raise Exception("Testing newsmlOneParserError")
         except Exception as ex:
             raise ParserError.newsmlOneParserError(ex, self.provider)
     exception = error_context.exception
     # assertEqual reports both values on failure, unlike assertTrue(a == b)
     self.assertEqual(exception.code, 1004)
     self.assertEqual(exception.message, "NewsML1 input could not be processed")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing newsmlOneParserError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "ParserError Error 1004 - NewsML1 input could not be processed: "
                      "Testing newsmlOneParserError on channel TestProvider")
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which reads event file(s) from the configured local folder.

    Each file is parsed with the parser returned by ``get_feed_parser`` and the
    result is yielded for the ``events`` collection service.
    """

    NAME = 'event_file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event file feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    # Configuration fields for this service; the 'errors' keys are error codes
    # mapped to the message shown for the 'path' field.
    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Event File Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _update(self, provider, update):
        """Parse the files of the configured folder and yield the extracted items.

        Generator yielding one list of items per processed file. Files that are not
        newer than the provider's last update (see ``is_latest_content``) are moved
        away without being parsed.

        :param provider: ingest provider dict (``config.path``, ``name``, ``last_updated``)
        :param update: not used by this implementation
        :raises ParserError: ``parseFileError`` when a file cannot be processed
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            # this function is a generator, so nothing is yielded in this case
            return []

        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # stays None until the file's mtime has been read; checked in the handler
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if not os.path.isfile(file_path):
                    continue

                stat = os.lstat(file_path)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                if not self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=True)
                    continue

                parser = self.get_feed_parser(provider, file_path)
                logger.info('Ingesting events with {} parser'.format(
                    parser.__class__.__name__))
                # A default is required here: without it getattr() raises
                # AttributeError for parsers lacking parse_file and the
                # path-based fallback below could never run.
                if getattr(parser, 'parse_file', None):
                    with open(file_path, 'rb') as f:
                        item = parser.parse_file(f, provider)
                else:
                    item = parser.parse(file_path, provider)

                self.after_extracting(item, provider)
                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=True)

                if isinstance(item, list):
                    yield item
                else:
                    yield [item]
            except Exception as ex:
                # only files that are already old are moved to the error folder;
                # a recent file is left in place
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    NAME = 'file'

    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File Feed'

    # Configuration fields for this service; the 'errors' keys are error codes
    # mapped to the message shown for the 'path' field.
    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _test(self, provider):
        """Check that the configured path exists and is a directory.

        :param provider: ingest provider dict holding ``config.path``
        :raises IngestFileError: ``notExistsError`` when the path is missing or does
            not exist, ``isNotDirError`` when it is not a directory
        """
        path = provider.get('config', {}).get('path', None)
        # os.path.exists(None) raises TypeError; report an unset path like a
        # non-existing one instead.
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()
        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Parse the files of the configured folder and yield the extracted items.

        Generator yielding one list of items per processed file. Files that are not
        newer than the provider's last update (see ``is_latest_content``) are moved
        away without being parsed.

        :param provider: ingest provider dict (``config.path``, ``name``, ``last_updated``)
        :param update: not used by this implementation
        :raises ParserError: ``parseFileError`` when a file cannot be processed
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            # this function is a generator, so nothing is yielded in this case
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # stays None until the file's mtime has been read; checked in the handler
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if not os.path.isfile(file_path):
                    continue

                stat = os.lstat(file_path)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                if not self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=True)
                    continue

                if isinstance(registered_parser, XMLFeedParser):
                    # XML feeds: the parser is looked up again with the parsed
                    # document root as the article argument
                    with open(file_path, 'rb') as f:
                        xml = etree.parse(f)
                        parser = self.get_feed_parser(provider, xml.getroot())
                        item = parser.parse(xml.getroot(), provider)
                else:
                    parser = self.get_feed_parser(provider, file_path)
                    item = parser.parse(file_path, provider)

                self.after_extracting(item, provider)
                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=True)

                if isinstance(item, list):
                    yield item
                else:
                    yield [item]
            except Exception as ex:
                # only files that are already old are moved to the error folder;
                # a recent file is left in place
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the file from the current directory to _PROCESSED if successful, else to _ERROR.

        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.
        The file is copied to the target directory and then removed from the current directory.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """

        processed_dir = os.path.join(file_path, "_PROCESSED/")
        error_dir = os.path.join(file_path, "_ERROR/")

        try:
            # exist_ok avoids the check-then-create race of a separate exists() test
            os.makedirs(processed_dir, exist_ok=True)
            os.makedirs(error_dir, exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        source = os.path.join(file_path, filename)
        try:
            shutil.copy2(source, processed_dir if success else error_dir)
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source file is removed even when the copy above
            # failed, and an error raised here replaces the fileMoveError -
            # confirm that dropping the file in that case is intended.
            os.remove(source)

    def is_latest_content(self, last_updated, provider_last_updated=None):
        """
        Parse file only if it's not older than provider last update -10m

        :param last_updated: file last updated datetime
        :param provider_last_updated: provider last update datetime; when not set,
            seven days before the current time is used
        :return: True when the file is newer than the provider last update minus 10 minutes
        """

        if not provider_last_updated:
            provider_last_updated = utcnow() - timedelta(days=7)

        return provider_last_updated - timedelta(minutes=10) < last_updated

    def is_old_content(self, last_updated):
        """Test if file is old so it wouldn't probably work in is_latest_content next time.

        Such files can be moved to `_ERROR` folder, it wouldn't be ingested anymore.

        :param last_updated: file last updated datetime
        :return: True when the file was last updated more than 10 minutes ago
        """
        return last_updated < utcnow() - timedelta(minutes=10)
Esempio n. 11
0
    def parse(self, xml, provider=None):
        """Build an ingest item dict from a NewsML 1.x document.

        Extracts the original source, priority, language, IPTC subjects, body and
        word count, and fills the dateline when the document's City/Country
        properties match exactly one city returned by ``app.locators.find_cities()``.

        :param xml: root element of the NewsML document
        :param provider: ingest provider dict; its ``source`` is used for the dateline
        :return: the result of ``self.populate_fields(item)``
        :raises ParserError: ``newsmlOneParserError`` wrapping any failure, including
            a document without a ``DataContent`` element
        """
        item = {}
        try:
            self.root = xml
            parsed_el = xml.find(
                "NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
            if parsed_el is not None:
                item["original_source"] = parsed_el.attrib.get(
                    "FormalName", "ANA")

            parsed_el = xml.find("NewsEnvelope/Priority")
            item["priority"] = self.map_priority(
                parsed_el.text if parsed_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall() always returns a list (possibly empty), never None.
            language = self.parse_attributes_as_dictionary(xml.findall(
                "NewsItem/NewsComponent/DescriptiveMetadata/Language"))
            item["language"] = language[0]["FormalName"] if language else ""

            # only subjects using the IptcSubjectCodes scheme are kept
            subjects = xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]'
            )

            item["subject"] = self.format_subjects(subjects)

            # Serialise the content, decode HTML entities, drop the wrapping
            # <DataContent> tags and lower-case the paragraph tags.
            body = html.unescape(etree.tostring(
                xml.find("NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent"),
                encoding="unicode"))
            body = body.replace("<DataContent>", "").replace("</DataContent>", "")
            body = body.replace("<P>", "<p>").replace("</P>", "</p>")

            # Remove the agency's copyright paragraph from the body.
            item["body_html"] = body.replace(
                "<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο "
                "ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον "
                "για συγκεκριμένη χρήση.</p>",
                "",
            ).strip()

            parsed_el = xml.findall(
                "NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property"
            )
            characteristics = self.parse_attribute_values(
                parsed_el, "WordCount")
            item["word_count"] = characteristics[0] if characteristics else None

            # Extract the city and country for setting into the dateline. Either
            # property may be absent; the dateline is then left untouched instead
            # of rejecting the whole document.
            city_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
            )
            country_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
            )
            city = city_el.attrib.get("Value") if city_el is not None else None
            country = country_el.attrib.get("Value") if country_el is not None else None

            if city is not None and country is not None:
                # Anglicise the greek for Athens if required
                city = "Athens" if city == "Αθήνα" else city
                # Normalise the country code
                country = "GR" if country == "GRC" else country

                located = [
                    c for c in app.locators.find_cities()
                    if c["city"] == city and c["country_code"] == country
                ]
                if len(located) == 1:
                    # provider defaults to None in the signature
                    source = provider.get("source") if provider else None
                    # the dateline dict may not have been created by the helpers above
                    dateline = item.setdefault("dateline", {})
                    dateline["located"] = located[0]
                    dateline["source"] = source
                    dateline["text"] = format_dateline_to_locmmmddsrc(
                        located[0], dateline.get("date"), source)
            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Esempio n. 12
0
    def parse(self, xml, provider=None):
        """Build a single-item list from a NewsML document holding one or more NewsItem.

        The item starts from fixed category/genre/subject/ednote values, then
        ``self.do_mapping`` fills it from the selected NewsItem element.

        :param xml: root element of the NewsML document (left unmodified, a deep
            copy is used internally)
        :param provider: ingest provider dict, only forwarded to the error factory
        :return: list containing the single parsed item
        :raises ParserError: ``newsmlOneParserError`` wrapping any failure, including
            the case where no NewsItem with a language is found
        """
        item = {
            'versioncreated':
            utcnow(),
            'anpa_category': [{
                "name": "Formidlingstjenester",
                "qcode": "r"
            }],
            'genre': [{
                "name": "Fulltekstmeldinger",
                "qcode": "Fulltekstmeldinger",
                "scheme": "genre_custom"
            }],
            'subject': [{
                'qcode': 'Børsmelding',
                'name': 'Børsmelding',
                'scheme': 'category'
            }],
            'ednote':
            '*** Dette er en børsmelding formidlet av NTB pva. andre ***'
        }
        self.populate_fields(item)

        try:
            # we remove newsml namespace for convenience (to avoid to write prefix each time)
            # we deepcopy first to avoid modifying original item
            xml = deepcopy(xml)
            for elt in xml.iter():
                elt.tag = elt.tag.replace('{' + NEWSML_NS + '}', '')
            news_items = xml.findall('NewsItem')

            # there may be several items (for different languages), we keep in order of
            # preference: Norwegian, English, first item (cf. SDNTB-573)
            selected = None
            for news_item in news_items:
                try:
                    lang = news_item.xpath(
                        'NewsComponent/DescriptiveMetadata/Language/@FormalName',
                    )[0]
                except IndexError:
                    logger.warning(
                        "missing language in item, ignoring it.\nxml: {xml}".
                        format(
                            xml=etree.tostring(news_item, encoding="unicode")))
                    continue

                if selected is None or lang in ('no', 'en'):
                    selected = news_item

                if lang == 'no':
                    break

            if selected is None:
                # Log the whole document: the loop variable is unbound when the
                # document has no NewsItem at all, and would otherwise only show
                # the last element visited.
                document = etree.tostring(xml, encoding="unicode")
                logger.warning("can't find any valid item\nxml={xml}".format(
                    xml=document))
                raise ParserError.parseFileError(source=document)

            self.do_mapping(item, selected)
            return [item]
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
    def parse(self, xml, provider=None):
        """
        Parse the content of a NewsML 1.2 XML document into a list of item dicts.

        Example content of the NewsML XML file:

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <!--AFP NewsML text-photo profile evolution2-->
          <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification>
                .......
            </Identification>
            <NewsManagement>
                ......
            </NewsManagement>
            <NewsComponent>
                ......
            </NewsComponent>
          </NewsItem>
        </NewsML>

        Every ``NewsItem`` element yields one item that starts from a copy of
        the envelope data. Items whose parsing raises ``SkipItemException``
        are left out of the result.

        :param xml: root element of the NewsML document
        :param provider: ingest provider, forwarded to the raised error
        :return: list of parsed items, one per accepted ``NewsItem``
        :raises ParserError.newsmlOneParserError: on any other failure
        """
        # Local import: only the standard library is needed and the module's
        # import block is not part of this method.
        import copy

        try:
            items = []
            self.root = xml

            # Parse the NewsEnvelope element once; it is shared by all items.
            item_envelop = self.parse_newsenvelop(xml.find('NewsEnvelope'))

            # Parse each NewsItem element.
            for newsitem_el in xml.findall('NewsItem'):
                try:
                    # Deep copy: a shallow copy would share nested lists (such
                    # as 'subject') with the envelope, so the appends below
                    # would leak from one item into the next.
                    item = copy.deepcopy(item_envelop)
                    self.parse_newsitem(item, newsitem_el)

                    subjects = item.setdefault('subject', [])
                    # Product defaults to NEWS/GENERAL when none was parsed.
                    if not any(
                            subj.get('scheme') == 'services-products'
                            for subj in subjects):
                        subjects.append({
                            'name': 'NEWS/GENERAL',
                            'qcode': 'NEWS/GENERAL',
                            'parent': 'NEWS',
                            'scheme': 'services-products'
                        })
                    # Distribution is always 'default'.
                    subjects.append({
                        "name": 'default',
                        "qcode": 'default',
                        "scheme": "distribution"
                    })

                    # Slugline and keywords are empty.
                    item['slugline'] = None
                    item['keywords'] = []

                    # Remove duplicated subjects, keeping the qcode ordering.
                    # Compare against everything already kept: equal entries
                    # are not guaranteed to be adjacent after sorting by qcode
                    # alone.
                    unique_subjects = []
                    for subj in sorted(subjects, key=lambda k: k['qcode']):
                        if subj not in unique_subjects:
                            unique_subjects.append(dict(subj))
                    item['subject'] = unique_subjects

                    item = self.populate_fields(item)
                except SkipItemException:
                    continue
                items.append(item)
            return items

        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Esempio n. 14
0
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which reads event files from the configured local folder.

    Files are parsed as XML or iCalendar depending on the parser registered for
    the provider, and any other registered parser is handed the file path.
    """

    NAME = 'event_file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event File Feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    def _update(self, provider, update):
        """Parse every file in the provider's folder and yield the parsed items.

        This is a generator: each parsed file yields one list of items. Files
        that are parsed, or that are not the latest content, are moved with
        ``success=True``. On failure the file is moved with ``success=False``
        only once ``is_old_content`` reports it as old; otherwise it is left
        in place.

        :param provider: ingest provider dict; ``config.path`` is the folder to read
        :param update: not used by this method
        :raises ParserError.parseFileError: when handling a file fails
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # ``Logger.warn`` is a deprecated alias; use ``warning``.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # Reset per file so the error handler below never sees the
                # timestamp of a previous file.
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if not os.path.isfile(file_path):
                    continue

                stat = os.lstat(file_path)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                if not self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=True)
                    continue

                if isinstance(registered_parser, NTBEventXMLFeedParser):
                    logger.info('Ingesting xml events')
                    with open(file_path, 'rb') as f:
                        xml = ElementTree.parse(f)
                        parser = self.get_feed_parser(provider, xml.getroot())
                        item = parser.parse(xml.getroot(), provider)
                elif isinstance(registered_parser, IcsTwoFeedParser):
                    logger.info('Ingesting ics events')
                    with open(file_path, 'rb') as f:
                        cal = Calendar.from_ical(f.read())
                        parser = self.get_feed_parser(provider, cal)
                        item = parser.parse(cal, provider)
                else:
                    logger.info('Ingesting events with unknown parser')
                    parser = self.get_feed_parser(provider, file_path)
                    item = parser.parse(file_path, provider)

                self.after_extracting(item, provider)
                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=True)

                # Always hand a list to the consumer.
                if isinstance(item, list):
                    yield item
                else:
                    yield [item]
            except Exception as ex:
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')
Esempio n. 15
0
import logging

from datetime import datetime
from .newsml_1_2 import NewsMLOneParser
from superdesk.io.file_ingest_service import FileIngestService
from superdesk.utils import get_sorted_files, FileSortAttributes
from ..utc import utc
from ..etree import etree, ParseError as etreeParserError
from superdesk.notification import push_notification
from superdesk.io import register_provider
from superdesk.errors import ParserError, ProviderError


# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Provider name string; presumably the key this service is registered under
# (``register_provider`` is imported above) -- confirm against the rest of the file.
PROVIDER = 'afp'
# Error descriptions for the NewsML 1 parser error and the provider ingest error.
errors = [ParserError.newsmlOneParserError().get_error_description(),
          ProviderError.ingestError().get_error_description()]


class AFPIngestService(FileIngestService):
    """Ingest service for AFP content, parsed with ``NewsMLOneParser``."""

    def __init__(self):
        """Create the parser instance used by this service."""
        self.parser = NewsMLOneParser()

    def _update(self, provider):
        """Remember the provider and the folder path from its configuration.

        Only this part of the method is visible in the excerpt: it stores the
        provider and ``config.path`` on the instance and returns early when no
        path is configured.

        :param provider: ingest provider dict; ``config.path`` is the folder to read
        """
        self.provider = provider
        config = provider.get('config', {})
        self.path = config.get('path')
        if not self.path:
            return
Esempio n. 16
0
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).

    Files in the provider's configured folder are parsed one by one, yielded to
    the consumer, and then moved into a ``_PROCESSED`` or ``_ERROR`` sub-folder.
    """

    NAME = 'file'

    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File feed'

    fields = [
        {
            'id': 'path', 'type': 'text', 'label': 'Server Folder',
            'placeholder': 'path to folder', 'required': True,
            'errors': {3003: 'Path not found on server.', 3004: 'Path should be directory.'}
        }
    ]

    def _test(self, provider):
        """Check that the provider's configured path exists and is a directory.

        :param provider: ingest provider dict; ``config.path`` is the folder to check
        :raises IngestFileError.notExistsError: if no path is configured or it does not exist
        :raises IngestFileError.isNotDirError: if the path is not a directory
        """
        path = provider.get('config', {}).get('path', None)
        # A missing path would make os.path.exists() raise TypeError; report
        # it as "not found" instead.
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()
        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Parse every file in the provider's folder and yield the parsed items.

        This is a generator: each parsed file yields one list of items. The
        value sent back into the generator is read as a "failed" flag and
        decides whether the file is moved to ``_PROCESSED`` or ``_ERROR``.
        Files that are not the latest content are moved with ``success=False``.
        On an exception the file is moved to ``_ERROR`` only once
        ``is_old_content`` reports it as old; otherwise it is left in place.

        :param provider: ingest provider dict; ``config.path`` is the folder to read
        :param update: not used by this method
        :raises ParserError.parseFileError: when handling a file fails
        """
        # check if deprecated FILE_INGEST_OLD_CONTENT_MINUTES setting is still used
        if "FILE_INGEST_OLD_CONTENT_MINUTES" in app.config:
            deprecated_cont_min = app.config["FILE_INGEST_OLD_CONTENT_MINUTES"]
            cont_min = app.config[OLD_CONTENT_MINUTES]
            if deprecated_cont_min != cont_min:
                logger.warning(
                    "'FILE_INGEST_OLD_CONTENT_MINUTES' is deprecated, please update settings.py to use {new_name!r}"
                    .format(new_name=OLD_CONTENT_MINUTES))
                # The deprecated value wins until settings are migrated.
                app.config[OLD_CONTENT_MINUTES] = deprecated_cont_min

        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # ``Logger.warn`` is a deprecated alias; use ``warning`` as above.
            logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                           .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                # Reset per file so the error handler below never sees the
                # timestamp of a previous file.
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    last_updated = self.get_last_updated(file_path)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)

                        # The consumer may send a truthy value back to signal
                        # that ingesting the yielded items failed.
                        if isinstance(item, list):
                            failed = yield item
                        else:
                            failed = yield [item]

                        self.move_file(self.path, filename, provider=provider, success=not failed)
                    else:
                        self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the file from the current directory to _PROCESSED if successful, else to _ERROR.

        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """
        source = os.path.join(file_path, filename)
        processed_dir = os.path.join(file_path, "_PROCESSED/")
        error_dir = os.path.join(file_path, "_ERROR/")

        try:
            # exist_ok avoids the check-then-create race between workers.
            os.makedirs(processed_dir, exist_ok=True)
            os.makedirs(error_dir, exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        try:
            shutil.copy2(source, processed_dir if success else error_dir)
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source file is removed even when the copy
            # failed, so a failed copy loses the file. Kept as-is because
            # leaving it would change which files get picked up again;
            # confirm this is intended.
            os.remove(source)

    def get_last_updated(self, file_path):
        """Get last updated time for file.

        Using both mtime and ctime timestamps not to miss
        old files being copied around and recent files after
        changes done in place.

        :param file_path: path of the file to inspect
        :return: timezone-aware datetime built from the newer of the two timestamps
        """
        stat = os.lstat(file_path)
        timestamp = max(stat.st_mtime, stat.st_ctime)
        return datetime.fromtimestamp(timestamp, tz=utc)
Esempio n. 17
0
    def parse(self, xml, provider=None):
        """Parse a NewsML 1.x document into a single item dict.

        Fills source, priority, identifier, newslines, management data,
        language, IPTC subjects, body HTML and word count. When the document
        names a City and Country that match exactly one known city, the
        item's dateline is located there and its text is regenerated.

        :param xml: root element of the NewsML document
        :param provider: ingest provider dict; its ``source`` is used for the dateline
        :return: the item as returned by ``populate_fields``
        :raises ParserError.newsmlOneParserError: on any failure
        """
        item = {}
        try:
            self.root = xml
            parsed_el = xml.find(
                'NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
            if parsed_el is not None:
                item['original_source'] = parsed_el.attrib.get(
                    'FormalName', 'ANA')

            parsed_el = xml.find('NewsEnvelope/Priority')
            item['priority'] = self.map_priority(
                parsed_el.text if parsed_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall() always returns a list, so no None check is needed.
            language = self.parse_attributes_as_dictionary(xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/Language'))
            item['language'] = language[0]['FormalName'] if len(
                language) else ''

            subjects = xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]'
            )

            item['subject'] = self.format_subjects(subjects)

            # Serialise the content, drop the wrapper element and lower-case
            # the paragraph tags.
            body_html = html.unescape(
                etree.tostring(xml.find(
                    'NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent'
                ), encoding='unicode'))
            for old, new in (('<DataContent>', ''), ('</DataContent>', ''),
                             ('<P>', '<p>'), ('</P>', '</p>')):
                body_html = body_html.replace(old, new)

            # Remove the agency's boilerplate copyright paragraph.
            item['body_html'] = body_html.replace(
                '<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο '
                'ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον '
                'για συγκεκριμένη χρήση.</p>', '').strip()

            parsed_el = xml.findall(
                'NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property'
            )
            characteristics = self.parse_attribute_values(
                parsed_el, 'WordCount')
            item['word_count'] = characteristics[0] if len(
                characteristics) else None

            # Extract the city and country for setting into the dateline.
            # Both are optional: a document without them keeps its dateline
            # untouched instead of failing the whole parse.
            city_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
            )
            country_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
            )
            if city_el is not None and country_el is not None:
                city = city_el.attrib.get('Value')
                # Anglicise the greek for Athens if required
                city = 'Athens' if city == 'Αθήνα' else city
                country = country_el.attrib.get('Value')
                # Normalise the country code
                country = 'GR' if country == 'GRC' else country

                cities = app.locators.find_cities()
                located = [
                    c for c in cities
                    if c['city'] == city and c['country_code'] == country
                ]
                # Only an unambiguous match is used.
                if len(located) == 1:
                    source = provider.get('source') if provider else None
                    # The dateline may not have been created by the helpers above.
                    dateline = item.setdefault('dateline', {})
                    dateline['located'] = located[0]
                    dateline['source'] = source
                    dateline['text'] = format_dateline_to_locmmmddsrc(
                        located[0], dateline.get('date'), source)
            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Esempio n. 18
0
    def parse(self, xml, provider=None):
        """Parse a NewsML 1.x document with NITF content into a single item dict.

        Fills source, transmission sequence, priority, identifier, newslines,
        management data, language, keywords, subjects, body HTML, word count,
        usage terms and genre.

        :param xml: root element of the NewsML document
        :param provider: ingest provider, forwarded to the raised error
        :return: the item as returned by ``populate_fields``
        :raises ParserError.newsmlOneParserError: on any failure
        """
        item = {}
        try:
            self.root = xml
            source_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source")
            # A Source element without a Party child simply provides no source name.
            party_el = source_el.find("Party") if source_el is not None else None
            if party_el is not None:
                item["original_source"] = party_el.get("FormalName", "")

            parsed_el = xml.find("NewsEnvelope/TransmissionId")
            if parsed_el is not None:
                item["ingest_provider_sequence"] = parsed_el.text

            parsed_el = xml.find("NewsEnvelope/Priority")
            # Compare with None explicitly: an element without children is
            # falsy, so a plain truth test would always discard the priority.
            item["priority"] = self.map_priority(parsed_el.text if parsed_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall() always returns a list, so no None check is needed.
            language = self.parse_attributes_as_dictionary(
                xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
            )
            item["language"] = language[0]["FormalName"] if len(language) else ""

            keywords = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
            item["keywords"] = self.parse_attribute_values(keywords, "Keyword")

            subjects = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
            subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
            subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")

            item["subject"] = self.format_subjects(subjects)

            # Serialise the NITF body and drop its wrapper element.
            item["body_html"] = (
                etree.tostring(
                    xml.find("NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content"),
                    encoding="unicode",
                )
                .replace("<body.content>", "")
                .replace("</body.content>", "")
            )

            parsed_el = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
            characteristics = self.parse_attribute_values(parsed_el, "Words")
            item["word_count"] = characteristics[0] if len(characteristics) else None

            parsed_el = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
            if parsed_el is not None:
                # Do not override usage terms set by the helpers above.
                item.setdefault("usageterms", parsed_el.text)

            item["genre"] = [
                {"name": el.get("FormalName")}
                for el in xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
            ]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)