Example #1
0
    def _update(self, provider):
        """Yield the parsed item of every sufficiently recent file in the provider folder.

        Generator. The folder is read from ``provider['config']['path']``; when it
        is missing the method logs and finishes without yielding anything.
        Every regular file is moved with ``success=True`` whether it was parsed or
        skipped as too old. On any exception the file is moved with
        ``success=False`` and a ``ParserError.parseFileError`` is raised.

        :param provider: ingest provider dict
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            # this function is a generator, so the value only ends the iteration
            return []

        filenames = get_sorted_files(self.path, sort_by=FileSortAttributes.created)
        for filename in filenames:
            try:
                filepath = os.path.join(self.path, filename)
                if not os.path.isfile(filepath):
                    # directories and other non-regular entries are left untouched
                    continue

                modified = datetime.fromtimestamp(os.lstat(filepath).st_mtime, tz=utc)
                if not self.is_latest_content(modified, provider.get('last_updated')):
                    # too old to ingest: moved away without being parsed
                    self.move_file(self.path, filename, provider=provider, success=True)
                    continue

                item = self.parser.parse_file(filepath, provider)
                dpa_derive_dateline(item)

                # the file is moved before the item is handed to the consumer
                self.move_file(self.path, filename, provider=provider, success=True)
                yield [item]
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('DPA', filename, ex, provider)
Example #2
0
    def _update(self, provider, update):
        """Parse the recent files found in the provider folder and yield their items.

        Generator. After each yield, the value sent back by the consumer is read
        as a failure flag: the file is moved with ``success=not failed``.
        Files that are not recent enough are moved with ``success=True`` without
        being parsed.

        :param provider: ingest provider dict; ``provider['config']['path']`` is the folder to scan
        :param update: not used by this implementation
        :raises ParserError.parseFileError: when handling a file raises any exception
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; warning() is the supported spelling
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            # this function is a generator, so the value only ends the iteration
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # reset per file so the except handler never sees a stale timestamp
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML feeds: the parser is looked up again from the parsed root
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(
                                    provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)

                        # always hand a list to the consumer
                        if isinstance(item, list):
                            failed = yield item
                        else:
                            failed = yield [item]

                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=not failed)
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                # only files already considered old are moved to the error folder here
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')
Example #3
0
    def parse_file(self, filename, provider):
        """Parse a single file located in the provider's configured folder.

        :param filename: name of the file inside the configured folder
        :param provider: ingest provider dict; the folder is ``provider['config']['path']``
        :return: one-element list with the parsed item, or ``[]`` when no folder is configured
        :raises ParserError.parseFileError: when anything in the lookup or parsing fails
        """
        try:
            folder = provider.get('config', {}).get('path', None)

            if folder:
                full_path = os.path.join(folder, filename)
                return [self.parser.parse_file(full_path, provider)]

            return []
        except Exception as ex:
            raise ParserError.parseFileError('Teletype', filename, ex, provider)
Example #4
0
    def _update(self, provider, update):
        """Parse the recent files found in the provider folder and yield their items.

        Generator. Before scanning, a still-configured deprecated
        ``FILE_INGEST_OLD_CONTENT_MINUTES`` setting is copied over the new
        ``OLD_CONTENT_MINUTES`` setting (with a warning) when the two differ.

        After each yield, the value sent back by the consumer is read as a failure
        flag: the file is moved with ``success=not failed``. Files that are not
        recent enough are moved with ``success=False`` without being parsed.

        :param provider: ingest provider dict; ``provider['config']['path']`` is the folder to scan
        :param update: not used by this implementation
        :raises ParserError.parseFileError: when handling a file raises any exception
        """
        # check if deprecated FILE_INGEST_OLD_CONTENT_MINUTES setting is still used
        if "FILE_INGEST_OLD_CONTENT_MINUTES" in app.config:
            deprecated_cont_min = app.config["FILE_INGEST_OLD_CONTENT_MINUTES"]
            cont_min = app.config[OLD_CONTENT_MINUTES]
            if deprecated_cont_min != cont_min:
                logger.warning(
                    "'FILE_INGEST_OLD_CONTENT_MINUTES' is deprecated, please update settings.py to use {new_name!r}"
                    .format(new_name=OLD_CONTENT_MINUTES))
                # the deprecated value wins so existing deployments keep their behaviour
                app.config[OLD_CONTENT_MINUTES] = deprecated_cont_min

        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # same logging call as above; Logger.warn is a deprecated alias
            logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                           .format(provider['name']))
            # this function is a generator, so the value only ends the iteration
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                # reset per file so the except handler never sees a stale timestamp
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    last_updated = self.get_last_updated(file_path)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML feeds: the parser is looked up again from the parsed root
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)

                        # always hand a list to the consumer
                        if isinstance(item, list):
                            failed = yield item
                        else:
                            failed = yield [item]

                        self.move_file(self.path, filename, provider=provider, success=not failed)
                    else:
                        self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                # only files already considered old are moved to the error folder here
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')
Example #5
0
    def parse_file(self, filename, provider):
        """Run the service parser on ``filename`` inside the provider's folder.

        :param filename: file name relative to ``provider['config']['path']``
        :param provider: ingest provider dict
        :return: ``[item]`` for the parsed file, ``[]`` if the provider has no path
        :raises ParserError.parseFileError: wrapping any exception raised on the way
        """
        try:
            config = provider.get('config', {})
            base_dir = config.get('path', None)

            if not base_dir:
                return []

            parsed = self.parser.parse_file(os.path.join(base_dir, filename), provider)
        except Exception as ex:
            raise ParserError.parseFileError('Teletype', filename, ex, provider)

        return [parsed]
Example #6
0
    def parse_file(self, filename, provider):
        """Parse one XML file from the provider's folder with the service parser.

        Stores the configured folder on ``self.path``. On any failure the file is
        moved with ``success=False`` before the error is raised.

        :param filename: file name inside ``provider['config']['path']``
        :param provider: ingest provider dict
        :return: ``[item]`` for the parsed file, ``[]`` if the provider has no path
        :raises ParserError.parseFileError: wrapping any exception raised on the way
        """
        try:
            self.path = provider.get('config', {}).get('path', None)

            if not self.path:
                return []

            # Read bytes, not text: the XML parser must see the raw document so it
            # can honour the encoding declaration (lxml rejects str input that
            # carries one, and text mode would decode with the locale default).
            with open(os.path.join(self.path, filename), 'rb') as f:
                item = self.parser.parse_message(etree.fromstring(f.read()), provider)

            return [item]
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.parseFileError('AAP', filename, ex, provider)
Example #7
0
    def parse_titles(self, titles):
        """Map configured field names to their column position in ``titles``.

        Names are compared case-insensitively and with surrounding whitespace
        stripped; when a title occurs more than once its first position is used.

        :param titles: iterable of column title strings
        :return: dict of field name -> column index
        :raises ParserError.parseFileError: when a field of ``self.titles`` has no column
        """
        # first position of every normalized column title
        positions = {}
        for column, title in enumerate([raw.lower().strip() for raw in titles]):
            positions.setdefault(title, column)

        index = {}
        for field in self.titles:
            wanted = field.lower().strip()
            if wanted not in positions:
                raise ParserError.parseFileError()
            index[field] = positions[wanted]

        # generate_fields may not present when testing config
        for field in self.generate_fields:
            wanted = field.lower().strip()
            if wanted in positions:
                index[field] = positions[wanted]

        return index
Example #8
0
 def test_raise_parseFileError(self):
     # Wrapping an exception with ParserError.parseFileError must keep the
     # original exception and log a single error line carrying code, source and file.
     with assert_raises(ParserError) as error_context:
         try:
             raise Exception("Testing parseFileError")
         except Exception as ex:
             raise ParserError.parseFileError('afp', 'test.txt', ex, self.provider)
     exception = error_context.exception
     # assertEqual (rather than assertTrue(a == b)) reports both values on failure
     self.assertEqual(exception.code, 1002)
     self.assertEqual(exception.message, "Ingest file could not be parsed")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing parseFileError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     message = self.mock_logger_handler.messages['error'][0]
     self.assertIn("ParserError Error 1002 - Ingest file could not be parsed", message)
     self.assertIn("Testing parseFileError on channel TestProvider", message)
     self.assertIn("source=afp", message)
     self.assertIn("file=test.txt", message)
Example #9
0
    def parse_file(self, filename, provider):
        """Parse one XML file from the provider's folder and normalize its dates.

        ``firstcreated`` and ``versioncreated`` of the parsed item are passed
        through ``normalize_date`` with ``self.tz``. On failure the file is moved
        with ``success=False`` before the error is raised.

        :param filename: file name inside ``provider['config']['path']``
        :param provider: ingest provider dict
        :return: ``[item]`` for the parsed file, ``[]`` if the provider has no path
        :raises ParserError.parseFileError: wrapping any exception raised on the way
        """
        # bound up front so the except handler can always refer to it
        path = None
        try:
            path = provider.get('config', {}).get('path', None)

            if not path:
                return []

            # Read bytes, not text: the XML parser must see the raw document so it
            # can honour the encoding declaration (lxml rejects str input that
            # carries one, and text mode would decode with the locale default).
            with open(os.path.join(path, filename), 'rb') as f:
                item = self.parser.parse_message(etree.fromstring(f.read()), provider)

            item['firstcreated'] = normalize_date(item.get('firstcreated'), self.tz)
            item['versioncreated'] = normalize_date(item.get('versioncreated'), self.tz)

            return [item]
        except Exception as ex:
            # Use the folder resolved above: this method never assigns self.path,
            # so relying on it could fail or point at another provider's folder.
            if path:
                self.move_file(path, filename, provider=provider, success=False)
            raise ParserError.parseFileError('AAP', filename, ex, provider)
Example #10
0
 def test_raise_parseFileError(self):
     # Wrapping an exception with ParserError.parseFileError must keep the
     # original exception and log two error lines: the file notice, then the
     # formatted ParserError.
     with assert_raises(ParserError) as error_context:
         try:
             raise Exception("Testing parseFileError")
         except Exception as ex:
             raise ParserError.parseFileError('afp', 'test.txt', ex, self.provider)
     exception = error_context.exception
     # assertEqual (rather than assertTrue(a == b)) reports both values on failure
     self.assertEqual(exception.code, 1002)
     self.assertEqual(exception.message, "Ingest file could not be parsed")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing parseFileError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 2)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "Source Type: afp - File: test.txt could not be processed")
     self.assertEqual(self.mock_logger_handler.messages['error'][1],
                      "ParserError Error 1002 - Ingest file could not be parsed: "
                      "Testing parseFileError on channel TestProvider")
Example #11
0
    def _update(self, provider):
        """Parse the recent files found in the provider folder and yield their items.

        Generator. Every regular file is moved with ``success=True`` (before its
        item is yielded, or right away when it is not recent enough). On any
        exception the file is moved with ``success=False`` and the error is raised.

        :param provider: ingest provider dict; ``provider["config"]["path"]`` is the folder to scan
        :raises ParserError.parseFileError: when handling a file raises any exception
        """
        self.provider = provider
        self.path = provider.get("config", {}).get("path", None)

        if not self.path:
            # Logger.warn is a deprecated alias; warning() is the supported spelling
            logger.warning(
                "File Feeding Service {} is configured without path. Please check the configuration".format(
                    provider["name"]
                )
            )
            # this function is a generator, so the value only ends the iteration
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                    if self.is_latest_content(last_updated, provider.get("last_updated")):
                        if isinstance(registered_parser, XMLFeedParser):
                            # Binary mode lets the XML parser apply the document's own
                            # encoding declaration instead of the locale default.
                            with open(file_path, "rb") as f:
                                xml = ElementTree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path, filename, provider=provider, success=True)

                        # always hand a list to the consumer
                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError("{}-{}".format(provider["name"], self.NAME), filename, ex, provider)

        push_notification("ingest:update")
Example #12
0
    def parse_file(self, filename, provider):
        """Parse a file to be ingested from the provider's configured folder.

        The folder found in the provider's config is prepended to ``filename``
        and the result is handed to the underlying ``parse_file`` of the parser.

        :param filename: name of the file inside the configured folder
        :param provider: ingest provider dict
        :return: one-element list with the parsed item, ``[]`` when no path is configured
        :raises ParserError.parseFileError: wrapping any exception raised on the way
        """
        try:
            directory = provider.get('config', {}).get('path', None)
            if directory:
                target = os.path.join(directory, filename)
                result = [self.parser.parse_file(target, provider)]
            else:
                result = []
            return result
        except Exception as ex:
            raise ParserError.parseFileError('DPA', filename, ex, provider)
Example #13
0
 def test_raise_parseFileError(self):
     # Wrapping an exception with ParserError.parseFileError must keep the
     # original exception and log two error lines: the file notice, then the
     # formatted ParserError.
     with assert_raises(ParserError) as error_context:
         try:
             raise Exception("Testing parseFileError")
         except Exception as ex:
             raise ParserError.parseFileError('afp', 'test.txt', ex,
                                              self.provider)
     exception = error_context.exception
     # assertEqual (rather than assertTrue(a == b)) reports both values on failure
     self.assertEqual(exception.code, 1002)
     self.assertEqual(exception.message, "Ingest file could not be parsed")
     self.assertIsNotNone(exception.system_exception)
     # assertEquals was a deprecated alias and no longer exists in Python 3.12+
     self.assertEqual(exception.system_exception.args[0],
                      "Testing parseFileError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 2)
     self.assertEqual(
         self.mock_logger_handler.messages['error'][0],
         "Source Type: afp - File: test.txt could not be processed")
     self.assertEqual(
         self.mock_logger_handler.messages['error'][1],
         "ParserError Error 1002 - Ingest file could not be parsed: "
         "Testing parseFileError on channel TestProvider")
Example #14
0
    def _update(self, provider):
        """Parse the recent files found in the provider folder and yield their items.

        Generator. Every regular file is moved with ``success=True`` (before its
        item is yielded, or right away when it is not recent enough). On any
        exception the file is moved with ``success=False`` and the error is raised.

        :param provider: ingest provider dict; ``provider['config']['path']`` is the folder to scan
        :raises ParserError.parseFileError: when handling a file raises any exception
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; warning() is the supported spelling
            logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                           .format(provider['name']))
            # this function is a generator, so the value only ends the iteration
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # Read bytes, not text: the XML parser must see the raw
                            # document to honour its encoding declaration (lxml rejects
                            # str input that carries one).
                            with open(file_path, 'rb') as f:
                                xml_root = etree.fromstring(f.read())
                                parser = self.get_feed_parser(provider, xml_root)
                                item = parser.parse(xml_root, provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path, filename, provider=provider, success=True)

                        yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')
Example #15
0
    def _update(self, provider):
        """Yield the parsed item of every sufficiently recent file in the provider folder.

        Generator. Without a configured ``provider['config']['path']`` the method
        logs and finishes without yielding. Each regular file is moved with
        ``success=True`` whether it was parsed or skipped as too old. On any
        exception the file is moved with ``success=False`` and a
        ``ParserError.parseFileError`` is raised.

        :param provider: ingest provider dict
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            # this function is a generator, so the value only ends the iteration
            return []

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if not os.path.isfile(filepath):
                    continue

                mtime = os.lstat(filepath).st_mtime
                last_updated = datetime.fromtimestamp(mtime, tz=utc)
                is_fresh = self.is_latest_content(last_updated, provider.get('last_updated'))

                # only fresh files are parsed, but every file is moved away
                item = self.parser.parse_file(filepath, provider) if is_fresh else None
                self.move_file(self.path, filename, provider=provider, success=True)

                if is_fresh:
                    yield [item]
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('Teletype', filename, ex, provider)
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).

    Files are taken from the folder configured on the provider, parsed with the
    provider's feed parser and afterwards moved into a ``_PROCESSED`` or
    ``_ERROR`` sub-folder of that folder.
    """

    # identifier of this feeding service; also used in the source label of parse errors
    NAME = 'file'

    # descriptions of the errors this service declares
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File Feed'

    # configuration fields of the service; the error codes map to messages for the path field
    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _test(self, provider):
        """Check that the configured path exists and is a directory.

        :param provider: ingest provider dict
        :raises IngestFileError.notExistsError: path is missing from the config or not found
        :raises IngestFileError.isNotDirError: path exists but is not a directory
        """
        path = provider.get('config', {}).get('path', None)
        # os.path.exists(None) raises TypeError, so a missing path is reported
        # explicitly as "not found"
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()
        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Parse the recent files found in the provider folder and yield their items.

        Generator. Every handled file is moved with ``success=True`` (before its
        item is yielded, or right away when it is not recent enough).

        :param provider: ingest provider dict; ``provider['config']['path']`` is the folder to scan
        :param update: not used by this implementation
        :raises ParserError.parseFileError: when handling a file raises any exception
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; warning() is the supported spelling
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            # this function is a generator, so the value only ends the iteration
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # reset per file so the except handler never sees a stale timestamp
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML feeds: the parser is looked up again from the parsed root
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(
                                    provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)

                        # always hand a list to the consumer
                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                # only files already considered old are moved to the error folder here
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the file from the current directory to _PROCESSED if successful, else to _ERROR.

        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.
        The file is copied to the target directory and then removed from the current directory.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """

        try:
            # exist_ok avoids the check-then-create race of a separate exists() test
            os.makedirs(os.path.join(file_path, "_PROCESSED/"), exist_ok=True)
            os.makedirs(os.path.join(file_path, "_ERROR/"), exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        try:
            if success:
                shutil.copy2(os.path.join(file_path, filename),
                             os.path.join(file_path, "_PROCESSED/"))
            else:
                shutil.copy2(os.path.join(file_path, filename),
                             os.path.join(file_path, "_ERROR/"))
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source file is removed even when the copy failed;
            # presumably intentional so a broken file is not picked up again - confirm.
            os.remove(os.path.join(file_path, filename))

    def is_latest_content(self, last_updated, provider_last_updated=None):
        """
        Parse file only if it's not older than provider last update -10m

        :param last_updated: file last updated datetime
        :param provider_last_updated: provider last update datetime; when falsy, 7 days before now is used
        :return: True if the file is newer than the provider last update minus 10 minutes
        """

        if not provider_last_updated:
            provider_last_updated = utcnow() - timedelta(days=7)

        return provider_last_updated - timedelta(minutes=10) < last_updated

    def is_old_content(self, last_updated):
        """Test if file is old so it wouldn't probably work in is_latest_content next time.

        Such files can be moved to `_ERROR` folder, it wouldn't be ingested anymore.

        :param last_updated: file last updated datetime
        :return: True if the file was last updated more than 10 minutes ago
        """
        return last_updated < utcnow() - timedelta(minutes=10)
Example #17
0
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).

    Files are taken from the folder configured on the provider, parsed with the
    provider's feed parser and afterwards moved into a ``_PROCESSED`` or
    ``_ERROR`` sub-folder of that folder.
    """

    # identifier of this feeding service; also used in the source label of parse errors
    NAME = 'file'

    # descriptions of the errors this service declares
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File feed'

    # configuration fields of the service; the error codes map to messages for the path field
    fields = [
        {
            'id': 'path', 'type': 'text', 'label': 'Server Folder',
            'placeholder': 'path to folder', 'required': True,
            'errors': {3003: 'Path not found on server.', 3004: 'Path should be directory.'}
        }
    ]

    def _test(self, provider):
        """Check that the configured path exists and is a directory.

        :param provider: ingest provider dict
        :raises IngestFileError.notExistsError: path is missing from the config or not found
        :raises IngestFileError.isNotDirError: path exists but is not a directory
        """
        path = provider.get('config', {}).get('path', None)
        # os.path.exists(None) raises TypeError, so a missing path is reported
        # explicitly as "not found"
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()
        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Parse the recent files found in the provider folder and yield their items.

        Generator. Before scanning, a still-configured deprecated
        ``FILE_INGEST_OLD_CONTENT_MINUTES`` setting is copied over the new
        ``OLD_CONTENT_MINUTES`` setting (with a warning) when the two differ.

        After each yield, the value sent back by the consumer is read as a failure
        flag: the file is moved with ``success=not failed``. Files that are not
        recent enough are moved with ``success=False`` without being parsed.

        :param provider: ingest provider dict; ``provider['config']['path']`` is the folder to scan
        :param update: not used by this implementation
        :raises ParserError.parseFileError: when handling a file raises any exception
        """
        # check if deprecated FILE_INGEST_OLD_CONTENT_MINUTES setting is still used
        if "FILE_INGEST_OLD_CONTENT_MINUTES" in app.config:
            deprecated_cont_min = app.config["FILE_INGEST_OLD_CONTENT_MINUTES"]
            cont_min = app.config[OLD_CONTENT_MINUTES]
            if deprecated_cont_min != cont_min:
                logger.warning(
                    "'FILE_INGEST_OLD_CONTENT_MINUTES' is deprecated, please update settings.py to use {new_name!r}"
                    .format(new_name=OLD_CONTENT_MINUTES))
                # the deprecated value wins so existing deployments keep their behaviour
                app.config[OLD_CONTENT_MINUTES] = deprecated_cont_min

        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # same logging call as above; Logger.warn is a deprecated alias
            logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                           .format(provider['name']))
            # this function is a generator, so the value only ends the iteration
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                # reset per file so the except handler never sees a stale timestamp
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    last_updated = self.get_last_updated(file_path)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML feeds: the parser is looked up again from the parsed root
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)

                        # always hand a list to the consumer
                        if isinstance(item, list):
                            failed = yield item
                        else:
                            failed = yield [item]

                        self.move_file(self.path, filename, provider=provider, success=not failed)
                    else:
                        self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                # only files already considered old are moved to the error folder here
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the file from the current directory to _PROCESSED if successful, else to _ERROR.

        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.
        The file is copied to the target directory and then removed from the current directory.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """

        try:
            # exist_ok avoids the check-then-create race of a separate exists() test
            os.makedirs(os.path.join(file_path, "_PROCESSED/"), exist_ok=True)
            os.makedirs(os.path.join(file_path, "_ERROR/"), exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        try:
            if success:
                shutil.copy2(os.path.join(file_path, filename), os.path.join(file_path, "_PROCESSED/"))
            else:
                shutil.copy2(os.path.join(file_path, filename), os.path.join(file_path, "_ERROR/"))
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source file is removed even when the copy failed;
            # presumably intentional so a broken file is not picked up again - confirm.
            os.remove(os.path.join(file_path, filename))

    def get_last_updated(self, file_path):
        """Get last updated time for file.

        Using both mtime and ctime timestamps not to miss
        old files being copied around and recent files after
        changes done in place.

        :param file_path: path of the file to inspect
        :return: timezone-aware datetime (``utc``) of the later of the two timestamps
        """
        stat = os.lstat(file_path)
        timestamp = max(stat.st_mtime, stat.st_ctime)
        return datetime.fromtimestamp(timestamp, tz=utc)
Example #18
0
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which can read the configured local file system for event file(s).

    How a file is read depends on the parser registered for the provider:
    ``NTBEventXMLFeedParser`` gets the parsed XML root, ``IcsTwoFeedParser``
    gets the parsed calendar, any other parser gets the file path.
    """

    NAME = 'event_file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event File Feed'
    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    def _update(self, provider, update):
        """Yield the items parsed from every file found in the provider's folder.

        Files are visited in creation order. A file whose modification time
        passes ``is_latest_content`` is parsed, handed to ``after_extracting``,
        moved as processed and then yielded; any other file is moved as
        processed without being parsed.

        :param provider: dict - ingest provider; the folder is read from ``provider['config']['path']``
        :param update: not used by this method
        :return: generator yielding one list of items per ingested file
        :raises ParserError.parseFileError: if anything fails while handling a file;
            the file is moved as failed first, but only when its timestamp was
            read and ``is_old_content`` reports it as old
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # ``Logger.warn`` is a deprecated alias which newer Python versions
            # no longer provide; ``warning`` is the supported spelling.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # Reset for every file so the error handler below never sees
                # the timestamp of a previous file.
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        if isinstance(registered_parser,
                                      NTBEventXMLFeedParser):
                            logger.info('Ingesting xml events')
                            with open(file_path, 'rb') as f:
                                xml = ElementTree.parse(f)
                                parser = self.get_feed_parser(
                                    provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        elif isinstance(registered_parser, IcsTwoFeedParser):
                            logger.info('Ingesting ics events')
                            with open(file_path, 'rb') as f:
                                cal = Calendar.from_ical(f.read())
                                parser = self.get_feed_parser(provider, cal)
                                item = parser.parse(cal, provider)
                        else:
                            logger.info('Ingesting events with unknown parser')
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)

                        # Callers always receive a list, whatever the parser returned.
                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                # Only old files are moved as failed; other files stay in
                # place, as does any file whose timestamp was never read.
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')
Example #19
0
from datetime import datetime
from .nitf import NITFParser
from superdesk.io.file_ingest_service import FileIngestService
from superdesk.utc import utc, timezone
from superdesk.notification import push_notification
from superdesk.io import register_provider
from ..etree import etree, ParseError as etreeParserError
from superdesk.utils import get_sorted_files, FileSortAttributes
from superdesk.errors import ParserError, ProviderError


# Logger named after this module.
logger = logging.getLogger(__name__)

# Provider key of this ingest service.
PROVIDER = 'aap'

# Descriptions of the errors this ingest service may report, built by calling
# each error factory once, in the order listed.
errors = [
    error_factory().get_error_description()
    for error_factory in (
        ParserError.nitfParserError,
        ProviderError.ingestError,
        ParserError.parseFileError,
    )
]


class AAPIngestService(FileIngestService):
    """File ingest service for AAP content, parsed with ``NITFParser``."""

    def __init__(self):
        """Set up the Sydney time zone and the NITF parser used by this service."""
        self.tz = timezone('Australia/Sydney')
        self.parser = NITFParser()

    def prepare_href(self, href):
        """Return ``href`` exactly as given."""
        return href

    def _update(self, provider):
        """Remember the provider and the folder path from its configuration.

        :param provider: dict - ingest provider; the path is read from ``provider['config']['path']``
        """
        self.provider = provider
        config = provider.get('config', {})
        self.path = config.get('path')
Example #20
0
    def parse(self, xml, provider=None):
        """Build a single item from the preferred ``NewsItem`` of a NewsML document.

        The item starts from fixed category, genre, subject and editorial note
        values, is completed by ``populate_fields`` and is then mapped from the
        selected ``NewsItem`` with ``do_mapping``.

        :param xml: root element of the NewsML document; it is deep-copied, so the caller's tree is not modified
        :param provider: ingest provider, only used when reporting an error
        :return: list holding the single parsed item
        :raises ParserError.newsmlOneParserError: if anything fails while parsing,
            including the case where no ``NewsItem`` with a language is found
        """
        item = {
            'versioncreated':
            utcnow(),
            'anpa_category': [{
                "name": "Formidlingstjenester",
                "qcode": "r"
            }],
            'genre': [{
                "name": "Fulltekstmeldinger",
                "qcode": "Fulltekstmeldinger",
                "scheme": "genre_custom"
            }],
            'subject': [{
                'qcode': 'Børsmelding',
                'name': 'Børsmelding',
                'scheme': 'category'
            }],
            'ednote':
            '*** Dette er en børsmelding formidlet av NTB pva. andre ***'
        }
        self.populate_fields(item)

        try:
            # we remove newsml namespace for convenience (to avoid to write prefix each time)
            # we deepcopy first to avoid modifying original item
            xml = deepcopy(xml)
            for elt in xml.iter():
                elt.tag = elt.tag.replace('{' + NEWSML_NS + '}', '')
            news_items = xml.findall('NewsItem')

            # there may be several items (for different languages), we keep in order of
            # preference: Norwegian, English, first item (cf. SDNTB-573)
            selected = None
            for news_item in news_items:
                try:
                    lang = news_item.xpath(
                        'NewsComponent/DescriptiveMetadata/Language/@FormalName',
                    )[0]
                except IndexError:
                    logger.warning(
                        "missing language in item, ignoring it.\nxml: {xml}".
                        format(
                            xml=etree.tostring(news_item, encoding="unicode")))
                    continue

                if selected is None or lang in ('no', 'en'):
                    selected = news_item

                # Norwegian is the top preference, nothing later can beat it
                if lang == 'no':
                    break

            if selected is None:
                # Log the whole document: the loop variable is unbound when the
                # document holds no NewsItem at all, and would otherwise only
                # show the last item that was skipped.
                logger.warning("can't find any valid item\nxml={xml}".format(
                    xml=etree.tostring(xml, encoding="unicode")))
                raise ParserError.parseFileError(
                    source=etree.tostring(xml, encoding="unicode"))

            self.do_mapping(item, selected)
            return [item]
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
 def parse(self, image_path, provider=None):
     """Parse the image at ``image_path`` and return what ``parse_item`` produces.

     :param image_path: path handed to ``parse_item``
     :param provider: ingest provider, only used when reporting an error
     :raises ParserError.parseFileError: if ``parse_item`` raises
     """
     try:
         return self.parse_item(image_path)
     except Exception as ex:
         raise ParserError.parseFileError(exception=ex, provider=provider)
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which can read the configured local file system for event file(s).

    The parser is looked up per file; parsers offering ``parse_file`` receive
    the opened file, all others receive the file path through ``parse``.
    """

    NAME = 'event_file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event file feed'
    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    # Configuration fields of the provider, with the messages shown for the
    # listed error codes.
    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Event File Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _update(self, provider, update):
        """Yield the items parsed from every file found in the provider's folder.

        Files are visited in creation order. A file whose modification time
        passes ``is_latest_content`` is parsed, handed to ``after_extracting``,
        moved as processed and then yielded; any other file is moved as
        processed without being parsed.

        :param provider: dict - ingest provider; the folder is read from ``provider['config']['path']``
        :param update: not used by this method
        :return: generator yielding one list of items per ingested file
        :raises ParserError.parseFileError: if anything fails while handling a file;
            the file is moved as failed first, but only when its timestamp was
            read and ``is_old_content`` reports it as old
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # ``Logger.warn`` is a deprecated alias which newer Python versions
            # no longer provide; ``warning`` is the supported spelling.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # Reset for every file so the error handler below never sees
                # the timestamp of a previous file.
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        parser = self.get_feed_parser(provider, file_path)
                        logger.info('Ingesting events with {} parser'.format(
                            parser.__class__.__name__))
                        # Look the method up with a default: without one,
                        # getattr raises for parsers lacking ``parse_file``
                        # and the ``parse`` fallback could never run.
                        parse_file = getattr(parser, 'parse_file', None)
                        if parse_file:
                            with open(file_path, 'rb') as f:
                                item = parse_file(f, provider)
                        else:
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)

                        # Callers always receive a list, whatever the parser returned.
                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                # Only old files are moved as failed; other files stay in
                # place, as does any file whose timestamp was never read.
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')
Example #23
0
 def parse(self, image_path, provider=None):
     """Parse the image at ``image_path`` and return what ``parse_item`` produces.

     :param image_path: path handed to ``parse_item``
     :param provider: ingest provider, only used when reporting an error
     :raises ParserError.parseFileError: if ``parse_item`` raises
     """
     try:
         return self.parse_item(image_path)
     except Exception as ex:
         raise ParserError.parseFileError(exception=ex, provider=provider)
Example #24
0
class SpreadsheetFeedingService(FeedingService):
    """Feeding service which ingests events from a worksheet of a Google spreadsheet."""

    NAME = 'spreadsheet'
    service = 'events'
    ERRORS = [
        IngestApiError.apiNotFoundError().get_error_description(),
        ParserError.parseFileError().get_error_description(),
        IngestSpreadsheetError.SpreadsheetPermissionError(
        ).get_error_description(),
        IngestSpreadsheetError.SpreadsheetQuotaLimitError(
        ).get_error_description(),
        IngestSpreadsheetError.SpreadsheetCredentialsError().
        get_error_description(),
        IngestSpreadsheetError.WorksheetNotFoundError().get_error_description(
        ),
    ]

    label = 'Events from Google Documents Spreadsheet'

    # Configuration fields of the provider, with the messages shown for the
    # listed error codes.
    fields = [
        {
            'id': 'service_account',
            'type': 'text',
            'label': 'Service account',
            'required': True,
            'errors': {
                15300: 'Invalid service account key'
            },
        },
        {
            'id': 'url',
            'type': 'text',
            'label': 'Source',
            'placeholder': 'Google Spreadsheet URL',
            'required': True,
            'errors': {
                1001: 'Can\'t parse spreadsheet.',
                1002: 'Can\'t parse spreadsheet.',
                4006: 'URL not found.',
                15100: 'Missing write permission while processing file',
                15200: 'Server reaches read quota limits.'
            }
        },
        {
            'id': 'worksheet_title',
            'type': 'text',
            'label': 'Sheet title',
            'placeholder': 'Title / Name of sheet',
            'required': True,
            'errors': {
                15400: 'Sheet not found'
            }
        },
    ]

    def _test(self, provider):
        """Check the provider by fetching its worksheet and parsing the title row.

        :param provider: dict - ingest provider holding the spreadsheet configuration
        """
        worksheet = self._get_worksheet(provider)
        data = worksheet.get_all_values()
        BelgaSpreadsheetParser().parse_titles(data[0])

    def _update(self, provider, update):
        """Load items from google spreadsheet and insert (update) to events database

        If STATUS field is empty, create new item
        If STATUS field is UPDATED, update item

        The cells returned by the parser are written back to the worksheet
        only once the caller resumes the generator after the yielded items.

        :param provider: dict - ingest provider holding the spreadsheet configuration
        :param update: not used by this method
        :return: generator yielding a single list of event items
        """
        worksheet = self._get_worksheet(provider)

        # Get all values to avoid reaching read limit
        data = worksheet.get_all_values()
        titles = [s.lower().strip() for s in data[0]]

        # avoid maximum limit cols error
        total_col = worksheet.col_count
        if total_col < len(titles) + 3:
            worksheet.add_cols(len(titles) + 3 - total_col)

        # Make sure the three bookkeeping columns exist, adding a header cell
        # for each one that is missing.
        for field in ('_STATUS', '_ERR_MESSAGE', '_GUID'):
            if field.lower() not in titles:
                titles.append(field)
                worksheet.update_cell(1, len(titles), field)
        data[0] = titles  # pass to parser uses for looking up index

        parser = BelgaSpreadsheetParser()
        items, cells_list = parser.parse(data, provider)
        items = self._process_event_items(items, provider)
        # add ingest item
        yield items
        # Update status for google sheet
        if cells_list:
            worksheet.update_cells(cells_list)

    def _get_worksheet(self, provider):
        """Get worksheet from google spreadsheet

        :param provider: dict - ingest provider; ``url``, ``service_account`` and
            ``worksheet_title`` are read from ``provider['config']``
        :return: worksheet
        :rtype: object
        :raises IngestSpreadsheetError.SpreadsheetPermissionError: if the first listed
            permission is not ``writer``, or the API answers with code 403
        :raises IngestSpreadsheetError.SpreadsheetCredentialsError: if the service
            account key cannot be decoded or used
        :raises IngestSpreadsheetError.SpreadsheetQuotaLimitError: if the API answers with code 429
        :raises IngestSpreadsheetError.WorksheetNotFoundError: if no worksheet has the configured title
        :raises IngestApiError.apiNotFoundError: if the URL holds no spreadsheet key,
            or the API answers with any other error code
        """
        scope = [
            'https://spreadsheets.google.com/feeds',
            'https://www.googleapis.com/auth/drive',
        ]
        config = provider.get('config', {})
        url = config.get('url', '')
        service_account = config.get('service_account', '')
        title = config.get('worksheet_title', '')

        try:
            service_account = json.loads(service_account)
            credentials = ServiceAccountCredentials.from_json_keyfile_dict(
                service_account, scope)
            gc = gspread.authorize(credentials)
            spreadsheet = gc.open_by_url(url)
            permission = spreadsheet.list_permissions()[0]
            if permission['role'] != 'writer':
                raise IngestSpreadsheetError.SpreadsheetPermissionError()
            worksheet = spreadsheet.worksheet(title)
            return worksheet
        except (json.decoder.JSONDecodeError, AttributeError, ValueError) as e:
            # both permission and credential raise Value error
            # Guard against exceptions raised without arguments, which would
            # otherwise fail here with an IndexError instead of being reported
            # as a credentials problem.
            if e.args and e.args[0] == 15100:
                raise IngestSpreadsheetError.SpreadsheetPermissionError()
            raise IngestSpreadsheetError.SpreadsheetCredentialsError()
        except gspread.exceptions.NoValidUrlKeyFound:
            raise IngestApiError.apiNotFoundError()
        except gspread.exceptions.WorksheetNotFound:
            raise IngestSpreadsheetError.WorksheetNotFoundError()
        except gspread.exceptions.APIError as e:
            error = e.response.json()['error']
            response_code = error['code']
            logger.error('Provider %s: %s', provider.get('name'),
                         error['message'])
            if response_code == 403:
                raise IngestSpreadsheetError.SpreadsheetPermissionError()
            elif response_code == 429:
                raise IngestSpreadsheetError.SpreadsheetQuotaLimitError()
            else:
                raise IngestApiError.apiNotFoundError()

    def _process_event_items(self, items, provider):
        """Store the contacts and locations of the parsed items and pick the events to ingest.

        An item whose guid is not stored yet is kept only when its status is
        empty; an item whose guid is already stored is merged into the stored
        event, and that merged event is kept instead.

        :param items: parsed items; ``status`` and ``contact`` are removed from each of them
        :param provider: not used by this method
        :return: list of the events to ingest
        """
        events_service = superdesk.get_resource_service('events')
        list_items = []
        for item in items:
            status = item.pop('status')
            location = item.get('location')
            if item.get('contact'):
                contact = item.pop('contact')
                contact_service = superdesk.get_resource_service('contacts')
                _contact = contact_service.find_one(
                    req=None,
                    **{
                        'first_name':
                        contact['first_name'],
                        'last_name':
                        contact['last_name'],
                        'organisation':
                        contact['organisation'],
                        'contact_email':
                        contact['contact_email'][0],
                        'contact_phone.number':
                        contact['contact_phone'][0]['number'],
                    })
                # A stored contact is reused and patched only for updated
                # rows; in every other case a new contact is posted.
                if _contact and status == 'UPDATED':
                    item.setdefault('event_contact_info',
                                    [_contact[superdesk.config.ID_FIELD]])
                    contact_service.patch(_contact[superdesk.config.ID_FIELD],
                                          contact)
                else:
                    item.setdefault('event_contact_info',
                                    list(contact_service.post([contact])))

            if location:
                location_service = superdesk.get_resource_service('locations')
                saved_location = list(
                    location_service.find({
                        'name':
                        location[0]['name'],
                        'address.line':
                        location[0]['address']['line'],
                        'address.country':
                        location[0]['address']['country'],
                    }))
                if saved_location and status == 'UPDATED':
                    location_service.patch(
                        saved_location[0][superdesk.config.ID_FIELD],
                        location[0])
                elif not saved_location:
                    # Post a copy and take the guid from it, leaving the rest
                    # of the item's own location data as parsed.
                    _location = deepcopy(location)
                    location_service.post(_location)
                    item['location'][0]['qcode'] = _location[0]['guid']

            old_item = events_service.find_one(guid=item[GUID_FIELD], req=None)
            if not old_item:
                if not status:
                    item.setdefault('firstcreated', datetime.now())
                    item.setdefault('versioncreated', datetime.now())
                    list_items.append(item)
            else:
                old_item.update(item)
                list_items.append(old_item)
        return list_items
Example #25
0
class TeletypeIngestService(FileIngestService):
    """File ingest service which parses the files of a folder with ``ZCZCParser``."""

    PROVIDER = 'teletype'

    ERRORS = [
        ParserError.ZCZCParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    def __init__(self):
        self.parser = ZCZCParser()

    def _update(self, provider):
        """Yield the item parsed from every file found in the provider's folder.

        Files are visited in creation order. A file whose modification time
        passes ``is_latest_content`` is parsed, moved as processed and then
        yielded; any other file is moved as processed without being parsed.

        :param provider: dict - ingest provider; the folder is read from ``provider['config']['path']``
        :return: generator yielding a one-item list per ingested file
        :raises ParserError.parseFileError: if anything fails while handling a
            file, after the file has been moved as failed
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            return []

        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if os.path.isfile(filepath):
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)
                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        item = self.parser.parse_file(filepath, provider)

                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
                        yield [item]
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=False)
                raise ParserError.parseFileError('Teletype', filename, ex,
                                                 provider)

    def parse_file(self, filename, provider):
        """Parse a single file from the provider's folder.

        :param filename: str - name of the file inside the provider's folder
        :param provider: dict - ingest provider; the folder is read from ``provider['config']['path']``
        :return: one-item list with the parsed item, or an empty list when no path is configured
        :raises ParserError.parseFileError: if parsing fails; the file is moved
            as failed first when the folder could be resolved
        """
        # Resolved inside the try block; stays None if reading the provider
        # configuration itself fails.
        path = None
        try:
            path = provider.get('config', {}).get('path', None)

            if not path:
                return []

            item = self.parser.parse_file(os.path.join(path, filename),
                                          provider)

            return [item]
        except Exception as ex:
            # Move the file out of the folder it was read from. ``self.path``
            # is only assigned by ``_update`` and may be unset or point to a
            # different provider's folder when this method is called directly.
            if path:
                self.move_file(path,
                               filename,
                               provider=provider,
                               success=False)
            raise ParserError.parseFileError('Teletype', filename, ex,
                                             provider)