Ejemplo n.º 1
0
    def _update(self, provider):
        """Yield the items parsed from the files waiting in the provider's directory.

        The directory is read from ``provider['config']['path']`` and its
        entries are obtained from ``get_sorted_files`` sorted by
        ``FileSortAttributes.created``. Every regular file is moved out of the
        directory once handled: with ``success=True`` when it was parsed or
        skipped because ``is_latest_content`` rejected its modification time,
        and with ``success=False`` when handling it raised.

        :param provider: ingest provider dict
        :return: generator yielding one single-item list per parsed file;
            nothing is yielded when no path is configured
        :raises ParserError.ZCZCParserError: when the parser reported a ZCZC error
        :raises ProviderError.ingestError: for any other unexpected failure
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            return []

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if not os.path.isfile(filepath):
                    # Directories and other non-regular entries are left untouched.
                    continue

                stat = os.lstat(filepath)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                if not self.is_latest_content(last_updated, provider.get('last_updated')):
                    # Too old to ingest, but still cleared out of the directory.
                    self.move_file(self.path, filename, provider=provider, success=True)
                    continue

                # This version hands the service instance itself (not the
                # provider dict) to the parser as its second argument.
                item = self.parser.parse_file(filepath, self)

                self.move_file(self.path, filename, provider=provider, success=True)
                yield [item]
            # NOTE(review): this clause requires ParserError.ZCZCParserError to be an
            # exception class; if it is a factory method, matching it raises
            # TypeError instead - confirm against superdesk.errors.
            except ParserError.ZCZCParserError as ex:
                logger.exception("Ingest Type: Teletype - File: {0} could not be processed".format(filename))
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.ZCZCParserError(ex, provider)
            except ParserError:
                # Other parser errors skip just this file; log them so the
                # rejected file does not disappear without a trace.
                logger.exception("Ingest Type: Teletype - File: {0} could not be parsed".format(filename))
                self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ProviderError.ingestError(ex, provider)
Ejemplo n.º 2
0
    def parse_file(self, filename, provider):
        """Read the ASCII file *filename* and build an item dict from it.

        A line containing ``START_OF_MESSAGE`` assigns the item's ``guid`` and
        opens the header section. Header lines are dispatched on their first
        character: ``header_map`` entries, ``CATEGORY``, ``FORMAT`` and
        ``IPTC``. The first line that matches none of these starts the body,
        which is accumulated in ``body_html`` until a line containing
        ``END_OF_MESSAGE`` is met. Lines outside both sections are ignored.

        :param filename: path of the file to parse
        :param provider: passed through to ``set_item_defaults`` and
            ``post_process_item``
        :return: whatever ``post_process_item`` returns for the built item
        :raises ParserError.ZCZCParserError: wrapping any exception raised
            while reading or parsing
        """
        try:
            item = {}
            self.set_item_defaults(item, provider)

            in_header = False
            in_body = False
            with open(filename, 'r', encoding='ascii') as source:
                # The whole file is decoded before any line is interpreted.
                for line in source.readlines():
                    if not in_header and self.START_OF_MESSAGE in line:
                        item['guid'] = filename + str(uuid.uuid4())
                        in_header = True
                    elif not in_header:
                        if self.END_OF_MESSAGE in line:
                            break
                        if in_body:
                            item['body_html'] = item.get('body_html', '') + line
                    else:
                        marker = line[0]
                        if marker in self.header_map:
                            # A falsy mapping marks a header that is recognised but dropped.
                            field = self.header_map[marker]
                            if field:
                                item[field] = line[1:-1]
                        elif marker == self.CATEGORY:
                            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': line[1]}]
                        elif marker == self.FORMAT:
                            if line[1] == self.TEXT:
                                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                            elif line[1] == self.TABULAR:
                                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
                        elif marker == self.IPTC:
                            iptc_code = line[1:-1]
                            item[self.ITEM_SUBJECT] = [{
                                'qcode': iptc_code,
                                'name': subject_codes[iptc_code],
                            }]
                        else:
                            # Not a header line: this is the first line of the body.
                            in_header = False
                            in_body = True
                            item['body_html'] = line
            return self.post_process_item(item, provider)

        except Exception as ex:
            raise ParserError.ZCZCParserError(exception=ex, provider=provider)
Ejemplo n.º 3
0
    def parse_file(self, filename, provider):
        """Read the ASCII file *filename* and build an item dict from it.

        A line containing ``START_OF_MESSAGE`` assigns the item's ``guid`` and
        opens the header section. Header lines are dispatched on their first
        character: ``header_map`` entries, ``CATEGORY``, ``FORMAT`` and
        ``IPTC``. The first line that matches none of these starts the body,
        which is accumulated in ``body_html`` until a line containing
        ``END_OF_MESSAGE`` is met.

        Lines that are neither header nor body (for example anything before
        the start marker) are ignored instead of failing the whole file.

        :param filename: path of the file to parse
        :param provider: only used when building the raised error
        :return: the item dict
        :raises ParserError.ZCZCParserError: wrapping any exception raised
            while reading or parsing
        """
        try:
            item = {}
            self.set_item_defaults(item)

            with open(filename, 'r', encoding='ascii') as f:
                lines = f.readlines()

            header = False
            body = False
            for line in lines:
                if self.START_OF_MESSAGE in line and not header:
                    item['guid'] = filename + str(uuid.uuid4())
                    header = True
                    continue
                if header:
                    if line[0] in self.header_map:
                        if self.header_map[line[0]]:
                            # Strip only the line terminator, so an unterminated
                            # final line keeps its last character.
                            item[self.header_map[line[0]]] = line[1:].rstrip('\n')
                        continue
                    if line[0] == self.CATEGORY:
                        item['anpa-category'] = {'qcode': line[1]}
                        continue
                    if line[0] == self.FORMAT:
                        if line[1] == self.TEXT:
                            item['type'] = 'text'
                        elif line[1] == self.TABULAR:
                            item['type'] = 'preformatted'
                        continue
                    if line[0] == self.IPTC:
                        iptc_code = line[1:].rstrip('\n')
                        item['subject'] = [{
                            'qcode': iptc_code,
                            'name': subject_codes[iptc_code]
                        }]
                        continue
                    # Not a header line: this is the first line of the body.
                    header = False
                    body = True
                    item['body_html'] = line
                else:
                    if self.END_OF_MESSAGE in line:
                        break
                    # Only append once a body has been started; otherwise
                    # 'body_html' may not exist yet.
                    if body:
                        item['body_html'] = item['body_html'] + line
            return item

        except Exception as ex:
            raise ParserError.ZCZCParserError(ex, provider)
Ejemplo n.º 4
0
class TeletypeIngestService(FileIngestService):
    """File based ingest service that parses its files with ``ZCZCParser``."""

    # Identifier of this service.
    PROVIDER = 'teletype'

    # Descriptions of the errors this service may raise.
    ERRORS = [
        ParserError.ZCZCParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    def __init__(self):
        self.parser = ZCZCParser()

    def _update(self, provider):
        """Yield the items parsed from the files waiting in the provider's directory.

        The directory is read from ``provider['config']['path']``. Every
        regular file found there is moved out of the directory once handled:
        with ``success=True`` when it was parsed or skipped because
        ``is_latest_content`` rejected its modification time, and with
        ``success=False`` when handling it raised.

        :param provider: ingest provider dict
        :return: generator yielding one single-item list per parsed file;
            nothing is yielded when no path is configured
        :raises ParserError.parseFileError: wrapping any failure for a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            return []

        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if not os.path.isfile(filepath):
                    # Directories and other non-regular entries are left untouched.
                    continue

                stat = os.lstat(filepath)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                if not self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                    # Too old to ingest, but still cleared out of the directory.
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=True)
                    continue

                item = self.parser.parse_file(filepath, provider)

                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=True)
                yield [item]
            except Exception as ex:
                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=False)
                raise ParserError.parseFileError('Teletype', filename, ex,
                                                 provider)

    def parse_file(self, filename, provider):
        """Parse a single file from the provider's configured directory.

        :param filename: name of the file, relative to the configured path
        :param provider: ingest provider dict
        :return: a single-item list, or an empty list when the provider has
            no path configured
        :raises ParserError.parseFileError: wrapping any failure; the file is
            moved aside with ``success=False`` first when its directory is known
        """
        path = None
        try:
            path = provider.get('config', {}).get('path', None)

            if not path:
                return []

            item = self.parser.parse_file(os.path.join(path, filename),
                                          provider)

            return [item]
        except Exception as ex:
            # self.path is only assigned by _update(); prefer the directory
            # resolved for this call so the handler cannot fail with an
            # AttributeError that would hide the original error.
            directory = path or getattr(self, 'path', None)
            if directory:
                self.move_file(directory,
                               filename,
                               provider=provider,
                               success=False)
            raise ParserError.parseFileError('Teletype', filename, ex,
                                             provider)
Ejemplo n.º 5
0
# at https://www.sourcefabric.org/superdesk/license*.


import os
import logging
from datetime import datetime
from superdesk.io.file_ingest_service import FileIngestService
from superdesk.utc import utc
from superdesk.io import register_provider
from superdesk.utils import get_sorted_files, FileSortAttributes
from superdesk.errors import ParserError, ProviderError
from superdesk.io.zczc import ZCZCParser

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Name of this provider type.
# NOTE(review): presumably the key passed to register_provider - confirm.
PROVIDER = 'teletype'
# Descriptions of the errors this provider may raise, obtained from
# default-constructed error objects.
errors = [ParserError.ZCZCParserError().get_error_description(),
          ProviderError.ingestError().get_error_description(),
          ParserError.parseFileError().get_error_description()]


class TeletypeIngestService(FileIngestService):

    def __init__(self):
        self.parser = ZCZCParser()

    def _update(self, provider):
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
Ejemplo n.º 6
0
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    NAME = 'file'
    ERRORS = [
        ParserError.ZCZCParserError().get_error_description(),
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    def _update(self, provider):
        """
        Yield the article(s) parsed from each file waiting in the provider's configured directory.

        Every regular file is moved out of the directory once handled: with success=True when it was parsed or
        skipped because ``is_latest_content`` rejected its modification time, with success=False when handling it
        raised. After the last file an 'ingest:update' notification is pushed.

        :param provider: dict - Ingest provider details; ``provider['config']['path']`` is the directory to read
        :return: generator yielding one list of items per parsed file; nothing when no path is configured
        :raises ParserError.parseFileError: wrapping any failure while handling a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; Logger.warning is the supported spelling.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML parsers are re-selected from the parsed document's root element.
                            with open(file_path, 'rt') as f:
                                xml = ElementTree.parse(f)
                                parser = self.get_feed_parser(
                                    provider, xml.getroot())
                                item = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)

                        # A parser may return either one item or a list of items.
                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """
        Sub-classes should override this method if something needs to be done to the given article. For example, if the
        article comes from DPA provider the system needs to derive dateline from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """
        Move the file from the current directory to _PROCESSED if successful, else to _ERROR.
        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """

        processed_dir = os.path.join(file_path, "_PROCESSED/")
        error_dir = os.path.join(file_path, "_ERROR/")

        try:
            # exist_ok avoids the check-then-create race when another worker
            # creates the directory between the test and the makedirs call.
            os.makedirs(processed_dir, exist_ok=True)
            os.makedirs(error_dir, exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        source = os.path.join(file_path, filename)
        try:
            shutil.copy2(source, processed_dir if success else error_dir)
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source is removed even when the copy failed,
            # so a failed move loses the file - confirm this is intended.
            os.remove(source)

    def is_latest_content(self, last_updated, provider_last_updated=None):
        """
        Parse file only if it's not older than provider last update -10m

        :param last_updated: datetime - modification time of the file
        :param provider_last_updated: datetime - last update of the provider; when not given, seven days
            before ``utcnow()`` is used
        :return: bool - True when last_updated is later than provider_last_updated minus ten minutes
        """

        if not provider_last_updated:
            provider_last_updated = utcnow() - timedelta(days=7)

        return provider_last_updated - timedelta(minutes=10) < last_updated