Ejemplo n.º 1
0
    def _update(self, provider):
        """Ingest the files found in the folder configured for the provider.

        Files are visited in the order returned by ``get_sorted_files``. Every
        regular file accepted by ``is_latest_content`` is parsed with
        ``self.parser`` and yielded as a one-element list; the file is moved
        with ``success=True`` before the item is yielded. Files rejected by
        ``is_latest_content`` are moved with ``success=True`` without parsing.

        :param provider: dict - Ingest provider details; the folder is read from ``provider['config']['path']``
        :return: an empty list when no path is configured, otherwise yields lists holding one parsed item
        :raises ParserError.IPTC7901ParserError: when processing a file raised a ``ParserError``
        :raises ProviderError.ingestError: when any other error happened while processing a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            return []

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if os.path.isfile(filepath):
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        item = self.parser.parse_file(filepath)

                        self.move_file(self.path, filename, provider=provider, success=True)
                        yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except ParserError as ex:
                # An ``except`` clause must name an exception class. The previous
                # ``except ParserError.IPTC7901ParserError() as ex`` evaluated to an
                # instance, which makes Python raise TypeError as soon as any
                # exception reaches the handler, so no handler ever ran.
                logger.exception("Ingest Type: DPA - File: {0} could not be processed".format(filename))
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.IPTC7901ParserError(ex, provider)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ProviderError.ingestError(ex, provider)
Ejemplo n.º 2
0
    def parse(self, file_path, provider=None):
        """Parse an IPTC 7901 file into an item dict.

        The first line is matched against the header pattern (source, sequence,
        priority, category, word count). The remaining lines are read until a
        line starting with ETX (``\\x03``): lines before the STX (``\\x02``) line
        build the slugline, the STX line carries the headline, the lines after
        it build ``body_html`` and, once a "not for publication" notice is met,
        the following lines build ``ednote``.

        :param file_path: path of the file to parse
        :param provider: passed to the raised error when parsing fails
        :return: dict with the parsed item
        :raises ParserError.IPTC7901ParserError: on any error while reading or parsing the file
        """
        try:
            notice_phrases = (
                'The following information is not for publication',
                'The following information is not intended for publication',
            )

            item = {ITEM_TYPE: CONTENT_TYPE.TEXT, 'guid': generate_guid(type=GUID_TAG),
                    'versioncreated': utcnow()}

            with open(file_path, 'rb') as f:
                lines = list(f)

            # parse first header line
            header = re.match(b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)', lines[0], flags=re.I)
            if header:
                source, sequence, priority, category, word_count = header.group(1, 2, 3, 4, 5)
                item['original_source'] = source.decode('latin-1', 'replace')
                item['ingest_provider_sequence'] = sequence.decode()
                item['priority'] = self.map_priority(priority.decode())
                item['anpa_category'] = [{'qcode': self.map_category(category.decode())}]
                item['word_count'] = int(word_count.decode())

            in_header, in_text, in_note = True, False, False
            for raw in lines[1:]:
                marker = raw[0:1]
                if marker == b'\x02':
                    # STX starts the body of the story; the rest of the line is the headline
                    item['headline'] = raw[1:].decode('latin-1', 'replace').rstrip('\r\n')
                    item['body_html'] = ''
                    in_text, in_header = True, False
                    continue
                if marker == b'\x03':
                    # ETX denotes the end of the story
                    break

                text = raw.decode('latin-1', 'replace')
                if in_text:
                    if any(phrase in text for phrase in notice_phrases):
                        # the notice line itself is dropped; what follows is the editorial note
                        in_note, in_text = True, False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += text
                if in_note:
                    item['ednote'] += text
                elif in_header:
                    item.setdefault('slugline', '')
                    item['slugline'] += text.rstrip('/\r\n')

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex, provider=provider)
Ejemplo n.º 3
0
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    # Name of this feeding service; also used when building parse error messages.
    NAME = 'file'

    # Descriptions of the errors this service can raise.
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File feed'

    # Configuration fields of the service: only the folder path.
    fields = [
        {
            'id': 'path', 'type': 'text', 'label': 'Server Folder',
            'placeholder': 'path to folder', 'required': True,
            'errors': {3003: 'Path not found on server.', 3004: 'Path should be directory.'}
        }
    ]

    def _test(self, provider):
        """Check that the configured path exists and is a directory.

        :param provider: dict - Ingest provider details; the path is read from ``provider['config']['path']``
        :raises IngestFileError.notExistsError: if the path is not configured or does not exist
        :raises IngestFileError.isNotDirError: if the path exists but is not a directory
        """
        path = provider.get('config', {}).get('path', None)
        # os.path.exists(None) raises TypeError, so an unset path is reported as "not found" too.
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()
        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Parse the files found in the configured folder and yield the resulting items.

        For every regular file accepted by ``is_latest_content`` the matching
        feed parser is used, ``after_extracting`` is called and the item(s) are
        yielded as a list. The value sent back into the generator is treated
        as a failure flag: the file is moved with ``success=not failed``.
        Files rejected by ``is_latest_content`` are moved with ``success=False``.

        :param provider: dict - Ingest provider details
        :param update: not used by this implementation
        :return: an empty list when no path is configured, otherwise yields lists of parsed items
        :raises ParserError.parseFileError: when processing a file fails
        """
        # check if deprecated FILE_INGEST_OLD_CONTENT_MINUTES setting is still used
        if "FILE_INGEST_OLD_CONTENT_MINUTES" in app.config:
            deprecated_cont_min = app.config["FILE_INGEST_OLD_CONTENT_MINUTES"]
            cont_min = app.config[OLD_CONTENT_MINUTES]
            if deprecated_cont_min != cont_min:
                logger.warning(
                    "'FILE_INGEST_OLD_CONTENT_MINUTES' is deprecated, please update settings.py to use {new_name!r}"
                    .format(new_name=OLD_CONTENT_MINUTES))
                # the deprecated value wins so existing deployments keep their setting
                app.config[OLD_CONTENT_MINUTES] = deprecated_cont_min

        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                           .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    last_updated = self.get_last_updated(file_path)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML parsers are selected by, and work on, the parsed root element
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)

                        # the consumer may send back a truthy value to flag the items as failed
                        if isinstance(item, list):
                            failed = yield item
                        else:
                            failed = yield [item]

                        self.move_file(self.path, filename, provider=provider, success=not failed)
                    else:
                        self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                # only files flagged by is_old_content are moved away; others stay in place
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the file from the current directory to _PROCESSED if successful, else to _ERROR.

        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.
        The file is copied to the destination and then removed from the current directory; the removal
        also happens when the copy fails.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """

        try:
            # exist_ok avoids failing when the directory appears between a check and the creation
            os.makedirs(os.path.join(file_path, "_PROCESSED/"), exist_ok=True)
            os.makedirs(os.path.join(file_path, "_ERROR/"), exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        try:
            if success:
                shutil.copy2(os.path.join(file_path, filename), os.path.join(file_path, "_PROCESSED/"))
            else:
                shutil.copy2(os.path.join(file_path, filename), os.path.join(file_path, "_ERROR/"))
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source file is removed even when the copy failed - confirm this is intended
            os.remove(os.path.join(file_path, filename))

    def get_last_updated(self, file_path):
        """Get last updated time for file.

        Using both mtime and ctime timestamps not to miss
        old files being copied around and recent files after
        changes done in place.

        :param file_path: path of the file to inspect
        :return: timezone-aware datetime built from the newer of the two timestamps
        """
        stat = os.lstat(file_path)
        timestamp = max(stat.st_mtime, stat.st_ctime)
        return datetime.fromtimestamp(timestamp, tz=utc)
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    # Name of this feeding service; also used when building parse error messages.
    NAME = 'event_file'

    # Descriptions of the errors this service can raise.
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event file feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    # Configuration fields of the service: only the folder path.
    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Event File Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _update(self, provider, update):
        """Parse the event files found in the configured folder and yield the resulting items.

        Every regular file accepted by ``is_latest_content`` is parsed with the
        parser returned by ``get_feed_parser``: through ``parse_file`` (given the
        open binary file) when the parser offers it, otherwise through ``parse``
        (given the file path). The file is moved with ``success=True`` before the
        item(s) are yielded as a list. Files rejected by ``is_latest_content`` are
        moved with ``success=True`` without parsing.

        :param provider: dict - Ingest provider details
        :param update: not used by this implementation
        :return: an empty list when no path is configured, otherwise yields lists of parsed items
        :raises ParserError.parseFileError: when processing a file fails
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        parser = self.get_feed_parser(provider, file_path)
                        logger.info('Ingesting events with {} parser'.format(
                            parser.__class__.__name__))
                        # A default is required here: getattr() without one raises
                        # AttributeError for parsers lacking parse_file, which made
                        # the parse() fallback unreachable.
                        if getattr(parser, 'parse_file', None):
                            with open(file_path, 'rb') as f:
                                item = parser.parse_file(f, provider)
                        else:
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)

                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                # only files flagged by is_old_content are moved away; others stay in place
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    # Name of this feeding service; also used when building parse error messages.
    NAME = 'file'

    # Descriptions of the errors this service can raise.
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File Feed'

    # Configuration fields of the service: only the folder path.
    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _test(self, provider):
        """Check that the configured path exists and is a directory.

        :param provider: dict - Ingest provider details; the path is read from ``provider['config']['path']``
        :raises IngestFileError.notExistsError: if the path is not configured or does not exist
        :raises IngestFileError.isNotDirError: if the path exists but is not a directory
        """
        path = provider.get('config', {}).get('path', None)
        # os.path.exists(None) raises TypeError, so an unset path is reported as "not found" too.
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()
        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Parse the files found in the configured folder and yield the resulting items.

        Every regular file accepted by ``is_latest_content`` is parsed with the
        matching feed parser, passed to ``after_extracting`` and moved with
        ``success=True`` before the item(s) are yielded as a list. Files rejected
        by ``is_latest_content`` are moved with ``success=True`` without parsing.

        :param provider: dict - Ingest provider details
        :param update: not used by this implementation
        :return: an empty list when no path is configured, otherwise yields lists of parsed items
        :raises ParserError.parseFileError: when processing a file fails
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML parsers are selected by, and work on, the parsed root element
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(
                                    provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)

                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                # only files flagged by is_old_content are moved away; others stay in place
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the file from the current directory to _PROCESSED if successful, else to _ERROR.

        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.
        The file is copied to the destination and then removed from the current directory; the removal
        also happens when the copy fails.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """

        try:
            # exist_ok avoids failing when the directory appears between a check and the creation
            os.makedirs(os.path.join(file_path, "_PROCESSED/"), exist_ok=True)
            os.makedirs(os.path.join(file_path, "_ERROR/"), exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        try:
            if success:
                shutil.copy2(os.path.join(file_path, filename),
                             os.path.join(file_path, "_PROCESSED/"))
            else:
                shutil.copy2(os.path.join(file_path, filename),
                             os.path.join(file_path, "_ERROR/"))
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source file is removed even when the copy failed - confirm this is intended
            os.remove(os.path.join(file_path, filename))

    def is_latest_content(self, last_updated, provider_last_updated=None):
        """
        Parse file only if it's not older than provider last update -10m

        :param last_updated: file last updated datetime
        :param provider_last_updated: provider last update datetime; seven days before now when not given
        :return: True when the file is newer than the provider last update minus ten minutes
        """

        if not provider_last_updated:
            provider_last_updated = utcnow() - timedelta(days=7)

        return provider_last_updated - timedelta(minutes=10) < last_updated

    def is_old_content(self, last_updated):
        """Test if file is old so it wouldn't probably work in is_latest_content next time.

        Such files can be moved to `_ERROR` folder, it wouldn't be ingested anymore.

        :param last_updated: file last updated datetime
        :return: True when the file was last updated more than ten minutes ago
        """
        return last_updated < utcnow() - timedelta(minutes=10)
Ejemplo n.º 6
0
# at https://www.sourcefabric.org/superdesk/license*.


import os
import logging
from datetime import datetime
from superdesk.io.file_ingest_service import FileIngestService
from superdesk.utc import utc
from superdesk.io import register_provider
from superdesk.utils import get_sorted_files, FileSortAttributes
from superdesk.errors import ParserError, ProviderError
from superdesk.io.iptc7901 import Iptc7901FileParser

# Logger named after this module.
logger = logging.getLogger(__name__)

# Provider identifier used by this module.
PROVIDER = 'dpa'

# Error descriptions taken from the parser and provider error factories.
errors = [
    ParserError.IPTC7901ParserError().get_error_description(),
    ProviderError.ingestError().get_error_description(),
]


class DPAIngestService(FileIngestService):
    """File ingest service that holds an ``Iptc7901FileParser`` as its parser."""

    def __init__(self):
        # parser instance kept on the service
        self.parser = Iptc7901FileParser()

    def _update(self, provider):
        """Store the provider and the folder path taken from its configuration.

        :param provider: dict - Ingest provider details; the path is read from ``provider['config']['path']``
        :return: an empty list when no path is configured
        """
        self.provider = provider
        config = provider.get('config', {})
        self.path = config.get('path', None)

        if not self.path:
            logger.info('No path')
            return []
Ejemplo n.º 7
0
    def parse_file(self, filename):
        """Parse 7901 file by given filename.

        The first line is matched against the header pattern (source, sequence,
        priority, category, word count). The remaining lines are read until a
        line starting with ETX (``\\x03``): lines before the STX (``\\x02``) line
        build the slugline, the STX line carries the headline, the lines after
        it build ``body_html`` and, once the "not for publication" notice is met,
        the following lines build ``ednote``.

        Text lines are decoded as UTF-8; a line that is not valid UTF-8 is
        decoded as latin-1 instead of aborting the whole file.

        :param filename: path of the file to parse
        :return: dict with the parsed item
        :raises ParserError.IPTC7901ParserError: on any error while reading or parsing the file
        """
        def decode(raw):
            # UTF-8 first keeps the result unchanged for input that used to work;
            # the latin-1 fallback handles bytes that made strict decoding fail.
            try:
                return raw.decode()
            except UnicodeDecodeError:
                return raw.decode('latin-1', 'replace')

        try:
            item = {'type': 'preformatted'}
            item['guid'] = generate_guid(type=GUID_TAG)
            item['versioncreated'] = utcnow()

            with open(filename, 'rb') as f:
                lines = list(f)
            # parse first header line
            m = re.match(
                b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)',
                lines[0],
                flags=re.I)
            if m:
                item['original_source'] = m.group(1).decode()
                item['ingest_provider_sequence'] = m.group(2).decode()
                item['priority'] = self.map_priority(m.group(3).decode())
                item['anpa-category'] = {
                    'qcode': self.map_category(m.group(4).decode())
                }
                item['word_count'] = int(m.group(5).decode())

            inHeader = True
            inText = False
            inNote = False
            for line in lines[1:]:
                # STX starts the body of the story
                if line[0:1] == b'\x02':
                    # pick the rest of the line off as the headline
                    item['headline'] = decode(line[1:]).rstrip('\r\n')
                    item['body_html'] = ''
                    inText = True
                    inHeader = False
                    continue
                # ETX denotes the end of the story
                if line[0:1] == b'\x03':
                    break

                # decode once per line instead of once per use
                text = decode(line)
                if inText:
                    if 'The following information is not for publication' in text:
                        # the notice line itself is dropped; what follows is the editorial note
                        inNote = True
                        inText = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += text
                if inNote:
                    item['ednote'] += text
                    continue
                if inHeader:
                    if 'slugline' not in item:
                        item['slugline'] = ''
                    item['slugline'] += text.rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(filename, ex)
Ejemplo n.º 8
0
    def parse_content_ats(self, file_path, provider=None):
        """Parse a file in the ATS flavour of the format into an item dict.

        The second line of the file is matched against the header pattern. From
        that line on, a line starting with STX (``\\x02``) opens the header
        section and a line starting with ETX (``\\x03``) ends the parsing. Header
        lines are collected into the headline until ``check_mendwith`` reports
        an end marker; a closing line starting with ``By `` is stored as byline
        instead. Everything after the header goes to ``body_html``.

        :param file_path: path of the file to parse
        :param provider: passed to the raised error when parsing fails
        :return: dict with the parsed item
        :raises ParserError.IPTC7901ParserError: on any error while reading or parsing the file
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                'guid': generate_guid(type=GUID_TAG),
                'versioncreated': utcnow()
            }

            with open(file_path, 'rb') as f:
                lines = list(f)

            # the header sits on the second line of the file
            header = re.match(
                b'\x7f\x01([a-zA-Z]*)([0-9]*) (.) ([A-Z]{1,3}) ([0-9]*) ([a-zA-Z0-9 ]*)',
                lines[1],
                flags=re.I)
            if header:
                source, sequence, priority, category, word_count = header.group(1, 2, 3, 4, 5)
                item['original_source'] = source.decode('latin-1', 'replace')
                item['ingest_provider_sequence'] = sequence.decode()
                item['priority'] = self.map_priority(priority.decode())
                item['anpa_category'] = [{'qcode': self.map_category(category.decode())}]
                item['word_count'] = int(word_count.decode())

            in_header = in_body = False
            item['headline'] = ''
            item['body_html'] = ''
            for raw in lines[1:]:
                text = raw.decode('latin-1', 'replace')
                first_byte = raw[0:1]

                if first_byte == b'\x02':
                    # STX opens the header; the control character itself is dropped
                    text = text[1:]
                    in_header = True
                elif first_byte == b'\x03':
                    # ETX ends the story
                    break

                if in_header:
                    # the header ends on a line finishing with one of the end markers (ex '=\r\n')
                    terminator = self.check_mendwith(
                        text, BelgaIPTC7901FeedParser.txt_type[2])
                    if not terminator:
                        item['headline'] += text
                        continue
                    if text.startswith('By '):
                        item['byline'] = text.replace('By ', '').rstrip(terminator)
                    else:
                        item['headline'] += text.rstrip(terminator)
                    # the body starts right after the header
                    in_header, in_body = False, True
                elif in_body:
                    item['body_html'] += text
            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex,
                                                  provider=provider)
Ejemplo n.º 9
0
    def parse_content_dpa(self, file_path, provider=None):
        """Parse a file in the DPA flavour of the format into an item dict.

        The first line is matched against the header pattern. The two lines
        after it build the slugline and the header section starts on the third.
        Within the header, upper-case lines go to ``anpa_take_key``, lines
        starting with ``(`` or ending with ``)`` go to ``anpa_header`` and the
        other lines build the headline until ``check_mendwith`` reports an end
        marker; a closing line starting with ``By `` is stored as byline
        instead. After the header the lines build ``body_html`` and, once a
        "not for publication" notice is met, the following lines build ``ednote``.

        :param file_path: path of the file to parse
        :param provider: passed to the raised error when parsing fails
        :return: dict with the parsed item
        :raises ParserError.IPTC7901ParserError: on any error while reading or parsing the file
        """
        try:
            note_markers = (
                'The following information is not for publication',
                'The following information is not intended for publication',
            )

            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                'guid': generate_guid(type=GUID_TAG),
                'versioncreated': utcnow()
            }

            with open(file_path, 'rb') as f:
                lines = list(f)

            # parse first header line
            header = re.match(
                b'([a-zA-Z]*)([0-9]*) (.) ([A-Z]{1,3}) ([0-9]*) ([a-zA-Z0-9 ]*)',
                lines[0],
                flags=re.I)
            if header:
                source, sequence, priority, category, word_count = header.group(1, 2, 3, 4, 5)
                item['original_source'] = source.decode('latin-1', 'replace')
                item['ingest_provider_sequence'] = sequence.decode()
                item['priority'] = self.map_priority(priority.decode())
                item['anpa_category'] = [{'qcode': self.map_category(category.decode())}]
                item['word_count'] = int(word_count.decode())

            in_header = in_body = in_note = False
            item['headline'] = ''
            item['body_html'] = ''
            for line_no, raw in enumerate(lines[1:], start=1):
                text = raw.decode('latin-1', 'replace')

                # slugline is before the header
                if line_no < 3:
                    item.setdefault('slugline', '')
                    item['slugline'] += text.rstrip('/\r\n')
                    continue

                # the header starts on line number 3
                if line_no == 3:
                    in_header = True

                if in_header:
                    if text.isupper():
                        take_key = text.rstrip('\n')
                        if 'anpa_take_key' in item:
                            item['anpa_take_key'] = item['anpa_take_key'] + " " + take_key
                        else:
                            item['anpa_take_key'] = take_key
                        continue
                    if text.startswith('(') or text.endswith(')'):
                        if 'anpa_header' in item:
                            item['anpa_header'] = item['anpa_header'] + " " + text
                        else:
                            item['anpa_header'] = text
                        continue
                    # the header ends on a line finishing with one of the end markers (ex '=\r\n')
                    terminator = self.check_mendwith(
                        text, BelgaIPTC7901FeedParser.txt_type[2])
                    if terminator:
                        if text.startswith('By '):
                            item['byline'] = text.replace('By ', '').rstrip(terminator)
                        else:
                            item['headline'] += text.rstrip(terminator)
                        # the body starts right after the header
                        in_header, in_body = False, True
                    else:
                        item['headline'] += text
                    continue

                if in_body:
                    if any(marker in text for marker in note_markers):
                        # the notice line itself is dropped; what follows is the editorial note
                        in_note, in_body = True, False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += text
                if in_note:
                    item['ednote'] += text
            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex,
                                                  provider=provider)
Ejemplo n.º 10
0
    def parse_content_dpa(self, file_path, provider=None):
        """Parse a DPA wire file into an ingest item dictionary.

        The first line of the file is matched against the wire header pattern;
        when it matches, the source, sequence number, priority, category,
        product/source/distribution subjects and word count are filled in.
        The remaining lines are consumed in order:

        * the first two lines are appended to the slugline,
        * from the third line on, header lines are read (all-uppercase lines go
          to ``anpa_take_key``, lines starting with ``(`` go to ``anpa_header``,
          everything else to the headline) until a line carrying one of the
          configured end markers is found; that line becomes the byline when it
          starts with ``By ``, otherwise it completes the headline,
        * subsequent lines are appended to ``body_html`` until a
          "not for publication" notice is met, after which lines are appended
          to ``ednote``.

        :param file_path: path of the file to parse; it is opened in binary mode.
        :param provider: ingest provider, only passed on to the raised error.
        :return: dict with the parsed item fields.
        :raises ParserError.IPTC7901ParserError: wraps any exception raised
            while reading or parsing the file.
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                'guid': generate_guid(type=GUID_TAG),
                'versioncreated': utcnow(),
            }

            with open(file_path, 'rb') as f:
                lines = list(f)

            # parse first header line
            m = re.match(
                b'([a-zA-Z]*)([0-9]*) (.) ([A-Z]{1,3}) ([0-9]*) ([a-zA-Z0-9 ]*)',
                lines[0],
                flags=re.I)
            if m:
                qcode = m.group(4).decode().upper()
                item['original_source'] = m.group(1).decode(
                    'latin-1', 'replace')
                item['ingest_provider_sequence'] = m.group(2).decode()
                item['priority'] = self.map_priority(m.group(3).decode())
                item['anpa_category'] = [{'qcode': self.map_category(qcode)}]
                # mapping product; unknown categories fall back to NEWS/GENERAL
                product = self.MAPPING_PRODUCTS['dpa'].get(qcode, 'NEWS/GENERAL')
                item['subject'] = [
                    {
                        'name': product,
                        'qcode': product,
                        'parent': 'NEWS',
                        'scheme': 'services-products',
                    },
                    # source is DPA
                    {"name": 'DPA', "qcode": 'DPA', "scheme": "sources"},
                    # Distribution is default
                    {"name": 'default', "qcode": 'default', "scheme": "distribution"},
                ]
                item['word_count'] = int(m.group(5).decode())

            note_markers = (
                'The following information is not for publication',
                'The following information is not intended for publication',
            )
            in_header = False
            in_body = False
            in_note = False
            item['headline'] = ''
            item['body_html'] = ''
            # start check each line for get information
            for line_count, raw_line in enumerate(lines[1:], start=1):
                line = raw_line.decode('latin-1', 'replace')
                # slugline is before the header
                if line_count < 3:
                    item['slugline'] = item.get('slugline', '') + line.rstrip('/\r\n')
                    continue
                # dpa start header when line number is 3
                if line_count == 3:
                    in_header = True
                if in_header:
                    if line.isupper():
                        take_key = line.rstrip('\n')
                        if 'anpa_take_key' in item:
                            item['anpa_take_key'] += " " + take_key
                        else:
                            item['anpa_take_key'] = take_key
                        continue
                    # NOTE(review): lines read from the file normally keep their
                    # line terminator, so the endswith(')') test presumably only
                    # matches an unterminated last line - confirm the intent
                    # before changing it.
                    if line.startswith('(') or line.endswith(')'):
                        if 'anpa_header' in item:
                            item['anpa_header'] += " " + line
                        else:
                            item['anpa_header'] = line
                        continue
                    # dpa end header when line end with especially characters (ex '=\r\n')
                    end_string = self.check_mendwith(
                        line, self.types[BelgaIPTC7901FeedParser.txt_type][1])
                    if end_string:
                        if line.startswith('By '):
                            # Drop only the leading "By " prefix; an unbounded
                            # replace would also eat any later "By " inside the
                            # byline text.
                            item['byline'] = line.replace(
                                'By ', '', 1).rstrip(end_string)
                        else:
                            item['headline'] += line.rstrip(end_string)
                        in_header = False
                        # set flag in_body when header is end
                        in_body = True
                    else:
                        item['headline'] += line
                    continue
                # dpa start body when the header is end
                if in_body:
                    if any(marker in line for marker in note_markers):
                        in_note = True
                        in_body = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += line
                elif in_note:
                    item['ednote'] += line
            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex,
                                                  provider=provider) from ex
Ejemplo n.º 11
0
    def parse_content_ats(self, file_path, provider=None):
        """Parse an ATS wire file into an ingest item dictionary.

        The item is created with language ``fr``. The second line of the file
        is matched against the wire header pattern; on a match the source,
        sequence number, priority, category, subjects (product, ATS source,
        default distribution) and word count are set. Everything from the
        second line on is then joined and decoded as latin-1: the text up to
        the last ``=`` on a line becomes the headline, and the text following
        ``=`` plus at least two whitespace characters is split once on a run of
        three or more whitespace characters into abstract and body. When the
        split yields a single piece it is used as the body alone. The first
        non-whitespace run of the abstract is stored as ``extra.city``.

        :param file_path: path of the file to parse; it is opened in binary mode.
        :param provider: ingest provider, only passed on to the raised error.
        :return: dict with the parsed item fields.
        :raises ParserError.IPTC7901ParserError: wraps any exception raised
            while reading or parsing the file.
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
            item['guid'] = generate_guid(type=GUID_TAG)
            item['versioncreated'] = utcnow()
            item['language'] = 'fr'

            with open(file_path, 'rb') as wire_file:
                raw_lines = wire_file.readlines()

            # the wire header sits on the second line of the file
            header_match = re.match(
                b'\x7f\x01([a-zA-Z]*)([0-9]*) (.) ([A-Z]{1,3}) ([0-9]*) ([a-zA-Z0-9 ]*)',
                raw_lines[1],
                flags=re.I)
            if header_match is not None:
                source, sequence, priority, category, word_count = \
                    header_match.groups()[:5]
                category_code = category.decode().upper()
                item['original_source'] = source.decode('latin-1', 'replace')
                item['ingest_provider_sequence'] = sequence.decode()
                item['priority'] = self.map_priority(priority.decode())
                item['anpa_category'] = [
                    {'qcode': self.map_category(category_code)}
                ]
                # unknown categories fall back to the generic news product
                product = self.MAPPING_PRODUCTS['ats'].get(
                    category_code, 'NEWS/GENERAL')
                item['subject'] = [
                    {
                        'name': product,
                        'qcode': product,
                        'parent': 'NEWS',
                        'scheme': 'services-products',
                    },
                    {'name': 'ATS', 'qcode': 'ATS', 'scheme': 'sources'},
                    {
                        'name': 'default',
                        'qcode': 'default',
                        'scheme': 'distribution',
                    },
                ]
                item['word_count'] = int(word_count.decode())

            content = b'\n'.join(raw_lines[1:]).decode('latin-1', 'replace')

            headline_match = re.search(r'.*=', content)
            if headline_match:
                item['headline'] = headline_match.group(0).strip()
            else:
                item['headline'] = ''

            body_match = re.search(r'(?s)=\s{2,}(.*)', content)
            if not body_match:
                return item

            parts = re.split(r'(?s)\s{3,}', body_match.group(1), maxsplit=1)
            if len(parts) != 2:
                # no abstract/body separator found: everything is body
                item['body_html'] = parts[0]
                return item

            item['abstract'], item['body_html'] = parts
            city_match = re.search(r'^(\S*)', item['abstract'])
            if city_match:
                extra = item.setdefault('extra', {})
                extra['city'] = city_match.group(0).strip()
            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex,
                                                  provider=provider)
Ejemplo n.º 12
0
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which reads event files from the configured local file system path.

    Files are parsed with the feed parser registered for the provider: XML for
    ``NTBEventXMLFeedParser``, iCalendar for ``IcsTwoFeedParser``, and the raw
    file path is handed to any other parser.
    """

    NAME = 'event_file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event File Feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    def _update(self, provider, update):
        """Yield the items parsed from each file found under the provider path.

        This is a generator. For every regular file returned by
        ``get_sorted_files`` (sorted by creation) the modification time is
        checked with ``is_latest_content``; files that pass are parsed, handed
        to ``after_extracting``, moved with ``success=True`` and yielded as a
        list of items. Files that do not pass are moved with ``success=True``
        without being parsed. After the last file an ``ingest:update``
        notification is pushed.

        :param provider: ingest provider dict; ``config.path``, ``name`` and
            ``last_updated`` are read from it.
        :param update: not used by this implementation.
        :return: generator yielding lists of parsed items; nothing is yielded
            when no path is configured.
        :raises ParserError.parseFileError: when handling a file fails. The
            file is moved with ``success=False`` first, but only if its
            modification time is known and ``is_old_content`` reports it as old.
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            # Reset per file so the error handler never sees the timestamp of
            # a previously processed file.
            last_updated = None
            try:
                file_path = os.path.join(self.path, filename)
                if not os.path.isfile(file_path):
                    continue

                stat = os.lstat(file_path)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                if not self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                    # not the latest content: move it without parsing
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=True)
                    continue

                if isinstance(registered_parser, NTBEventXMLFeedParser):
                    logger.info('Ingesting xml events')
                    with open(file_path, 'rb') as f:
                        xml = ElementTree.parse(f)
                        parser = self.get_feed_parser(provider, xml.getroot())
                        item = parser.parse(xml.getroot(), provider)
                elif isinstance(registered_parser, IcsTwoFeedParser):
                    logger.info('Ingesting ics events')
                    with open(file_path, 'rb') as f:
                        cal = Calendar.from_ical(f.read())
                        parser = self.get_feed_parser(provider, cal)
                        item = parser.parse(cal, provider)
                else:
                    logger.info('Ingesting events with unknown parser')
                    parser = self.get_feed_parser(provider, file_path)
                    item = parser.parse(file_path, provider)

                self.after_extracting(item, provider)
                self.move_file(self.path,
                               filename,
                               provider=provider,
                               success=True)

                # consumers always receive a list of items
                yield item if isinstance(item, list) else [item]
            except Exception as ex:
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider) from ex

        push_notification('ingest:update')
Ejemplo n.º 13
0
    def parse(self, file_path, provider=None):
        """Parse an IPTC 7901 wire file into an ingest item dictionary.

        The first line is matched against the wire header pattern (introduced
        by an SOH byte); on a match the source, sequence number, priority,
        category and word count are set. The following lines are handled as:

        * lines before the STX byte are appended to the slugline,
        * the line starting with STX provides the headline and opens the body,
        * body lines are appended to ``body_html`` until a
          "not for publication" notice is found, after which lines are
          appended to ``ednote``,
        * a line starting with ETX ends parsing.

        :param file_path: path of the file to parse; it is opened in binary mode.
        :param provider: ingest provider, only passed on to the raised error.
        :return: dict with the parsed item fields.
        :raises ParserError.IPTC7901ParserError: wraps any exception raised
            while reading or parsing the file.
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
            item["guid"] = generate_guid(type=GUID_TAG)
            item["versioncreated"] = utcnow()

            with open(file_path, "rb") as wire_file:
                raw_lines = wire_file.readlines()

            # parse first header line
            header_match = re.match(
                b"\x01([a-zA-Z]*)([0-9]*) (.) ([A-Z]{1,3}) ([0-9]*) ([a-zA-Z0-9 ]*)",
                raw_lines[0],
                flags=re.I)
            if header_match:
                source, sequence, priority, category, word_count = \
                    header_match.groups()[:5]
                item["original_source"] = source.decode("latin-1", "replace")
                item["ingest_provider_sequence"] = sequence.decode()
                item["priority"] = self.map_priority(priority.decode())
                item["anpa_category"] = [
                    {"qcode": self.map_category(category.decode())}
                ]
                item["word_count"] = int(word_count.decode())

            note_markers = (
                "The following information is not for publication",
                "The following information is not intended for publication",
            )
            # The flags are kept independent on purpose: an STX line seen after
            # the note has started leaves both body and note collection active.
            reading_slug, reading_body, reading_note = True, False, False
            for raw in raw_lines[1:]:
                lead_byte = raw[0:1]
                # STX starts the body of the story
                if lead_byte == b"\x02":
                    # pick the rest of the line off as the headline
                    headline = raw[1:].decode("latin-1", "replace")
                    item["headline"] = headline.rstrip("\r\n")
                    item["body_html"] = ""
                    reading_body = True
                    reading_slug = False
                    continue
                # ETX denotes the end of the story
                if lead_byte == b"\x03":
                    break

                text = raw.decode("latin-1", "replace")
                if reading_body:
                    if any(marker in text for marker in note_markers):
                        reading_note = True
                        reading_body = False
                        item["ednote"] = ""
                        continue
                    item["body_html"] += text
                if reading_note:
                    item["ednote"] += text
                    continue
                if reading_slug:
                    item["slugline"] = item.get("slugline", "") + text.rstrip("/\r\n")

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex,
                                                  provider=provider)