Beispiel #1
0
    def __new__(metacls, name, bases, attrs):
        """Create the class and register it when it declares a ``PROVIDER``.

        Classes whose ``attrs`` carry no ``PROVIDER`` (or ``PROVIDER = None``)
        are created without any validation or registration.

        :param str name: name of the class being created
        :param tuple bases: base classes of the class being created
        :param dict attrs: class namespace
        :raises AttributeError: ``PROVIDER`` is set but ``ERRORS`` is not
            defined directly in ``attrs``
        :raises TypeError: ``PROVIDER`` is already a key of ``providers``
        """
        provider_name = attrs.get('PROVIDER')
        is_provider = provider_name is not None

        if is_provider and 'ERRORS' not in attrs:
            message = ("Provider class {} must define "
                       "the ERRORS list attribute.")
            raise AttributeError(message.format(name))

        if is_provider and provider_name in providers:
            message = "PROVIDER {} already exists ({})."
            raise TypeError(
                message.format(provider_name, providers[provider_name]))

        new_cls = super().__new__(metacls, name, bases, attrs)

        # Registration happens only after the class object exists.
        if is_provider:
            register_provider(provider_name, new_cls, new_cls.ERRORS)

        return new_cls
    def __new__(metacls, name, bases, attrs):
        """Build a new class; validate and register it if it names a provider.

        A class is treated as a provider when its namespace contains a
        non-``None`` ``PROVIDER`` entry. Such a class must also define
        ``ERRORS`` in its own namespace, and its ``PROVIDER`` value must not
        already be present in ``providers``.

        :raises AttributeError: provider class without ``ERRORS`` in ``attrs``
        :raises TypeError: ``PROVIDER`` value already present in ``providers``
        :return: the newly created class
        """
        provider_name = attrs.get('PROVIDER')

        # Non-provider classes are created as-is.
        if provider_name is None:
            return super().__new__(metacls, name, bases, attrs)

        if 'ERRORS' not in attrs:
            text = "Provider class {} must define " \
                   "the ERRORS list attribute."
            raise AttributeError(text.format(name))

        if provider_name in providers:
            existing = providers[provider_name]
            raise TypeError(
                "PROVIDER {} already exists ({}).".format(
                    provider_name, existing))

        new_cls = super().__new__(metacls, name, bases, attrs)
        register_provider(provider_name, new_cls, new_cls.ERRORS)
        return new_cls
Beispiel #3
0
        except Exception as error:
            traceback.print_exc()
            raise error

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        try:
            # workaround for httmock lib
            # return etree.fromstring(response.text.encode('utf-8'))
            return etree.fromstring(response.content)
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise error

    def get_url(self, endpoint):
        """Return ``self.URL`` and ``endpoint`` joined by a single slash."""
        url_parts = (self.URL, endpoint)
        return '/'.join(url_parts)

    def format_date(self, date):
        """Return ``date`` rendered with the ``self.DATE_FORMAT`` pattern."""
        render = date.strftime
        return render(self.DATE_FORMAT)

    def prepare_href(self, href):
        """Return ``href`` reduced to scheme/host/path plus an ``auth_token``.

        Any params, query string and fragment of ``href`` are dropped; the
        token comes from ``self.get_token()``.
        """
        parsed = urlparse(href)
        bare_href = urlunparse(
            (parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
        token = self.get_token()
        return '%s?auth_token=%s' % (bare_href, token)


# Module-level side effect: registers a ReutersIngestService instance under PROVIDER.
register_provider(PROVIDER, ReutersIngestService())
from superdesk.errors import SuperdeskApiError, ProviderError
from superdesk.io import register_provider
from .tests import setup_providers, teardown_providers
from superdesk.io.ingest_service import IngestService
from superdesk.io.commands.remove_expired_content import get_expired_items, RemoveExpiredContent
from superdesk.celery_task_utils import mark_task_as_not_running, is_task_running
from test_factory import SuperdeskTestCase


class TestProviderService(IngestService):
    """Ingest service stub for tests: its ``update`` never returns items."""

    def update(self, provider):
        """Ignore ``provider`` and return a new empty list."""
        return list()


# Register the stub service as provider 'test'; the third argument is a one-element
# list holding the description of ProviderError.anpaError(None, None).
register_provider('test', TestProviderService(), [ProviderError.anpaError(None, None).get_error_description()])


class CeleryTaskRaceTest(SuperdeskTestCase):
    """Exercises ``mark_task_as_not_running`` / ``is_task_running`` for one provider."""

    def test_the_second_update_fails_if_already_running(self):
        """The second ``is_task_running`` call for the same provider must be truthy."""
        provider = {'_id': 'abc', 'name': 'test provider', 'update_schedule': {'minutes': 1}}
        name, provider_id = provider['name'], provider['_id']

        # The test expects un-marking to report a falsy result at this point.
        self.assertFalse(mark_task_as_not_running(name, provider_id))

        first_check = is_task_running(name, provider_id, {'minutes': 1})
        self.assertFalse(first_check, 'Failed to mark ingest update as running')

        second_check = is_task_running(name, provider_id, {'minutes': 1})
        self.assertTrue(second_check, 'Ingest update marked as running, possible race condition')
Beispiel #5
0
from nose.tools import assert_raises
from superdesk import get_resource_service
from superdesk.utc import utcnow
from superdesk.tests import setup
from superdesk.errors import SuperdeskApiError
from superdesk.io import register_provider
from superdesk.io.tests import setup_providers, teardown_providers
from superdesk.io.ingest_service import IngestService


class TestProviderService(IngestService):
    """Minimal ingest service used by the tests; it yields no content."""

    def update(self, provider):
        """Return an empty list regardless of ``provider``."""
        no_items = []
        return no_items


# Module-level side effect: registers the stub service under the name 'test'.
register_provider('test', TestProviderService())


class UpdateIngestTest(TestCase):
    def setUp(self):
        """Run ``setup`` with this test case as context, then ``setup_providers``."""
        setup(context=self)
        setup_providers(self)

    def tearDown(self):
        """Undo the provider setup by calling ``teardown_providers`` on this test case."""
        teardown_providers(self)

    def _get_provider(self, provider_name):
        """Look up one document of the ``ingest_providers`` resource by its name."""
        providers_service = get_resource_service('ingest_providers')
        return providers_service.find_one(name=provider_name, req=None)

    def _get_provider_service(self, provider):
Beispiel #6
0
        """Create a new content package from given content items.

        The package's `main` group contains only the references to given items,
        not the items themselves. In the list of references, the reference to
        the text item precedes the references to image items.

        :param dict text_item: item representing the text content
        :param list image_items: list of items (dicts) representing the images
            related to the text content
        :return: the created content package
        :rtype: dict
        """
        package = {
            "type": "composite",
            "groups": [
                {"id": "root", "role": "grpRole:NEP", "refs": [{"idRef": "main"}]},
                {"id": "main", "role": "main", "refs": []},
            ],
        }

        item_references = package["groups"][1]["refs"]
        item_references.append({"residRef": text_item["guid"]})

        for image in image_items:
            item_references.append({"residRef": image["guid"]})

        return package


# Module-level side effect: registers an RssIngestService instance under PROVIDER,
# passing the module's `errors` value as third argument.
register_provider(PROVIDER, RssIngestService(), errors)
Beispiel #7
0
    def _update(self, provider):
        """Yield parsed items for the files found in the provider's path.

        This is a generator. ``provider['config']['path']`` is stored on
        ``self.path``; when it is missing or empty the generator finishes
        immediately without yielding and without pushing a notification.

        For every name returned by ``get_sorted_files`` that is a regular
        file, the file's mtime is compared with ``provider['last_updated']``
        through ``self.is_latest_content``. Accepted files are parsed,
        timestamped, moved with ``success=True`` and yielded; rejected files
        are only moved with ``success=True``. Any exception while handling a
        file is logged and the file is moved with ``success=False``.

        ``push_notification('ingest:update')`` runs once the loop is
        exhausted, i.e. only when the caller consumes the whole generator.

        :param dict provider: ingest provider; ``config.path`` and
            ``last_updated`` are read from it
        :return: generator of one-element lists ``[item]``
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)
        if not self.path:
            return

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if os.path.isfile(filepath):
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        # Read the content and close the handle right away, so
                        # the file is no longer open when it is moved and no
                        # handle stays open while the generator is suspended.
                        with open(filepath, 'r') as f:
                            raw_xml = f.read()

                        item = self.parser.parse_message(etree.fromstring(raw_xml))

                        self.add_timestamps(item)
                        self.move_file(self.path, filename, success=True)
                        yield [item]
                    else:
                        self.move_file(self.path, filename, success=True)
            except Exception as err:
                logger.exception(err)
                self.move_file(self.path, filename, success=False)

        push_notification('ingest:update')


# Module-level side effect: registers an AFPIngestService instance under PROVIDER.
register_provider(PROVIDER, AFPIngestService())
Beispiel #8
0
# -*- coding: utf-8; -*-
#
# This file is part of Superdesk.
#
# Copyright 2013, 2014 Sourcefabric z.u. and contributors.
#
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

import apps.io.aap  # NOQA
import apps.io.afp  # NOQA
import apps.io.dpa  # NOQA
import apps.io.reuters  # NOQA

from superdesk.io import register_provider


# Register the name 'search' with no service instance (None) and an empty list
# as third argument.
register_provider('search', None, [])
Beispiel #9
0
                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if last_updated:
                        item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = os.path.join(config['dest_path'], filename)

                    try:
                        with open(dest, 'xb') as f:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                    except FileExistsError:
                        continue

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(Exception('Parser not found'),
                                                                   provider, filename)
                    items.append(parser.parse_message(xml, provider))
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)

# Module-level side effect: registers an FTPService instance under the name 'ftp'.
register_provider('ftp', FTPService())
Beispiel #10
0
            except imaplib.IMAP4.error:
                raise IngestEmailError.emailLoginError(imaplib.IMAP4.error,
                                                       provider)

            rv, data = imap.select(config.get('mailbox', None), readonly=False)
            if rv == 'OK':
                rv, data = imap.search(None, config.get('filter', None))
                if rv == 'OK':
                    new_items = []
                    for num in data[0].split():
                        rv, data = imap.fetch(num, '(RFC822)')
                        if rv == 'OK':
                            try:
                                new_items.append(
                                    self.parser.parse_email(data, provider))
                            except IngestEmailError:
                                continue
                imap.close()
            imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)
        return new_items

    def prepare_href(self, href):
        """Return whatever ``url_for_media`` produces for ``href``."""
        media_url = url_for_media(href)
        return media_url


# Module-level side effect: registers an EmailReaderService instance under PROVIDER,
# passing the module's `errors` value as third argument.
register_provider(PROVIDER, EmailReaderService(), errors)
from superdesk.utc import utcnow
from superdesk.tests import setup
from superdesk.errors import SuperdeskApiError
from superdesk.io import register_provider
from superdesk.io.tests import setup_providers, teardown_providers
from superdesk.io.ingest_service import IngestService
from superdesk.io.commands.update_ingest import is_scheduled, update_provider, filter_expired_items, apply_rule_set


class TestProviderService(IngestService):
    """Stub ingest service for the update-ingest tests."""

    def update(self, provider):
        """Return a fresh empty list; ``provider`` is not used."""
        return list()


# Module-level side effect: registers the stub service under the name 'test'.
register_provider('test', TestProviderService())


class UpdateIngestTest(TestCase):
    """Test case with helpers for looking up ingest providers and their services."""

    def setUp(self):
        """Run ``setup`` with this test case as context, then ``setup_providers``."""
        setup(context=self)
        setup_providers(self)

    def tearDown(self):
        """Call ``teardown_providers`` for this test case."""
        teardown_providers(self)

    def _get_provider(self, provider_name):
        """Find one ``ingest_providers`` document by its ``name``."""
        providers_service = get_resource_service('ingest_providers')
        return providers_service.find_one(name=provider_name, req=None)

    def _get_provider_service(self, provider):
        """Return the entry of ``self.provider_services`` keyed by the provider's ``type``."""
        provider_type = provider.get('type')
        return self.provider_services[provider_type]
Beispiel #12
0
                            self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = os.path.join(config['dest_path'], filename)

                    try:
                        with open(dest, 'xb') as f:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                    except FileExistsError:
                        continue

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(
                            Exception('Parser not found'), provider, filename)
                    parsed = parser.parse_message(xml, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]

                    items.append(parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)


# Module-level side effect: registers an FTPService instance under the name 'ftp',
# passing the module's `errors` value as third argument.
register_provider('ftp', FTPService(), errors)
Beispiel #13
0
        :param dict data: parsed data of a single feed entry
        :param field_aliases: (optional) field name aliases. Used for content
             fields that are named differently in retrieved data.
        :type field_aliases: dict or None

        :return: created content item
        :rtype: dict
        """
        if field_aliases is None:
            field_aliases = {}
        else:
            field_aliases = merge_dicts(field_aliases)

        item = dict(type='text')

        for field in self.item_fields:
            data_field_name = field_aliases.get(
                field.name_in_data, field.name_in_data
            )
            field_value = data.get(data_field_name)

            if (field.type is datetime) and field_value:
                field_value = utcfromtimestamp(timegm(field_value))

            item[field.name] = field_value

        return item


# Module-level side effect: registers an RssIngestService instance under PROVIDER,
# passing the module's `errors` value as third argument.
register_provider(PROVIDER, RssIngestService(), errors)
Beispiel #14
0
                    if last_updated:
                        item_last_updated = datetime.strptime(
                            facts['modify'],
                            self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = '%s/%s' % (config['dest_path'], filename)

                    try:
                        with open(dest, 'xb') as f:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                    except FileExistsError:
                        continue

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(
                            Exception('Parser not found'), provider, filename)
                    items.append(parser.parse_message(xml, provider))
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)


# Module-level side effect: registers an FTPService instance under the name 'ftp'.
register_provider('ftp', FTPService())
from superdesk.tests import setup
from superdesk.errors import SuperdeskApiError, ProviderError
from superdesk.io import register_provider
from superdesk.io.tests import setup_providers, teardown_providers
from superdesk.io.ingest_service import IngestService
from superdesk.io.commands.remove_expired_content import get_expired_items
from superdesk.celery_task_utils import mark_task_as_not_running, is_task_running


class TestProviderService(IngestService):
    """Stub ingest service registered for the tests in this module."""

    def update(self, provider):
        """Return an empty list without looking at ``provider``."""
        no_items = []
        return no_items


# Register the stub service as provider 'test'; the third argument is a one-element
# list holding the description of ProviderError.anpaError(None, None).
register_provider(
    'test', TestProviderService(),
    [ProviderError.anpaError(None, None).get_error_description()])


class CeleryTaskRaceTest(TestCase):
    def setUp(self):
        """Run ``setup`` with this test case passed as ``context``."""
        setup(context=self)

    def test_the_second_update_fails_if_already_running(self):
        provider = {
            '_id': 'abc',
            'name': 'test provider',
            'update_schedule': {
                'minutes': 1
            }
        }
Beispiel #16
0
        for filename in os.listdir(self.path):
            try:
                if os.path.isfile(os.path.join(self.path, filename)):
                    filepath = os.path.join(self.path, filename)
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)
                    if self.is_latest_content(last_updated,
                                              provider.get('updated')):
                        with open(os.path.join(self.path, filename), 'r') as f:
                            item = self.parser.parse_message(
                                etree.fromstring(f.read()))

                            item['firstcreated'] \
                                = normalize_date(item.get('firstcreated'), self.tz)
                            item['versioncreated'] \
                                = normalize_date(item.get('versioncreated'), self.tz)

                            self.move_file(self.path, filename, success=True)
                            yield [item]
                    else:
                        self.move_file(self.path, filename, success=True)
            except Exception as err:
                logger.exception(err)
                self.move_file(self.path, filename, success=False)

        push_notification('ingest:update')


# Module-level side effect: registers an AAPIngestService instance under PROVIDER.
register_provider(PROVIDER, AAPIngestService())
Beispiel #17
0
    def parse_file(self, filename, provider):
        """Parse one file located in the provider's configured path.

        :param filename: name of the file inside ``provider['config']['path']``
        :param provider: provider dict; only ``config.path`` is read here
        :return: ``[item]`` with the result of ``self.parser.parse_file``, or
            ``[]`` when the provider has no (or an empty) path configured
        :raises: the error built by ``ParserError.parseFileError`` for any
            exception; before raising, the file is passed to
            ``self.move_file`` with ``success=False``
        """
        try:
            config = provider.get('config', {})
            path = config.get('path', None)

            if path:
                parsed = self.parser.parse_file(os.path.join(path, filename))
                return [parsed]

            return []
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.parseFileError('Teletype', filename, ex, provider)

    def parse_file(self, filename, provider):
        """Parse one file located in the provider's configured path.

        :param filename: name of the file inside ``provider['config']['path']``
        :param provider: provider dict; ``config.path`` is read and the whole
            dict is forwarded to ``self.parser.parse_file``
        :return: ``[item]`` with the parser result, or ``[]`` when the
            provider has no (or an empty) path configured
        :raises: the error built by ``ParserError.parseFileError`` for any
            exception raised while looking up the path or parsing
        """
        try:
            base_path = provider.get('config', {}).get('path', None)

            if base_path:
                full_path = os.path.join(base_path, filename)
                return [self.parser.parse_file(full_path, provider)]

            return []
        except Exception as ex:
            raise ParserError.parseFileError('Teletype', filename, ex, provider)


# Module-level side effect: registers a TeletypeIngestService instance under PROVIDER,
# passing the module's `errors` value as third argument.
register_provider(PROVIDER, TeletypeIngestService(), errors)
Beispiel #18
0
        except Exception as error:
            traceback.print_exc()
            raise error

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        try:
            # workaround for httmock lib
            # return etree.fromstring(response.text.encode('utf-8'))
            return etree.fromstring(response.content)
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise error

    def get_url(self, endpoint):
        """Return the API url for ``endpoint``: ``self.URL`` + ``/`` + ``endpoint``."""
        base_url = self.URL
        return '/'.join((base_url, endpoint))

    def format_date(self, date):
        """Format ``date`` with ``self.DATE_FORMAT`` for use in API requests."""
        to_string = date.strftime
        return to_string(self.DATE_FORMAT)

    def prepare_href(self, href):
        """Strip params, query and fragment from ``href`` and append the auth token.

        The token is obtained from ``self.get_token()`` and added as the only
        query parameter, ``auth_token``.
        """
        scheme, netloc, path = urlparse(href)[:3]
        stripped = urlunparse((scheme, netloc, path, '', '', ''))
        return '%s?auth_token=%s' % (stripped, self.get_token())


# Module-level side effect: registers a ReutersIngestService instance under PROVIDER.
register_provider(PROVIDER, ReutersIngestService())
Beispiel #19
0
            imap = imaplib.IMAP4_SSL(host=server, port=port)
            try:
                imap.login(config.get('user', None), config.get('password', None))
            except imaplib.IMAP4.error:
                raise IngestEmailError.emailLoginError(imaplib.IMAP4.error, provider)

            rv, data = imap.select(config.get('mailbox', None), readonly=False)
            if rv == 'OK':
                rv, data = imap.search(None, config.get('filter', None))
                if rv == 'OK':
                    new_items = []
                    for num in data[0].split():
                        rv, data = imap.fetch(num, '(RFC822)')
                        if rv == 'OK':
                            try:
                                new_items.append(self.parser.parse_email(data, provider))
                            except IngestEmailError:
                                continue
                imap.close()
            imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)
        return new_items

    def prepare_href(self, href):
        """Map ``href`` through ``url_for_media`` and return the result."""
        resolved = url_for_media(href)
        return resolved

# Module-level side effect: registers an EmailReaderService instance under PROVIDER,
# passing the module's `errors` value as third argument.
register_provider(PROVIDER, EmailReaderService(), errors)